/*
 * Micro-benchmark comparing ways of zero-filling a buffer on x86-64:
 * "rep stosq" versus non-temporal "movnti" stores (plain and 4x unrolled).
 *
 * Each variant is timed over REPEAT back-to-back fills of a BUF-byte
 * buffer; the best (smallest) time out of many rounds is reported.
 *
 * The inline assembly is x86-64 specific and uses GCC extended asm.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sys/types.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

/* Old glibc (< 2.3.4) does not provide this constant. We use syscall
 * directly so this definition is safe.
 */
#ifndef CLOCK_MONOTONIC
#define CLOCK_MONOTONIC 1
#endif

/* Size of the buffer filled by one call, in bytes and in 8-byte words. */
#define BUF  (50*1024)
#define BUF8 (BUF/8)

/* Number of back-to-back fills inside one timed interval. */
#define REPEAT 32

/* libc has an incredibly messy way of doing this,
 * typically requiring -lrt. We just skip all this mess
 * and issue the clock_gettime syscall ourselves.
 *
 * On failure *ts is zeroed so callers never read an indeterminate value.
 */
static void get_mono(struct timespec *ts)
{
	if (syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts) != 0) {
		ts->tv_sec = 0;
		ts->tv_nsec = 0;
	}
}

/*
 * Zero cnt 8-byte words starting at ptr using "rep stosq".
 *
 * RDI, RCX and RAX are tied as both inputs and (discarded) outputs
 * because the instruction modifies RDI and RCX.
 * cnt == 0 is harmless: "rep" with RCX == 0 stores nothing.
 */
void memset_rep_stosq(void *ptr, unsigned long cnt)
{
	unsigned long ax, cx, di;

	__asm__ __volatile__(
		"rep stosq"
		: "=D" (di), "=c" (cx), "=a" (ax)
		: "0" (ptr), "1" (cnt), "2" (0UL)
		: "memory"
	);
}

/*
 * Zero cnt 8-byte words starting at ptr using one "movnti" store per
 * word, followed by "sfence".
 */
void memset_movnti(void *ptr, unsigned long cnt)
{
	unsigned long ax, cx, di;

	/* The loop is do-while shaped: entering it with RCX == 0 would make
	 * "dec" wrap around and run for 2^64 iterations.
	 */
	if (cnt == 0) {
		return;
	}

	__asm__ __volatile__(
		"1:	movnti	%%rax,(%%rdi)\n"
		"	add	$8,%%rdi\n"
		"	dec	%%rcx\n"
		"	jnz	1b\n"
		"	sfence\n"
		: "=D" (di), "=c" (cx), "=a" (ax)
		: "0" (ptr), "1" (cnt), "2" (0UL)
		: "memory", "cc"
	);
}

/*
 * Zero cnt 8-byte words starting at ptr using "movnti", four stores
 * (32 bytes) per loop iteration, followed by "sfence".
 *
 * Words left over when cnt is not a multiple of 4 are written by
 * memset_movnti(), so exactly cnt words are always stored.
 */
void memset_movnti_unroll(void *ptr, unsigned long cnt)
{
	unsigned long groups = cnt / 4;
	unsigned long tail = cnt % 4;
	unsigned long ax, cx;
	unsigned long di = (unsigned long)ptr;

	/* Same wrap-around hazard as in memset_movnti(): never enter the
	 * loop with a zero iteration count.
	 */
	if (groups != 0) {
		__asm__ __volatile__(
			"1:\n"
			"	movnti	%%rax,(%%rdi)\n"
			"	movnti	%%rax,8(%%rdi)\n"
			"	movnti	%%rax,16(%%rdi)\n"
			"	movnti	%%rax,24(%%rdi)\n"
			"	add	$32,%%rdi\n"
			"	dec	%%rcx\n"
			"	jnz	1b\n"
			"	sfence\n"
			: "=D" (di), "=c" (cx), "=a" (ax)
			: "0" (ptr), "1" (groups), "2" (0UL)
			: "memory", "cc"
		);
	}

	/* di now points just past the last unrolled group. */
	if (tail != 0) {
		memset_movnti((void *)di, tail);
	}
}

/*
 * Return a timestamp: the nanosecond field of the monotonic clock.
 * Only the sub-second part is returned, so values wrap every second;
 * use difft() to subtract two of them.
 */
unsigned gett()
{
#if 0
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return tv.tv_usec;
#else
	struct timespec ts;
	get_mono(&ts);
	return ts.tv_nsec;
#endif
}

/*
 * Return t2 - t1 for two gett() timestamps, in nanoseconds.
 * A negative difference means the seconds counter ticked over between
 * the two samples, so one second is added back. The result is only
 * meaningful when less than one second actually elapsed.
 */
unsigned difft(unsigned t2, unsigned t1)
{
	t2 -= t1;
	if ((int)t2 < 0) {
		t2 += 1000000000;
	}
	return t2;
}

/*
 * Benchmark one fill routine and print the result.
 *
 * buf  - destination buffer; m is always asked to write BUF8 words
 *        (BUF bytes) starting here
 * m    - fill routine under test
 * name - label printed in front of the result
 *
 * Runs 999 timed rounds of REPEAT calls each and reports the fastest
 * round together with the resulting throughput.
 */
void measure(void *buf, void (*m)(void *ptr, unsigned long cnt), const char *name)
{
	unsigned t1, t2, cnt;
	int i;

	/* NOTE(review): presumably lets the system settle before timing. */
	sleep(1);
	/* One untimed call before the measurements start. */
	m(buf, BUF8);

	t2 = -1U; /* best time so far; starts at UINT_MAX */
	cnt = 1000;
	while (--cnt) {
		t1 = gett();
		/* Driven by REPEAT so the reported count always matches. */
		for (i = 0; i < REPEAT; i++) {
			m(buf, BUF8);
		}
		t1 = difft(gett(), t1);
		if (t2 > t1) {
			t2 = t1;
		}
//		printf("%s:%u ns %u\n", name, t1, t2);
	}
	printf("%s:%u ns (times %d), %.6f bytes/ns\n", name, t2, REPEAT, (double)(BUF) * REPEAT / t2);
}

/*
 * Allocate a buffer, round its start up to a 256-byte boundary and run
 * the selected benchmarks at several offsets from that boundary.
 */
int main(void)
{
	char *raw = malloc(8*BUF + 4096);
	char *buf;

	if (raw == NULL) {
		perror("malloc");
		return 1;
	}

	/* Advance by 256, then clear the low 8 address bits: the result is
	 * 256-byte aligned and lies inside the allocation.
	 */
	buf = raw + 0x100;
	buf = (char *)((uintptr_t)buf & ~(uintptr_t)0xff);

	printf("size:%uk buf:%p\n", (unsigned)(BUF/1024), (void *)buf);

//	measure(buf, memset_movnti, "movnti");
//	measure(buf, memset_movnti_unroll, "movnti_unroll");
	measure(buf, memset_rep_stosq, "stos");

//	measure(buf+1, memset_movnti, "movnti+1");
//	measure(buf+1, memset_movnti_unroll, "movnti_unroll+1");
	measure(buf+1, memset_rep_stosq, "stos+1");

//	measure(buf+3, memset_movnti, "movnti+3");
//	measure(buf+3, memset_movnti_unroll, "movnti_unroll+3");
	measure(buf+4, memset_rep_stosq, "stos+4");

	measure(buf+8, memset_rep_stosq, "stos+8");

	free(raw);
	return 0;
}