/*
 * Micro-benchmark for memset-style fill routines on x86-64 Linux.
 *
 * For every block size from BUF down to 0 the chosen routine is run many
 * times and the best observed throughput (bytes per nanosecond) is printed.
 * Three hand-written fill routines are provided as alternatives to the
 * C library's memset().
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>

/* Old glibc (< 2.3.4) does not provide this constant. We use syscall
 * directly so this definition is safe.
 */
#ifndef CLOCK_MONOTONIC
#define CLOCK_MONOTONIC 1
#endif

#define BUF (2*1024)	/* largest block size measured, in bytes */
#define FILL 0		/* byte value written by the benchmarked routine */

/* libc has incredibly messy way of doing this,
 * typically requiring -lrt. We just skip all this mess.
 *
 * Reads the monotonic clock into *ts. If the syscall fails, *ts is zeroed
 * so callers never read an uninitialised timespec.
 */
static void get_mono(struct timespec *ts)
{
	if (syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts) != 0) {
		ts->tv_sec = 0;
		ts->tv_nsec = 0;
	}
}

/* Replicate the low byte of c into all eight bytes of a 64-bit word. */
static unsigned long fill_pattern(int c)
{
	return (unsigned long)(unsigned char)c * 0x0101010101010101UL;
}

/* Store n copies of the low byte of c at p (used for sub-word remainders). */
static void fill_tail(unsigned char *p, int c, size_t n)
{
	while (n--)
		*p++ = (unsigned char)c;
}

/*
 * memset() replacement built on "rep stosq".
 *
 * Whole 8-byte words are stored by the string instruction; the remaining
 * cnt % 8 bytes are written one at a time.
 * Returns ptr, like memset().
 */
void *memset_rep_stosq(void *ptr, int c, size_t cnt)
{
	unsigned char *dst = ptr;
	size_t words = cnt / 8;

	/* rep stosq advances rdi and counts rcx down to 0; rcx == 0 is a no-op */
	__asm__ __volatile__(
		"rep stosq"
		: "+D" (dst), "+c" (words)
		: "a" (fill_pattern(c))
		: "memory"
	);
	fill_tail(dst, c, cnt % 8);
	return ptr;
}

/*
 * memset() replacement built on the non-temporal store "movnti",
 * one 8-byte store per loop iteration, followed by "sfence".
 *
 * The remaining cnt % 8 bytes are written one at a time.
 * Returns ptr, like memset().
 */
void *memset_movnti(void *ptr, int c, size_t cnt)
{
	unsigned char *dst = ptr;
	size_t words = cnt / 8;

	/* The loop decrements before testing, so it must not be entered
	 * with a zero count: rcx would wrap and the loop would run away.
	 */
	if (words != 0) {
		__asm__ __volatile__(
			"1:	movnti	%%rax,(%%rdi)\n"
			"	add	$8,%%rdi\n"
			"	dec	%%rcx\n"
			"	jnz	1b\n"
			"	sfence\n"
			: "+D" (dst), "+c" (words)
			: "a" (fill_pattern(c))
			: "memory", "cc"
		);
	}
	fill_tail(dst, c, cnt % 8);
	return ptr;
}

/*
 * Same as memset_movnti(), but unrolled to four 8-byte "movnti" stores
 * (32 bytes) per loop iteration.
 *
 * The remaining cnt % 32 bytes are written one at a time.
 * Returns ptr, like memset().
 */
void *memset_movnti_unroll(void *ptr, int c, size_t cnt)
{
	unsigned char *dst = ptr;
	size_t chunks = cnt / (8*4);

	/* As above: never enter the decrement-then-test loop with rcx == 0 */
	if (chunks != 0) {
		__asm__ __volatile__(
			"1:\n"
			"	movnti	%%rax,(%%rdi)\n"
			"	movnti	%%rax,8(%%rdi)\n"
			"	movnti	%%rax,16(%%rdi)\n"
			"	movnti	%%rax,24(%%rdi)\n"
			"	add	$32,%%rdi\n"
			"	dec	%%rcx\n"
			"	jnz	1b\n"
			"	sfence\n"
			: "+D" (dst), "+c" (chunks)
			: "a" (fill_pattern(c))
			: "memory", "cc"
		);
	}
	fill_tail(dst, c, cnt % (8*4));
	return ptr;
}

/*
 * Return the sub-second part of the current time: nanoseconds within the
 * current second of the monotonic clock. Only differences between two
 * readings are meaningful; see difft().
 */
unsigned gett(void)
{
#if 0
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return tv.tv_usec;
#else
	struct timespec ts;
	get_mono(&ts);
	return ts.tv_nsec;
#endif
}

/*
 * Nanoseconds elapsed from reading t1 to reading t2, both taken with gett().
 * A negative difference means the seconds counter ticked over in between,
 * so one second is added back. Intervals of one second or more cannot be
 * represented.
 */
unsigned difft(unsigned t2, unsigned t1)
{
	t2 -= t1;
	if ((int)t2 < 0)
		t2 += 1000000000;
	return t2;
}

/*
 * Benchmark fill routine m on a block of sz bytes at buf and print the
 * best throughput seen.
 *
 * sz:   block size in bytes
 * buf:  destination buffer, at least sz bytes long
 * m:    routine under test, with the memset() calling convention
 * name: label for the routine; currently not part of the output
 */
void measure(unsigned sz, void *buf, void* (*m)(void *ptr, int c, size_t cnt), const char *name)
{
	unsigned best = -1U;
	unsigned repeat;
	unsigned round;

	(void)name;

	/* For small sizes, call m() repeatedly before measuring time diff */
	repeat = (256*1024) / (sz|1);
	if (repeat == 0)
		repeat = 1;

	m(buf, FILL, sz); /* warm up caches */
	m(buf, FILL, sz); /* warm up caches */

	/* 999 timed rounds; keep the fastest one */
	for (round = 999; round != 0; round--) {
		unsigned rep = repeat;
		unsigned elapsed;
		unsigned start = gett();

		do {
			m(buf, FILL, sz);
		} while (--rep);
		elapsed = difft(gett(), start);
		if (best > elapsed)
			best = elapsed;
	}

	/* A zero reading (clock too coarse or unavailable) would make the
	 * division below meaningless; treat it as the 1 ns minimum.
	 */
	if (best == 0)
		best = 1;

	printf("%u byte block: %.2f bytes/ns\n", sz, (double)(sz) * repeat / best);
}

/*
 * Allocate a 256-byte-aligned buffer and benchmark memset() for every
 * block size from BUF down to 0.
 *
 * To benchmark one of the alternative routines, or a misaligned
 * destination, change the measure() call, for example:
 *	measure(sz, buf, memset_movnti, "movnti");
 *	measure(sz, buf + 1, memset_rep_stosq, "stos+1");
 * The allocation leaves room for small offsets such as buf+1 .. buf+8.
 */
int main(void)
{
	int sz;
	char *buf;
	char *raw = malloc(BUF + 4096);

	if (raw == NULL) {
		perror("malloc");
		return 1;
	}
	/* Advance to the next 256-byte boundary strictly above raw */
	buf = raw + (0x100 - ((uintptr_t)raw & 0xff));

	/* Line-buffer stdout so each result appears as soon as it is printed */
	setvbuf(stdout, NULL, _IOLBF, 0);

	printf("size:%u (%uk) buf:%p\n", (unsigned)BUF, (unsigned)(BUF/1024), (void *)buf);

	for (sz = BUF; sz >= 0; sz--)
		measure(sz, buf, memset, "musl");

	free(raw);
	return 0;
}