Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Mon, 12 Oct 2015 20:30:34 +0200
From: Denys Vlasenko <vda.linux@...glemail.com>
To: Rich Felker <dalias@...c.org>
Cc: Denys Vlasenko <vda.linux@...glemail.com>, musl@...ts.openwall.com
Subject: [PATCH 3/3] i386/memset: move byte-extending IMUL up, drop one insn

"mov %dl,%dh" may have a register stall (even though rumor has it
newest intel chips do double accounting for low byte regs,
on such CPUs subsequent "mov %dl,(%eax)" won't wait for nex value
of %edx).
Anyway, if we move IMUL up, we can drop it altogether.

   text	   data	    bss	    dec	    hex	filename
    177	      0	      0	    177	     b1	memset2.o
    175	      0	      0	    175	     af	memset3.o

Add the bench program. No need for everybody to reinvent it every time.

Comparison of memset before/after last three patches on Sandy Bridge CPU:
big blocks unaffected, small blocks:

               Before           After
50 byte block: 6.31 bytes/ns    6.56
49 byte block: 6.19 bytes/ns    6.43
48 byte block: 6.06 bytes/ns    6.30
47 byte block: 5.94 bytes/ns    6.17
46 byte block: 5.81 bytes/ns    6.04
45 byte block: 5.69 bytes/ns    5.91
44 byte block: 5.56 bytes/ns    5.78
43 byte block: 5.44 bytes/ns    5.65
42 byte block: 5.31 bytes/ns    5.52
41 byte block: 5.19 bytes/ns    5.39
40 byte block: 5.06 bytes/ns    5.26
39 byte block: 4.94 bytes/ns    5.13
38 byte block: 4.81 bytes/ns    5.00
37 byte block: 4.69 bytes/ns    4.87
36 byte block: 4.56 bytes/ns    4.74
35 byte block: 4.44 bytes/ns    4.61
34 byte block: 4.31 bytes/ns    4.48
33 byte block: 4.19 bytes/ns    4.35
32 byte block: 4.06 bytes/ns    4.22
31 byte block: 3.93 bytes/ns    4.09
30 byte block: 5.71 bytes/ns    6.23
29 byte block: 5.52 bytes/ns    6.03
28 byte block: 5.33 bytes/ns    5.82
27 byte block: 5.15 bytes/ns    5.62
26 byte block: 4.95 bytes/ns    5.41
25 byte block: 4.77 bytes/ns    5.21
24 byte block: 4.58 bytes/ns    5.00
23 byte block: 4.39 bytes/ns    4.79
22 byte block: 4.20 bytes/ns    4.59
21 byte block: 4.01 bytes/ns    4.38
20 byte block: 3.82 bytes/ns    4.17
19 byte block: 3.63 bytes/ns    3.97
18 byte block: 3.44 bytes/ns    3.76
17 byte block: 3.25 bytes/ns    3.55
16 byte block: 3.06 bytes/ns    3.34
15 byte block: 2.87 bytes/ns    3.14
14 byte block: 3.61 bytes/ns    4.05
13 byte block: 3.35 bytes/ns    3.76
12 byte block: 3.09 bytes/ns    3.47
11 byte block: 2.84 bytes/ns    3.19
10 byte block: 2.58 bytes/ns    2.90
9 byte block:  2.33 bytes/ns    2.61
8 byte block:  2.07 bytes/ns    2.32
7 byte block:  1.81 bytes/ns    2.03
6 byte block:  1.84 bytes/ns    1.84
5 byte block:  1.54 bytes/ns    1.54
4 byte block:  1.23 bytes/ns    1.23
3 byte block:  0.92 bytes/ns    0.92
2 byte block:  0.68 bytes/ns    0.62
1 byte block:  0.34 bytes/ns    0.31

Signed-off-by: Denys Vlasenko <vda.linux@...glemail.com>
CC: Rich Felker <dalias@...c.org>
CC: musl@...ts.openwall.com
---
 src/string/i386/memset.s       |  4 +-
 src/string/i386/memset_bench.c | 92 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+), 3 deletions(-)
 create mode 100644 src/string/i386/memset_bench.c

diff --git a/src/string/i386/memset.s b/src/string/i386/memset.s
index cd13f41..622a1f0 100644
--- a/src/string/i386/memset.s
+++ b/src/string/i386/memset.s
@@ -10,16 +10,14 @@ memset:
 	test %ecx,%ecx
 	jz 1f
 
-	mov %dl,%dh
-
 	mov %dl,(%eax)
 	mov %dl,-1(%eax,%ecx)
+	imul $0x1010101,%edx
 	cmp $2,%ecx
 	jbe 1f
 
 	mov %dx,1(%eax)
 	mov %dx,(-1-2)(%eax,%ecx)
-	imul $0x10001,%edx
 	cmp $6,%ecx
 	jbe 1f
 
diff --git a/src/string/i386/memset_bench.c b/src/string/i386/memset_bench.c
new file mode 100644
index 0000000..2072bbe
--- /dev/null
+++ b/src/string/i386/memset_bench.c
@@ -0,0 +1,92 @@
+// Build a-la
+// gcc -m32 -static -O2 -Wall memset_bench.c memset.s
+
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#define FILL 0
+
+/* libc has incredibly messy way of doing this,
+ * typically requiring -lrt. We just skip all this mess */
+#ifndef CLOCK_MONOTONIC
+#define CLOCK_MONOTONIC 1
+#endif
+static void get_mono(struct timespec *ts)
+{
+        syscall(__NR_clock_gettime, CLOCK_MONOTONIC, ts);
+}
+
+unsigned gett()
+{
+#if 0
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+	return tv.tv_usec;
+#else
+	struct timespec ts;
+	get_mono(&ts);
+	return ts.tv_nsec;
+#endif
+}
+
+unsigned difft(unsigned t2, unsigned t1)
+{
+	t2 -= t1;
+	if ((int)t2 < 0)
+		t2 += 1000000000;
+	return t2;
+}
+
+void measure(unsigned sz, void *buf, void* (*m)(void *ptr, int c, size_t cnt))
+{
+	unsigned t1, t2, cnt;
+	unsigned repeat = 1;
+
+	/* For small sizes, call m() many times before measuring time diff */
+	repeat = ((256*1024) / (sz|1)) ? : 1;
+
+	m(buf, FILL, sz); /* warm up caches */
+	m(buf, FILL, sz); /* warm up caches */
+
+	t2 = -1U;
+	cnt = 1000;
+	while (--cnt) {
+		unsigned rep = repeat;
+
+		t1 = gett();
+		do {
+			m(buf, FILL, sz);
+		} while (--rep);
+		t1 = difft(gett(), t1);
+		if (t2 > t1)
+			t2 = t1;
+	}
+	printf("%u byte block: %.2f bytes/ns\n", sz, (double)(sz) * repeat / t2);
+}
+
+int main(int argc, char **argv)
+{
+	int sz;
+	char *buf;
+
+	sz = argv[1] ? atoi(argv[1]) : 1024;
+	buf = malloc(sz + 4096);
+	buf += 0x100;
+	buf = (char*)((long)buf & ~0xffL);
+
+	setlinebuf(stdout);
+	printf("size:%u (%uk) buf:%p\n", sz, sz/1024, buf);
+
+	do {
+		measure(sz, buf, memset);
+	} while (--sz > 0);
+
+	return 0;
+}
-- 
1.8.1.4

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.