Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date: Fri, 13 Feb 2015 17:39:49 +0100
From: Denys Vlasenko <vda.linux@...glemail.com>
To: musl@...ts.openwall.com, Rich Felker <dalias@...c.org>
Cc: Denys Vlasenko <vda.linux@...glemail.com>
Subject: [PATCH] x86_64/memset: use "small block" code for blocks up to 30 bytes long

Before this change, the "small block" code was used only for blocks of
15 bytes or fewer. Measurements on a Sandy Bridge CPU show that the
"rep stosq" setup time is high enough to dominate the speed of fills
well above that size:

31 byte block: 3.279282 bytes/ns
30 byte block: 3.173499 bytes/ns
..
20 byte block: 2.116552 bytes/ns
..
16 byte block: 1.799337 bytes/ns
15 byte block: 5.074332 bytes/ns
14 byte block: 4.736135 bytes/ns
13 byte block: 4.398852 bytes/ns
12 byte block: 4.060479 bytes/ns
11 byte block: 3.723065 bytes/ns
10 byte block: 3.384556 bytes/ns
 9 byte block: 2.867677 bytes/ns
 8 byte block: 2.257382 bytes/ns
 7 byte block: 1.975605 bytes/ns
 6 byte block: 1.693388 bytes/ns
 5 byte block: 1.411434 bytes/ns
 4 byte block: 1.129147 bytes/ns
 3 byte block: 0.847030 bytes/ns
 2 byte block: 0.616008 bytes/ns
 1 byte block: 0.308069 bytes/ns

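The patch does not increase the number of branches, yet it handles
blocks of up to 30 bytes. Blocks of 16 to 30 bytes become roughly three
times faster; blocks of 9 to 14 bytes become somewhat slower, and all
other sizes are unchanged. After the patch, timings are: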

32 byte block: 3.384681 bytes/ns
31 byte block: 3.279118 bytes/ns
30 byte block: 10.128968 bytes/ns
29 byte block: 9.793798 bytes/ns
28 byte block: 9.456081 bytes/ns
27 byte block: 9.120555 bytes/ns
26 byte block: 8.782757 bytes/ns
25 byte block: 8.446654 bytes/ns
24 byte block: 8.109310 bytes/ns
23 byte block: 7.773063 bytes/ns
22 byte block: 7.434663 bytes/ns
21 byte block: 7.098760 bytes/ns
20 byte block: 6.760724 bytes/ns
19 byte block: 6.424286 bytes/ns
18 byte block: 6.086166 bytes/ns
17 byte block: 5.749441 bytes/ns
16 byte block: 5.411120 bytes/ns
15 byte block: 5.074234 bytes/ns
14 byte block: 3.947913 bytes/ns
13 byte block: 3.666643 bytes/ns
12 byte block: 3.384641 bytes/ns
11 byte block: 3.103178 bytes/ns
10 byte block: 2.821105 bytes/ns
 9 byte block: 2.539481 bytes/ns
 8 byte block: 2.257338 bytes/ns
 7 byte block: 1.975530 bytes/ns
 6 byte block: 1.693337 bytes/ns
 5 byte block: 1.411388 bytes/ns
 4 byte block: 1.129111 bytes/ns
 3 byte block: 0.846994 bytes/ns
 2 byte block: 0.615982 bytes/ns
 1 byte block: 0.308056 bytes/ns

Signed-off-by: Denys Vlasenko <vda.linux@...glemail.com>
---
 src/string/x86_64/memset.s | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
index ea61687..81adbb2 100644
--- a/src/string/x86_64/memset.s
+++ b/src/string/x86_64/memset.s
@@ -2,13 +2,13 @@
 .type memset,@function
 memset:
 	movzbq %sil,%rax
-	cmp $16,%rdx
-	jb .Less_than_16
-
 	test %esi,%esi
 	jnz .L_widen_rax  # unlikely
 .L_widened:
 
+	cmp $31,%rdx
+	jb .Less_than_31
+
 	mov %rdi,%r8
 
 	test $7,%dil
@@ -43,7 +43,7 @@ memset:
 	jmp .L_aligned
 
 
-.Less_than_16:
+.Less_than_31:
 	test %edx,%edx
 	jz .L_ret
 
@@ -52,20 +52,18 @@ memset:
 	cmp $2,%edx
 	jbe .L_ret
 
-	mov %al,1(%rdi)
-	mov %al,-2(%rdi,%rdx)
-	# 32-bit imul has 3-4 cycles latency
-	imul $0x1010101,%eax
-	cmp $4,%edx
+	mov %ax,1(%rdi)
+	mov %ax,(-1-2)(%rdi,%rdx)
+	cmp $6,%edx
 	jbe .L_ret
 
-	mov %eax,(%rdi)
-	mov %eax,-4(%rdi,%rdx)
-	cmp $8,%edx
+	mov %eax,(1+2)(%rdi)
+	mov %eax,(-1-2-4)(%rdi,%rdx)
+	cmp $14,%edx
 	jbe .L_ret
 
-	mov %eax,4(%rdi)
-	mov %eax,-8(%rdi,%rdx)
+	mov %rax,(1+2+4)(%rdi)
+	mov %rax,(-1-2-4-8)(%rdi,%rdx)
 .L_ret:
 	mov %rdi,%rax
 	ret
-- 
1.8.1.4

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.