Date: Thu, 12 Feb 2015 18:17:03 +0100
From: Denys Vlasenko <vda.linux@...glemail.com>
To: musl@...ts.openwall.com, Rich Felker <dalias@...c.org>
Cc: Denys Vlasenko <vda.linux@...glemail.com>
Subject: [PATCH 2/2] x86_64/memset: align destination to an 8-byte boundary

Aligning the destination to an 8-byte boundary gives a ~25%
speedup on "rep stosq" memsets that fit in L1 cache, compared
to intentionally misaligned ones, and a smaller win of ~15% on
larger memsets that go to L2.
Measured on an Intel Sandy Bridge CPU (i7-2620M, 2.70GHz).
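
For reference, a minimal sketch of how such a measurement can be
made (not the harness actually used for the numbers above; SIZE,
ITERS and the fill byte are arbitrary choices). The volatile
function pointer forces a real call into the libc memset under
test instead of gcc's inlined builtin:

    /* cc -O2 align-bench.c */
    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    #define SIZE 4096      /* fits in L1; raise past 32 KiB for L2 */
    #define ITERS 1000000

    static void *(*volatile do_memset)(void *, int, size_t) = memset;

    static double bench(char *dst)
    {
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (long i = 0; i < ITERS; i++)
            do_memset(dst, 0xaa, SIZE);   /* nonzero byte: widen path */
        clock_gettime(CLOCK_MONOTONIC, &t1);
        return t1.tv_sec - t0.tv_sec
            + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    }

    int main(void)
    {
        static _Alignas(64) char buf[SIZE + 8];
        printf("aligned dst:    %f s\n", bench(buf));
        printf("misaligned dst: %f s\n", bench(buf + 1));
        return 0;
    }

With the figures quoted above, the aligned run should come out
roughly 25% faster for L1-sized buffers on Sandy Bridge.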

Signed-off-by: Denys Vlasenko <vda.linux@...glemail.com>
---
 src/string/x86_64/memset.s | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
index 523caa0..5c9e333 100644
--- a/src/string/x86_64/memset.s
+++ b/src/string/x86_64/memset.s
@@ -4,16 +4,23 @@ memset:
 	movzbq %sil,%rax
 	cmp $16,%rdx
 	jb .Less_than_16
+
 	test %esi,%esi
 	jnz .L_widen_rax  # unlikely
 .L_widened:
 
-	lea -1(%rdx),%rcx
 	mov %rdi,%r8
+
+	test $7,%dil
+	jnz .L_align  # unlikely
+.L_aligned:
+
+	lea -1(%rdx),%rcx
 	shr $3,%rcx
 	mov %rax,-8(%rdi,%rdx)
 	rep
 	stosq
+
 	mov %r8,%rax
 	ret
 
@@ -23,6 +30,19 @@ memset:
 	imul %rsi,%rax
 	jmp .L_widened
 
+# Aligning the destination to an 8-byte boundary gives a ~25%
+# speedup on "rep stosq" memsets that fit in L1 cache, compared
+# to intentionally misaligned ones, and a smaller win of ~15% on
+# larger memsets that go to L2 (Sandy Bridge i7-2620M, 2.70GHz).
+.L_align:
+	mov %rax,(%rdi)
+1:	inc %rdi
+	dec %rdx
+	test $7,%dil
+	jnz 1b
+	jmp .L_aligned
+
+
 .Less_than_16:
 	test %edx,%edx
 	jz .L_ret
-- 
1.8.1.4
