Date: Wed, 21 Mar 2012 09:47:39 +0400
From: Solar Designer <solar@...nwall.com>
To: john-dev@...ts.openwall.com
Subject: left shift by 1 via ADD

magnum -

There are some rotates by 1 bit in SHA-1.  When we don't have XOP, those
involve left shifts by 1 bit (as well as right shifts and ORs).  The
left shifts may be replaced by ADDs, which are 1 byte shorter.  (I am
already using this trick in the DES key setup.)  The attached patch
reduces the sse-intrinsics.o .text size by 128 bytes here (relative to
the version with MD5 optimizations).  The reduced code size might result
in a speedup on CPUs with caching of x86 instructions (as opposed to
micro-ops) - on Atom maybe?  On E5420, there's no difference.
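
For illustration, here is a standalone sketch (not part of the patch)
verifying the identity the macro relies on: shifting each 32-bit lane
left by 1 gives the same result as adding the vector to itself, which
is why PADDD (no immediate byte) can stand in for PSLLD by 1.

#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
	/* A few lanes, including one with the top bit set */
	__m128i x = _mm_set_epi32((int)0x80000001, 0x7fffffff, 0x12345678, 1);
	__m128i via_shift = _mm_slli_epi32(x, 1);	/* PSLLD xmm, 1 */
	__m128i via_add = _mm_add_epi32(x, x);		/* PADDD xmm, xmm */
	__m128i eq = _mm_cmpeq_epi32(via_shift, via_add);

	/* All 16 byte-compare bits set means every lane matched */
	printf("%s\n", _mm_movemask_epi8(eq) == 0xffff ? "identical" : "differ");
	return 0;
}

(Build with something like gcc -O2 -msse2; it should print "identical".)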

Alexander

--- sse-intrinsics.c-md5opt	2012-03-21 04:51:56 +0000
+++ sse-intrinsics.c	2012-03-21 05:33:39 +0000
@@ -17,17 +17,19 @@
 #include "MD5_std.h"
 
 #ifndef __XOP__
+#define _mm_slli_epi32a(a, s) \
+	((s) == 1 ? _mm_add_epi32((a), (a)) : _mm_slli_epi32((a), (s)))
 #ifdef __SSSE3__
 #include <tmmintrin.h>
 #define rot16_mask _mm_set_epi64x(0x0d0c0f0e09080b0aL, 0x0504070601000302UL)
 #define _mm_roti_epi32(a, s) \
 	((s) == 16 ? _mm_shuffle_epi8((a), rot16_mask) : \
-	_mm_or_si128(_mm_slli_epi32((a), (s)), _mm_srli_epi32((a), 32-(s))))
+	_mm_or_si128(_mm_slli_epi32a((a), (s)), _mm_srli_epi32((a), 32-(s))))
 #else
 #define _mm_roti_epi32(a, s) \
 	((s) == 16 ? \
 	_mm_shufflelo_epi16(_mm_shufflehi_epi16((a), 0xb1), 0xb1) : \
-	_mm_or_si128(_mm_slli_epi32((a), (s)), _mm_srli_epi32((a), 32-(s))))
+	_mm_or_si128(_mm_slli_epi32a((a), (s)), _mm_srli_epi32((a), 32-(s))))
 #endif
 #endif
 

diff --git a/src/sse-intrinsics.c b/src/sse-intrinsics.c
index 9cb301a..190af91 100644
--- a/src/sse-intrinsics.c
+++ b/src/sse-intrinsics.c
@@ -17,8 +17,20 @@
 #include "MD5_std.h"
 
 #ifndef __XOP__
+#define _mm_slli_epi32a(a, s) \
+	((s) == 1 ? _mm_add_epi32((a), (a)) : _mm_slli_epi32((a), (s)))
+#ifdef __SSSE3__
+#include <tmmintrin.h>
+#define rot16_mask _mm_set_epi64x(0x0d0c0f0e09080b0aL, 0x0504070601000302UL)
 #define _mm_roti_epi32(a, s) \
-	_mm_or_si128(_mm_slli_epi32((a), (s)), _mm_srli_epi32((a), 32-(s)))
+	((s) == 16 ? _mm_shuffle_epi8((a), rot16_mask) : \
+	_mm_or_si128(_mm_slli_epi32a((a), (s)), _mm_srli_epi32((a), 32-(s))))
+#else
+#define _mm_roti_epi32(a, s) \
+	((s) == 16 ? \
+	_mm_shufflelo_epi16(_mm_shufflehi_epi16((a), 0xb1), 0xb1) : \
+	_mm_or_si128(_mm_slli_epi32a((a), (s)), _mm_srli_epi32((a), 32-(s))))
+#endif
 #endif
 
 #ifndef MMX_COEF
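
As a usage illustration (a sketch only, not the actual SHA-1 code in
John the Ripper), the rotates by 1 mentioned above come from the SHA-1
message schedule; with the macros from the patch, _mm_roti_epi32(x, 1)
ends up as PADDD + PSRLD + POR on non-XOP builds:

#include <emmintrin.h>

/* Same definitions as in the patch (non-XOP, non-SSSE3 path) */
#define _mm_slli_epi32a(a, s) \
	((s) == 1 ? _mm_add_epi32((a), (a)) : _mm_slli_epi32((a), (s)))
#define _mm_roti_epi32(a, s) \
	((s) == 16 ? \
	_mm_shufflelo_epi16(_mm_shufflehi_epi16((a), 0xb1), 0xb1) : \
	_mm_or_si128(_mm_slli_epi32a((a), (s)), _mm_srli_epi32((a), 32-(s))))

/*
 * W[t] = rotl32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1), computed for
 * four candidates at once, one per 32-bit lane.  Function and variable
 * names here are illustrative, not taken from sse-intrinsics.c.
 */
static inline __m128i sha1_expand(__m128i w3, __m128i w8,
                                  __m128i w14, __m128i w16)
{
	__m128i x = _mm_xor_si128(_mm_xor_si128(w3, w8),
	                          _mm_xor_si128(w14, w16));
	return _mm_roti_epi32(x, 1);	/* becomes PADDD + PSRLD + POR */
}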
