Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date: Fri, 14 Sep 2012 17:08:09 +0400
From: Aleksey Cherepanov <aleksey.4erepanov@...il.com>
To: john-dev@...ts.openwall.com
Subject: intrinsics: speed up for linux-x86-64-native

(I caught a cold, so I postponed my to-dos and am having fun learning
intrinsics at the moment.)

Looking over sse-intrinsics.c I noticed a weird thing: multiple
MD5_PARA_DO loops where it is possible to write a single loop over
everything and avoid the use of the tmp variable. I tried to eliminate
some of the loops and got a speedup. But when I merged them into one
loop per MD5_STEP I got a significant slowdown.

My empirical observation is that passing the result of one
_mm_add_epi32 directly as an argument to another _mm_add_epi32 is a
bad idea, while mixing different instructions is OK (all the other
instructions are logical ones, which should be trivial and fast under
any circumstances (except andnot), I guess). So I guess the slowdown
is caused by instruction latencies (hint from Alexander Cherepanov).

I found https://developer.apple.com/hardwaredrivers/ve/sse.html :
the section "Pipelines, Latencies and Unrolling" explains a bit about
optimization, but I got a slowdown again when trying to merge and/or
unroll the remaining loops. How can I tell whether I have exhausted
the cache?

Also I tried to change MD5_SSE_PARA value but no luck.

I attach a patch with the best combination I found without intrusive
changes (i.e. without splitting the MD5_{F,G,H,I} functions to mix
their instructions with the additions). It gives a noticeable speedup
for md5-based formats, though the speed is still far from that of
intrinsics compiled with icc (obtained from the unpatched intrinsics).

I also quickly tried to do the same for md4 (nt format) and got speed
equal to icc's intrinsics in many combinations, including one loop
per step.

I did not try sha1 because I am afraid that I am on the wrong track:
do my changes work only on my computer/compiler setup (Core i7,
gcc 4.7.1)? What are the reasons for so many loops?

If everything is ok I could prepare patches for md4 and sha1, should
I? Should I try more intrusive changes?

Thanks!

-- 
Regards,
Aleksey Cherepanov

commit 2cc0400512c53704230c06838efd8870e91fcaf3
Author: Aleksey Cherepanov <aleksey.4erepanov@...il.com>
Date:   Fri Sep 14 17:04:40 2012 +0400

    experimental speed up of md5 intrinsics for linux-x86-64-native

diff --git a/src/sse-intrinsics.c b/src/sse-intrinsics.c
index c09d6ad..74aae16 100644
--- a/src/sse-intrinsics.c
+++ b/src/sse-intrinsics.c
@@ -43,40 +43,31 @@
 
 #ifdef __XOP__
 #define MD5_F(x,y,z) \
-	MD5_PARA_DO(i) tmp[i] = _mm_cmov_si128((y[i]),(z[i]),(x[i]));
+	_mm_cmov_si128((y[i]),(z[i]),(x[i]))
 #else
 #define MD5_F(x,y,z) \
-	MD5_PARA_DO(i) tmp[i] = _mm_xor_si128((y[i]),(z[i])); \
-	MD5_PARA_DO(i) tmp[i] = _mm_and_si128((tmp[i]),(x[i])); \
-	MD5_PARA_DO(i) tmp[i] = _mm_xor_si128((tmp[i]),(z[i]));
+	_mm_xor_si128(_mm_and_si128(_mm_xor_si128((y[i]),(z[i])),(x[i])),(z[i]))
 #endif
 
 #ifdef __XOP__
 #define MD5_G(x,y,z) \
-	MD5_PARA_DO(i) tmp[i] = _mm_cmov_si128((x[i]),(y[i]),(z[i]));
+	_mm_cmov_si128((x[i]),(y[i]),(z[i]))
 #else
 #define MD5_G(x,y,z) \
-	MD5_PARA_DO(i) tmp[i] = _mm_xor_si128((y[i]),(x[i])); \
-	MD5_PARA_DO(i) tmp[i] = _mm_and_si128((tmp[i]),(z[i])); \
-	MD5_PARA_DO(i) tmp[i] = _mm_xor_si128((tmp[i]), (y[i]) );
+	_mm_xor_si128(_mm_and_si128(_mm_xor_si128((y[i]),(x[i])),(z[i])), (y[i]))
 #endif
 
 #define MD5_H(x,y,z) \
-	MD5_PARA_DO(i) tmp[i] = _mm_xor_si128((y[i]),(z[i])); \
-	MD5_PARA_DO(i) tmp[i] = _mm_xor_si128((tmp[i]),(x[i]));
+	_mm_xor_si128(_mm_xor_si128((y[i]),(z[i])),(x[i]))
 
 #define MD5_I(x,y,z) \
-	MD5_PARA_DO(i) tmp[i] = _mm_andnot_si128((z[i]), mask); \
-	MD5_PARA_DO(i) tmp[i] = _mm_or_si128((tmp[i]),(x[i])); \
-	MD5_PARA_DO(i) tmp[i] = _mm_xor_si128((tmp[i]),(y[i]));
+	_mm_xor_si128(_mm_or_si128(_mm_andnot_si128((z[i]), mask),(x[i])),(y[i]))
 
 #define MD5_STEP(f, a, b, c, d, x, t, s) \
 	MD5_PARA_DO(i) a[i] = _mm_add_epi32( a[i], _mm_set_epi32(t,t,t,t) ); \
-	f((b),(c),(d)) \
-	MD5_PARA_DO(i) a[i] = _mm_add_epi32( a[i], tmp[i] ); \
-	MD5_PARA_DO(i) a[i] = _mm_add_epi32( a[i], data[i*16+x] ); \
-	MD5_PARA_DO(i) a[i] = _mm_roti_epi32( a[i], (s) ); \
-	MD5_PARA_DO(i) a[i] = _mm_add_epi32( a[i], b[i] );
+	MD5_PARA_DO(i) a[i] = _mm_add_epi32( a[i], f((b),(c),(d)) ); \
+	MD5_PARA_DO(i) a[i] = _mm_add_epi32( \
+		_mm_roti_epi32( _mm_add_epi32(a[i], data[i*16+x]), (s) ), b[i] );
 
 unsigned int debug = 0;
 
@@ -91,7 +82,6 @@ void SSEmd5body(__m128i* data, unsigned int * out, int init)
 	__m128i b[MD5_SSE_PARA];
 	__m128i c[MD5_SSE_PARA];
 	__m128i d[MD5_SSE_PARA];
-	__m128i tmp[MD5_SSE_PARA];
 	__m128i mask;
 	unsigned int i;
 

Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ