Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Sat, 5 Sep 2015 08:16:39 +0300
From: Solar Designer <solar@...nwall.com>
To: john-dev@...ts.openwall.com
Subject: Re: MD5 on XOP, NEON, AltiVec

On Sat, Sep 05, 2015 at 07:17:49AM +0300, Solar Designer wrote:
> On Sat, Sep 05, 2015 at 05:25:16AM +0300, Solar Designer wrote:
> > Here's what we had last year:
> > 
> > Benchmarking: md5crypt, crypt(3) $1$ [MD5 128/128 XOP 8x]... (8xOMP) DONE
> > Raw:    201472 c/s real, 25152 c/s virtual
> > 
> > Here's what we have now:
> > 
> > Benchmarking: md5crypt, crypt(3) $1$ [MD5 128/128 XOP 4x2]... (8xOMP) DONE
> > Raw:    150272 c/s real, 18784 c/s virtual
> 
> I sort of found it: somehow the code handling SSEi_FLAT_OUT, when
> compiled in, changes the stack frame layout in such a way that
> performance drops.  I wasn't yet able to tell why it drops.  The
> offsets look properly aligned to me either way.
> 
> With SSEi_FLAT_OUT support #if 0'ed out, I get:
> 
> Benchmarking: md5crypt, crypt(3) $1$ [MD5 128/128 XOP 4x2]... (8xOMP) DONE
> Raw:    223232 c/s real, 27869 c/s virtual
> 
> And that's not even with the MD5_I change yet (haven't tried it yet).

With the optimized I() (the MD5_I change) on top of that:

Benchmarking: md5crypt, crypt(3) $1$ [MD5 128/128 XOP 4x2]... (8xOMP) DONE
Raw:    228352 c/s real, 28579 c/s virtual

Patch attached.  It also includes MD4 optimizations, which I'll describe
separately.

Alexander

diff --git a/src/simd-intrinsics.c b/src/simd-intrinsics.c
index feca6a7..97e67ec 100644
--- a/src/simd-intrinsics.c
+++ b/src/simd-intrinsics.c
@@ -90,10 +90,16 @@ _inline __m128i _mm_set1_epi64(long long a)
     tmp[i] = vxor((tmp[i]),(x[i]));
 #endif
 
+#if !VCMOV_EMULATED
+#define MD5_I(x,y,z)                            \
+    tmp[i] = vcmov((x[i]), mask, (z[i])); \
+    tmp[i] = vxor((tmp[i]), (y[i]));
+#else
 #define MD5_I(x,y,z)                            \
     tmp[i] = vandnot((z[i]), mask);             \
     tmp[i] = vor((tmp[i]),(x[i]));              \
     tmp[i] = vxor((tmp[i]),(y[i]));
+#endif
 
 #define MD5_STEP(f, a, b, c, d, x, t, s)            \
     MD5_PARA_DO(i) {                                \
@@ -332,6 +338,7 @@ void SIMDmd5body(vtype* _data, unsigned int *out,
 		}
 	}
 
+#if 0
 	if (SSEi_flags & SSEi_FLAT_OUT) {
 		MD5_PARA_DO(i)
 		{
@@ -364,7 +371,9 @@ void SIMDmd5body(vtype* _data, unsigned int *out,
 #endif
 		}
 	}
-	else if (SSEi_flags & SSEi_OUTPUT_AS_INP_FMT)
+	else
+#endif
+	if (SSEi_flags & SSEi_OUTPUT_AS_INP_FMT)
 	{
 		if ((SSEi_flags & SSEi_OUTPUT_AS_2BUF_INP_FMT) == SSEi_OUTPUT_AS_2BUF_INP_FMT) {
 			MD5_PARA_DO(i)
@@ -667,13 +676,19 @@ void md5cryptsse(unsigned char pwd[MD5_SSE_NUM_KEYS][16], unsigned char *salt,
 #define MD4_F(x,y,z)                            \
     tmp[i] = vcmov((y[i]),(z[i]),(x[i]));
 
+#if !VCMOV_EMULATED
+#define MD4_G(x,y,z)                            \
+    tmp[i] = vxor((y[i]), (z[i]));              \
+    tmp[i] = vcmov((x[i]), (z[i]), (tmp[i]));
+#else
 #define MD4_G(x,y,z)                            \
     tmp[i] = vor((y[i]),(z[i]));                \
     tmp2[i] = vand((y[i]),(z[i]));              \
     tmp[i] = vand((tmp[i]),(x[i]));             \
     tmp[i] = vor((tmp[i]), (tmp2[i]) );
+#endif
 
-#if SIMD_PARA_MD4 == 1
+#if SIMD_PARA_MD4 < 3
 #define MD4_H(x,y,z)                            \
     tmp2[i] = vxor((x[i]),(y[i]));              \
     tmp[i] = vxor(tmp2[i], (z[i]));
@@ -907,6 +922,7 @@ void SIMDmd4body(vtype* _data, unsigned int *out, ARCH_WORD_32 *reload_state,
 		}
 	}
 
+#if 0
 	if (SSEi_flags & SSEi_FLAT_OUT) {
 		MD4_PARA_DO(i)
 		{
@@ -939,7 +955,9 @@ void SIMDmd4body(vtype* _data, unsigned int *out, ARCH_WORD_32 *reload_state,
 #endif
 		}
 	}
-	else if (SSEi_flags & SSEi_OUTPUT_AS_INP_FMT)
+	else
+#endif
+	if (SSEi_flags & SSEi_OUTPUT_AS_INP_FMT)
 	{
 		if ((SSEi_flags & SSEi_OUTPUT_AS_2BUF_INP_FMT) == SSEi_OUTPUT_AS_2BUF_INP_FMT) {
 			MD4_PARA_DO(i)

Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ