Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Tue, 8 Sep 2015 17:18:51 +0300
From: Solar Designer <solar@...nwall.com>
To: john-dev@...ts.openwall.com
Subject: Re: md5crypt mmxput*()

On Tue, Sep 08, 2015 at 01:17:14PM +0300, Solar Designer wrote:
> Benchmarking: md5crypt, crypt(3) $1$ [MD5 128/128 XOP 4x2]... (8xOMP) DONE
> Raw:    231424 c/s real, 28928 c/s virtual

> I think further speedup is possible by using a switch statement to make
> the shift counts into constants (we have an if anyway, we'll just
> replace it with a switch) like cryptmd5_kernel.cl has.

I cleaned up the code and implemented switch - patch attached.
It turned out to cause a minor performance regression on bull (due to
code size growth maybe?) so I am disabling it for XOP, keeping the
performance almost the same as above:

Benchmarking: md5crypt, crypt(3) $1$ [MD5 128/128 XOP 4x2]... (8xOMP) DONE
Raw:    231680 c/s real, 28923 c/s virtual

But it helps a lot on well and super.  well, with changes from earlier
today but not the switch yet:

Benchmarking: md5crypt, crypt(3) $1$ [MD5 256/256 AVX2 8x3]... (8xOMP) DONE
Raw:    397824 c/s real, 49790 c/s virtual

with switch:

Benchmarking: md5crypt, crypt(3) $1$ [MD5 256/256 AVX2 8x3]... (8xOMP) DONE
Raw:    425472 c/s real, 53184 c/s virtual

super, default gcc (old), version from a few days ago:

Benchmarking: md5crypt, crypt(3) $1$ [MD5 128/128 AVX 4x3]... (32xOMP) DONE
Raw:    605184 c/s real, 18912 c/s virtual

with my changes from earlier today:

Benchmarking: md5crypt, crypt(3) $1$ [MD5 128/128 AVX 4x3]... (32xOMP) DONE
Raw:    619008 c/s real, 19307 c/s virtual

with switch:

Benchmarking: md5crypt, crypt(3) $1$ [MD5 128/128 AVX 4x3]... (32xOMP) DONE
Raw:    638976 c/s real, 19943 c/s virtual

super's latest gcc (4.9.1 after "scl enable devtoolset-3 bash") with the
latest code (with switch):

Benchmarking: md5crypt, crypt(3) $1$ [MD5 128/128 AVX 4x3]... (32xOMP) DONE
Raw:    731136 c/s real, 22798 c/s virtual

IIRC, previously it was below 700k.

switch can probably be made beneficial for XOP as well if we reduce code
size elsewhere, but I had no luck with that so far (e.g., simply not
inlining the function causes a bigger performance regression).

Alexander

diff --git a/src/simd-intrinsics.c b/src/simd-intrinsics.c
index 7307bb8..a5251bc 100644
--- a/src/simd-intrinsics.c
+++ b/src/simd-intrinsics.c
@@ -436,63 +436,70 @@ static MAYBE_INLINE void mmxput2(void *buf, unsigned int bid, void *src)
 		memcpy( nbuf+i*64*VS32, ((unsigned char*)src)+i*16*VS32, 16*VS32);
 }
 
+#if (ARCH_SIZE >= 8) || defined(__i386__) || defined(__ARM_NEON__)
+#define BITALIGN(hi, lo, s) ((((uint64_t)(hi) << 32) | (lo)) >> (s))
+#else
+#define BITALIGN(hi, lo, s) (((hi) << (32 - (s))) | ((lo) >> (s)))
+#endif
+
 static MAYBE_INLINE void mmxput3(void *buf, unsigned int bid,
                                  unsigned int *offset, unsigned int mult,
                                  unsigned int saltlen, void *src)
 {
-	unsigned char *nbuf;
-	unsigned int noff;
-	unsigned int noffd;
-	unsigned int i,j;
-	unsigned int dec;
-
-	MD5_PARA_DO(j)
-	{
-		nbuf = ((unsigned char*)buf) + bid*64*MD5_SSE_NUM_KEYS + j*64*VS32;
-		for(i=0;i<VS32;i++)
-		{
-			noff = offset[i+j*VS32]*mult + saltlen;
-			dec = (noff&3)*8;
-			if(dec)
-			{
-				noffd = noff & (~3);
-#if (ARCH_SIZE >= 8) || defined(__i386__)
-#define BITALIGN(hi, lo, s) ((((uint64_t)(hi) << 32) | (lo)) >> (s))
+	unsigned int j;
+
+	MD5_PARA_DO(j) {
+		unsigned int i;
+		unsigned int jm = j * VS32 * 4;
+		unsigned char *nbuf = ((unsigned char *)buf) + bid * (64 * MD5_SSE_NUM_KEYS) + jm * 16;
+		unsigned int *s = (unsigned int *)src + jm;
+		for (i = 0; i < VS32; i++, s++) {
+			unsigned int n = offset[i + jm / 4] * mult + saltlen;
+			unsigned int *d = (unsigned int *)(nbuf + (n & ~3U) * VS32) + i;
+
+			switch (n &= 3) {
+			case 0:
+				d[0] = s[0];
+				d[1 * VS32] = s[1 * VS32];
+				d[2 * VS32] = s[2 * VS32];
+				d[3 * VS32] = s[3 * VS32];
+				break;
+#ifdef __XOP__
+			default:
+				n <<= 3;
+				{
+					unsigned int m = 32 - n;
+					d[0] = (d[0] & (0xffffffffU >> m)) | (s[0] << n);
+					d[1 * VS32] = BITALIGN(s[1 * VS32], s[0], m);
+					d[2 * VS32] = BITALIGN(s[2 * VS32], s[1 * VS32], m);
+					d[3 * VS32] = BITALIGN(s[3 * VS32], s[2 * VS32], m);
+					d[4 * VS32] = (d[4 * VS32] & (0xffffffffU << n)) | (s[3 * VS32] >> m);
+				}
 #else
-#define BITALIGN(hi, lo, s) (((hi) << (32 - (s))) | ((lo) >> (s)))
+			case 1:
+				d[0] = (d[0] & 0xffU) | (s[0] << 8);
+				d[1 * VS32] = BITALIGN(s[1 * VS32], s[0], 24);
+				d[2 * VS32] = BITALIGN(s[2 * VS32], s[1 * VS32], 24);
+				d[3 * VS32] = BITALIGN(s[3 * VS32], s[2 * VS32], 24);
+				d[4 * VS32] = (d[4 * VS32] & 0xffffff00U) | (s[3 * VS32] >> 24);
+				break;
+			case 2:
+				d[0] = (d[0] & 0xffffU) | (s[0] << 16);
+				d[1 * VS32] = BITALIGN(s[1 * VS32], s[0], 16);
+				d[2 * VS32] = BITALIGN(s[2 * VS32], s[1 * VS32], 16);
+				d[3 * VS32] = BITALIGN(s[3 * VS32], s[2 * VS32], 16);
+				d[4 * VS32] = (d[4 * VS32] & 0xffff0000U) | (s[3 * VS32] >> 16);
+				break;
+			case 3:
+				d[0] = (d[0] & 0xffffffU) | (s[0] << 24);
+				d[1 * VS32] = BITALIGN(s[1 * VS32], s[0], 8);
+				d[2 * VS32] = BITALIGN(s[2 * VS32], s[1 * VS32], 8);
+				d[3 * VS32] = BITALIGN(s[3 * VS32], s[2 * VS32], 8);
+				d[4 * VS32] = (d[4 * VS32] & 0xff000000U) | (s[3 * VS32] >> 8);
 #endif
-				((unsigned int*)(nbuf+noffd*VS32))[i+0*VS32] &=
-					(0xffffffff>>(32-dec));
-				((unsigned int*)(nbuf+noffd*VS32))[i+0*VS32] |=
-					(((unsigned int*)src)[i+j*4*VS32+0*VS32] << dec);
-				((unsigned int*)(nbuf+noffd*VS32))[i+1*VS32] = BITALIGN(
-					((unsigned int*)src)[i+j*4*VS32+1*VS32],
-					((unsigned int*)src)[i+j*4*VS32+0*VS32], 32 - dec);
-				((unsigned int*)(nbuf+noffd*VS32))[i+2*VS32] = BITALIGN(
-					((unsigned int*)src)[i+j*4*VS32+2*VS32],
-					((unsigned int*)src)[i+j*4*VS32+1*VS32], 32 - dec);
-				((unsigned int*)(nbuf+noffd*VS32))[i+3*VS32] = BITALIGN(
-					((unsigned int*)src)[i+j*4*VS32+3*VS32],
-					((unsigned int*)src)[i+j*4*VS32+2*VS32], 32 - dec);
-				((unsigned int*)(nbuf+noffd*VS32))[i+4*VS32] &=
-					(0xffffffff<<dec);
-				((unsigned int*)(nbuf+noffd*VS32))[i+4*VS32] |=
-					(((unsigned int*)src)[i+j*4*VS32+3*VS32] >> (32-dec));
-			}
-			else
-			{
-				((unsigned int*)(nbuf+noff*VS32))[i+0*VS32] =
-					((unsigned int*)src)[i+j*4*VS32+0*VS32];
-				((unsigned int*)(nbuf+noff*VS32))[i+1*VS32] =
-					((unsigned int*)src)[i+j*4*VS32+1*VS32];
-				((unsigned int*)(nbuf+noff*VS32))[i+2*VS32] =
-					((unsigned int*)src)[i+j*4*VS32+2*VS32];
-				((unsigned int*)(nbuf+noff*VS32))[i+3*VS32] =
-					((unsigned int*)src)[i+j*4*VS32+3*VS32];
 			}
 		}
 	}
-
 }
 
 static MAYBE_INLINE void dispatch(unsigned char buffers[8][64*MD5_SSE_NUM_KEYS],

Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ