Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date: Sat, 7 Jul 2012 13:31:06 +0400
From: Solar Designer <solar@...nwall.com>
To: john-dev@...ts.openwall.com
Subject: optimized mscash2-opencl

Sayantan, magnum -

I was puzzled by the fact that changing the "manual" rotates to rotate()
in pbkdf2_kernel.cl made it twice as slow on HD 7970 (at least).  Today I
looked into this.  It turns out that Sayantan's version of the code
heavily relied on the compiler doing some non-trivial optimizations,
including figuring out that two of the four SHA-1 computations could be
moved out of the 10k-iterations loop.  Somehow the attempted change to
use rotate() was just enough to prevent that specific optimization.

Anyway, I've optimized the code to avoid relying on the compiler doing
this, and I made several other optimizations as well.  In the current
pbkdf2_kernel.cl the uses of rotate() and bitselect() no longer result
in any slowdown; however, they still don't result in any speedup
either, which is puzzling.  Was the compiler good enough to generate the
proper instructions anyway, or does it still not do that?  We need to
examine the generated code to find out - at least IL if not native.  Sayantan -
this is now a task for you.

Before optimizations:

OpenCL platform 1: AMD Accelerated Parallel Processing, 2 device(s).
Using device 0: Tahiti
Optimal Work Group Size:256
Kernel Execution Speed (Higher is better):1.403122
Benchmarking: M$ Cache Hash 2 (DCC2) PBKDF2-HMAC-SHA-1 [OpenCL]... DONE
Raw:    92304 c/s real, 92467 c/s virtual

OpenCL platform 0: NVIDIA CUDA, 1 device(s).
Using device 0: GeForce GTX 570
Optimal Work Group Size:512
Kernel Execution Speed (Higher is better):0.416847
Benchmarking: M$ Cache Hash 2 (DCC2) PBKDF2-HMAC-SHA-1 [OpenCL]... DONE
Raw:    26900 c/s real, 26900 c/s virtual

After:

OpenCL platform 1: AMD Accelerated Parallel Processing, 2 device(s).
Using device 0: Tahiti
Optimal Work Group Size:256
Kernel Execution Speed (Higher is better):1.492774
Benchmarking: M$ Cache Hash 2 (DCC2) PBKDF2-HMAC-SHA-1 [OpenCL]... DONE
Raw:    97814 c/s real, 97632 c/s virtual

OpenCL platform 0: NVIDIA CUDA, 1 device(s).
Using device 0: GeForce GTX 570
Optimal Work Group Size:128
Kernel Execution Speed (Higher is better):0.491235
Benchmarking: M$ Cache Hash 2 (DCC2) PBKDF2-HMAC-SHA-1 [OpenCL]... DONE
Raw:    31852 c/s real, 31813 c/s virtual

This is +6% on AMD and +18% on NVIDIA.

Actual run:

$ ./john -i=alpha ~/john/contest-2011/hashes-all.txt-1.mscash2 -fo=mscash2-opencl -pla=1
OpenCL platform 1: AMD Accelerated Parallel Processing, 2 device(s).
Using device 0: Tahiti
Optimal Work Group Size:128
Kernel Execution Speed (Higher is better):1.492764
Loaded 1152 password hashes with 1090 different salts (M$ Cache Hash 2 (DCC2) PBKDF2-HMAC-SHA-1 [OpenCL])
guesses: 0  time: 0:00:00:12 0.00%  c/s: 21260  trying: bara - choedia
guesses: 0  time: 0:00:00:15 0.00%  c/s: 52393  trying: bara - choedia
salart           (gemignani)
guesses: 1  time: 0:00:00:17 0.00%  c/s: 59208  trying: bara - choedia
starter          (bevilaqua)
guesses: 2  time: 0:00:00:23 0.00%  c/s: 68177  trying: bara - choedia
guesses: 2  time: 0:00:01:43 0.00%  c/s: 96292  trying: bara - choedia
moones           (alexino)
guesses: 3  time: 0:00:02:02 0.00%  c/s: 96526  trying: bara - choedia
guesses: 3  time: 0:00:03:14 0.00%  c/s: 99710  trying: bara - choedia
assica           (bersamina)
mingui           (abisheva)
guesses: 5  time: 0:00:03:54 0.00%  c/s: 101588  trying: bara - choedia
annico           (boediman)
guesses: 6  time: 0:00:04:21 0.00%  c/s: 101194  trying: bara - choedia
stephat          (bamigboye)
guesses: 7  time: 0:00:04:56 0.00%  c/s: 100800  trying: bara - choedia
storine          (arient)
guesses: 8  time: 0:00:05:39 0.00%  c/s: 100352  trying: bara - choedia
aritta           (chamieh)
streles          (aquinde)
monies           (bercasio)
merrate          (figuera)
meless           (fiander)
starine          (clavier)
stomara          (elhadidi)
stronie          (elizan)
shoria           (daveii)
mistom           (bhuriwale)
alamel           (deblasis)
ashame           (bareis)
arandy           (ghazalie)
samali           (baubie)
stronia          (binduhewa)
metale           (bazier)
mereko           (aleksi)
guesses: 25  time: 0:00:12:01 0.00%  c/s: 101062  trying: bara - choedia
stramos          (empabido)
artico           (fallangie)
ashona           (estacion)
arishi           (elvina)
sherie           (dilawer)
andrin           (alawieh)
guesses: 31  time: 0:00:17:06 0.00%  c/s: 101869  trying: bara - choedia
artie            (heilemann)
merens           (heinzmann)
standan          (gilead)
artal            (adrienne)
anness           (beccaria)
guesses: 36  time: 0:00:18:32 0.00%  c/s: 101776  trying: bara - choedia
shomos           (basie)
mandia           (artillery)
annane           (azizieh)
guesses: 39  time: 0:00:19:23 0.00%  c/s: 101600  trying: bara - choedia
stepine          (hemmati)
guesses: 40  time: 0:00:20:27 0.00%  c/s: 101404  trying: bara - choedia
sarone           (bangie)
ashoon           (abhulimen)
storten          (akinremi)
misamo           (gravelin)
guesses: 44  time: 0:00:21:03 0.00%  c/s: 101483  trying: bara - choedia
stepand          (egnario)
guesses: 45  time: 0:00:21:48 0.00%  c/s: 101356  trying: bara - choedia

Default Adapter - AMD Radeon HD 7900 Series
                  Sensor 0: Temperature - 86.00 C

Default Adapter - AMD Radeon HD 7900 Series
                            Core (MHz)    Memory (MHz)
           Current Clocks :    925           1375
             Current Peak :    925           1375
  Configurable Peak Range : [300-1125]     [150-1575]
                 GPU load :    98%

Alexander

diff --git a/src/opencl/pbkdf2_kernel.cl b/src/opencl/pbkdf2_kernel.cl
index 9612e9f..18a236e 100644
--- a/src/opencl/pbkdf2_kernel.cl
+++ b/src/opencl/pbkdf2_kernel.cl
@@ -42,11 +42,9 @@
 }
 #endif
 
-#define S1(x) ((x << 1) | ((x ) >> 31))
-
-#define S5(x) ((x << 5) | ((x ) >> 27))
-
-#define S30(x) ((x << 30) | ((x ) >> 2))
+#define S1(x) rotate((x), (uint)1)
+#define S5(x) rotate((x), (uint)5)
+#define S30(x) rotate((x), (uint)30)
 
 #define R0                                              \
 (                                                       \
@@ -348,7 +346,8 @@
 
 inline void SHA1(__private uint *A,__private uint *W)
 {
-#define F(x,y,z) (z ^ (x & (y ^ z)))
+//#define F(x,y,z) (z ^ (x & (y ^ z)))
+#define F(x,y,z) bitselect(z, y, x)
 #define K 0x5A827999
 	SHA1_part0(A[0],A[1],A[2],A[3],A[4],W);
 #undef K
@@ -402,26 +401,26 @@ inline void SHA1_digest(__private uint *A,__private uint *W)
 
 }
 
-inline void hmac_sha1(__private uint *ipad,__private uint *opad,__private uint *state,private uint *buf, __private uint *temp_char){
-	
-        uint A[5],W[16];
-        
-        GET_WORD_32_BE(W[0], ipad, 0);
-	GET_WORD_32_BE(W[1], ipad, 1);
-	GET_WORD_32_BE(W[2], ipad, 2);
-	GET_WORD_32_BE(W[3], ipad, 3);
-	GET_WORD_32_BE(W[4], ipad, 4);
-	GET_WORD_32_BE(W[5], ipad, 5);
-	GET_WORD_32_BE(W[6], ipad, 6);
-	GET_WORD_32_BE(W[7], ipad, 7);
-	GET_WORD_32_BE(W[8], ipad, 8);
-	GET_WORD_32_BE(W[9], ipad, 9);
-	GET_WORD_32_BE(W[10], ipad, 10);
-	GET_WORD_32_BE(W[11], ipad, 11);
-	GET_WORD_32_BE(W[12], ipad, 12);
-	GET_WORD_32_BE(W[13], ipad, 13);
-	GET_WORD_32_BE(W[14], ipad, 14);
-	GET_WORD_32_BE(W[15], ipad, 15);
+inline void sha1_pad(__private uint *pad, __private uint *state)
+{
+	uint A[5], W[16];
+
+	GET_WORD_32_BE(W[0], pad, 0);
+	GET_WORD_32_BE(W[1], pad, 1);
+	GET_WORD_32_BE(W[2], pad, 2);
+	GET_WORD_32_BE(W[3], pad, 3);
+	GET_WORD_32_BE(W[4], pad, 4);
+	GET_WORD_32_BE(W[5], pad, 5);
+	GET_WORD_32_BE(W[6], pad, 6);
+	GET_WORD_32_BE(W[7], pad, 7);
+	GET_WORD_32_BE(W[8], pad, 8);
+	GET_WORD_32_BE(W[9], pad, 9);
+	GET_WORD_32_BE(W[10], pad, 10);
+	GET_WORD_32_BE(W[11], pad, 11);
+	GET_WORD_32_BE(W[12], pad, 12);
+	GET_WORD_32_BE(W[13], pad, 13);
+	GET_WORD_32_BE(W[14], pad, 14);
+	GET_WORD_32_BE(W[15], pad, 15);
 
 	A[0] = INIT_SHA1_A;
 	A[1] = INIT_SHA1_B;
@@ -429,7 +428,7 @@ inline void hmac_sha1(__private uint *ipad,__private uint *opad,__private uint *
 	A[3] = INIT_SHA1_D;
 	A[4] = INIT_SHA1_E;
 
-SHA1(A,W);
+	SHA1(A, W);
 
 	A[0] += INIT_SHA1_A;
 	A[1] += INIT_SHA1_B;
@@ -442,6 +441,17 @@ SHA1(A,W);
 	state[2] = A[2];
 	state[3] = A[3];
 	state[4] = A[4];
+}
+
+inline void hmac_sha1(__private uint *istate, __private uint *ostate, __private uint *buf)
+{
+	uint A[5], W[16];
+
+	A[0] = istate[0];
+	A[1] = istate[1];
+	A[2] = istate[2];
+	A[3] = istate[3];
+	A[4] = istate[4];
 
 	GET_WORD_32_BE(W[0], buf, 0);
 	GET_WORD_32_BE(W[1], buf, 1);
@@ -460,13 +470,13 @@ SHA1(A,W);
 	GET_WORD_32_BE(W[14], buf, 14);
 	GET_WORD_32_BE(W[15], buf, 15);
 
-SHA1(A,W);
+	SHA1(A, W);
 
-	A[0] += state[0];
-	A[1] += state[1];
-	A[2] += state[2];
-	A[3] += state[3];
-	A[4] += state[4];
+	A[0] += istate[0];
+	A[1] += istate[1];
+	A[2] += istate[2];
+	A[3] += istate[3];
+	A[4] += istate[4];
 
 	PUT_WORD_32_BE(A[0], buf, 0);
 	PUT_WORD_32_BE(A[1], buf, 1);
@@ -478,42 +488,11 @@ SHA1(A,W);
 
 	PUT_WORD_32_BE(0x2A0, buf, 15);
 
-	GET_WORD_32_BE(W[0], opad, 0);
-	GET_WORD_32_BE(W[1], opad, 1);
-	GET_WORD_32_BE(W[2], opad, 2);
-	GET_WORD_32_BE(W[3], opad, 3);
-	GET_WORD_32_BE(W[4], opad, 4);
-	GET_WORD_32_BE(W[5], opad, 5);
-	GET_WORD_32_BE(W[6], opad, 6);
-	GET_WORD_32_BE(W[7], opad, 7);
-	GET_WORD_32_BE(W[8], opad, 8);
-	GET_WORD_32_BE(W[9], opad, 9);
-	GET_WORD_32_BE(W[10], opad, 10);
-	GET_WORD_32_BE(W[11], opad, 11);
-	GET_WORD_32_BE(W[12], opad, 12);
-	GET_WORD_32_BE(W[13], opad, 13);
-	GET_WORD_32_BE(W[14], opad, 14);
-	GET_WORD_32_BE(W[15], opad, 15);
-
-	A[0] = INIT_SHA1_A;
-	A[1] = INIT_SHA1_B;
-	A[2] = INIT_SHA1_C;
-	A[3] = INIT_SHA1_D;
-	A[4] = INIT_SHA1_E;
-
-SHA1(A,W);
-
-	A[0] += INIT_SHA1_A;
-	A[1] += INIT_SHA1_B;
-	A[2] += INIT_SHA1_C;
-	A[3] += INIT_SHA1_D;
-	A[4] += INIT_SHA1_E;
-
-	state[0] = A[0];
-	state[1] = A[1];
-	state[2] = A[2];
-	state[3] = A[3];
-	state[4] = A[4];
+	A[0] = ostate[0];
+	A[1] = ostate[1];
+	A[2] = ostate[2];
+	A[3] = ostate[3];
+	A[4] = ostate[4];
 
 	GET_WORD_32_BE(W[0], buf, 0);
 	GET_WORD_32_BE(W[1], buf, 1);
@@ -521,31 +500,99 @@ SHA1(A,W);
 	GET_WORD_32_BE(W[3], buf, 3);
 	GET_WORD_32_BE(W[4], buf, 4);
 	W[5] = 0x80000000;
-        W[6]=0;
-	W[7]=0;
-	W[8]=0;
-	W[9]=0;
-	W[10]=0;
-	W[11]=0;
-	W[12]=0;
-	W[13]=0;
-	W[14]=0;
-	W[15] = 0x2A0;  
-
-SHA1_digest(A,W);
-
-	A[0] += state[0];
-	A[1] += state[1];
-	A[2] += state[2];
-	A[3] += state[3];
-	A[4] += state[4];
-
-        PUT_WORD_32_BE(A[0], temp_char, 0);
-	PUT_WORD_32_BE(A[1], temp_char, 1);
-	PUT_WORD_32_BE(A[2], temp_char, 2);
-	PUT_WORD_32_BE(A[3], temp_char, 3);
-	PUT_WORD_32_BE(A[4], temp_char, 4);
+        W[6] = 0;
+	W[7] = 0;
+	W[8] = 0;
+	W[9] = 0;
+	W[10] = 0;
+	W[11] = 0;
+	W[12] = 0;
+	W[13] = 0;
+	W[14] = 0;
+	W[15] = 0x2A0;
+
+	SHA1_digest(A, W);
+
+	A[0] += ostate[0];
+	A[1] += ostate[1];
+	A[2] += ostate[2];
+	A[3] += ostate[3];
+	A[4] += ostate[4];
+
+        PUT_WORD_32_BE(A[0], buf, 0);
+	PUT_WORD_32_BE(A[1], buf, 1);
+	PUT_WORD_32_BE(A[2], buf, 2);
+	PUT_WORD_32_BE(A[3], buf, 3);
+	PUT_WORD_32_BE(A[4], buf, 4);
+}
 
+inline void hmac_sha1_iter(__private uint *istate, __private uint *ostate, __private uint *buf, __private uint *out)
+{
+	unsigned int i;
+	uint A[5], W[16];
+
+	for (i = 1; i < ITERATIONS; i++) {
+		W[0] = buf[0];
+		W[1] = buf[1];
+		W[2] = buf[2];
+		W[3] = buf[3];
+		W[4] = buf[4];
+		W[5] = 0x80000000;
+		W[6] = 0;
+		W[7] = 0;
+		W[8] = 0;
+		W[9] = 0;
+		W[10] = 0;
+		W[11] = 0;
+		W[12] = 0;
+		W[13] = 0;
+		W[14] = 0;
+		W[15] = 0x2A0;
+
+		A[0] = istate[0];
+		A[1] = istate[1];
+		A[2] = istate[2];
+		A[3] = istate[3];
+		A[4] = istate[4];
+
+		SHA1_digest(A, W);
+
+		W[0] = A[0] + istate[0];
+		W[1] = A[1] + istate[1];
+		W[2] = A[2] + istate[2];
+		W[3] = A[3] + istate[3];
+		W[4] = A[4] + istate[4];
+		W[5] = 0x80000000;
+		W[6] = 0;
+		W[7] = 0;
+		W[8] = 0;
+		W[9] = 0;
+		W[10] = 0;
+		W[11] = 0;
+		W[12] = 0;
+		W[13] = 0;
+		W[14] = 0;
+		W[15] = 0x2A0;
+
+		A[0] = ostate[0];
+		A[1] = ostate[1];
+		A[2] = ostate[2];
+		A[3] = ostate[3];
+		A[4] = ostate[4];
+
+		SHA1_digest(A, W);
+
+		buf[0] = A[0] + ostate[0];
+		buf[1] = A[1] + ostate[1];
+		buf[2] = A[2] + ostate[2];
+		buf[3] = A[3] + ostate[3];
+		buf[4] = A[4] + ostate[4];
+
+		out[0] ^= buf[0];
+		out[1] ^= buf[1];
+		out[2] ^= buf[2];
+		out[3] ^= buf[3];
+	}
 }
 
 __kernel 
@@ -576,9 +623,7 @@ void PBKDF2 ( const __global unsigned int *pass_global,
 
 #define SHA1_DIGEST_LENGTH_by_4 SHA1_DIGEST_LENGTH/4
 	
-	uint temp_char[SHA1_DIGEST_LENGTH_by_4];
-	
-	unsigned int state[5],out[4];
+	unsigned int istate[5], ostate[5], out[4];
 	
 	unsigned int ipad[16];
 	
@@ -623,42 +668,24 @@ void PBKDF2 ( const __global unsigned int *pass_global,
 		opad[j] = opad[j] ^ pass[j];
 	  }
 
-	hmac_sha1(ipad,opad,state,buf,temp_char);
-	
-	out[0] = temp_char[0];
-	out[1] = temp_char[1];
-	out[2] = temp_char[2];
-	out[3] = temp_char[3];
+	sha1_pad(ipad, istate);
+	sha1_pad(opad, ostate);
 
-        for (i = 0; i < 16; i++) 
-		buf[i] = 0;
-	
-     
-	for (i = 1; i < ITERATIONS; i++) {
-			
-		
-		buf[0] = temp_char[0];
-		buf[1] = temp_char[1];
-		buf[2] = temp_char[2];
-		buf[3] = temp_char[3];
-		buf[4] = temp_char[4];
+	hmac_sha1(istate, ostate, buf);
 
-		buf[SHA1_DIGEST_LENGTH_by_4] =  0x80 | buf[SHA1_DIGEST_LENGTH_by_4];
-		
-		PUT_WORD_32_BE(0x2A0, buf, 15);
+        for (i = 0; i < 5; i++) 
+		GET_WORD_32_BE(buf[i], buf, i);
 
-		hmac_sha1(ipad,opad,state,buf,temp_char);
+	out[0] = buf[0];
+	out[1] = buf[1];
+	out[2] = buf[2];
+	out[3] = buf[3];
 
-		out[0] ^= temp_char[0];
-		out[1] ^= temp_char[1];
-		out[2] ^= temp_char[2];
-		out[3] ^= temp_char[3];
+	hmac_sha1_iter(istate, ostate, buf, out);
 
-	}
-	
-	i=id*4;
-	out_global[i++]=out[0];
-	out_global[i++]=out[1];
-	out_global[i++]=out[2];
-	out_global[i]=out[3];
+	i = id * 4;
+	PUT_WORD_32_BE(out[0], out_global, i++);
+	PUT_WORD_32_BE(out[1], out_global, i++);
+	PUT_WORD_32_BE(out[2], out_global, i++);
+	PUT_WORD_32_BE(out[3], out_global, i);
 }

Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ