Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [day] [month] [year] [list]
Date: Tue, 24 Apr 2012 21:49:14 +0200
From: magnum <john.magnum@...hmail.com>
To: john-dev@...ts.openwall.com
Subject: Vectorizing OpenCL

I had a go at vectorizing raw-sha1-opencl just to see how it's done. It
was *extremely* easy (I had a sneak at Lukas' phpass for hints).

I expected it to boost AMD GPU's but it doesn't (not on Cedar at least,
12% drop). It had a devastating effect on 9600GT, about half the speed.
and GTX580 had a 20% drop.

For some reason, Intel's OpenCL-CPU compiler fails to vectorize it (ie.
make SSE2 out of it). That's strange. But the AMD CPU compiler seemed to
get the hint, it doesn't say anything in the build log but got like 3x
faster (though it was much slower than Intel before the change).

All in all, there seems to be no point in using this code. I'm posting
it here as an example. Nothing is optimised.

magnum

>From 748ca68e96c22374771f789a70118189ba3c1a24 Mon Sep 17 00:00:00 2001
From: magnum <john.magnum@...hmail.com>
Date: Tue, 24 Apr 2012 02:14:03 +0200
Subject: [PATCH 1/3] Raw-sha1-openssl vectorized (just for getting the hang
 of it). Not (much) optimised yet, just textbook example
 of how to vectorize.

---
 src/opencl/sha1_kernel.cl |  100 +++++++++++++++++++++++++-------------------
 src/opencl_rawsha1_fmt.c  |    8 +++-
 2 files changed, 63 insertions(+), 45 deletions(-)

diff --git a/src/opencl/sha1_kernel.cl b/src/opencl/sha1_kernel.cl
index 9a1ac9d..5baef73 100644
--- a/src/opencl/sha1_kernel.cl
+++ b/src/opencl/sha1_kernel.cl
@@ -9,8 +9,16 @@
    This is free software, and you are welcome to redistribute it
    under certain conditions; as expressed here 
    http://www.gnu.org/licenses/gpl-2.0.html
+
+   vectorized by magnum in 2012 just for fun
 */
 
+// Just to make sure we don't accidentally add byte store
+// uchar4 is not byte store!
+#ifdef cl_khr_byte_addressable_store
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : disable
+#endif
+
 #define K0  0x5A827999
 #define K1  0x6ED9EBA1
 #define K2  0x8F1BBCDC
@@ -22,20 +30,14 @@
 #define H4 0x10325476
 #define H5 0xC3D2E1F0
 
-#ifndef uint32_t
-#define uint32_t unsigned int
-#endif
-
-
 __kernel void sha1_crypt_kernel(__global uint *data_info,__global char *plain_key,  __global uint *digest){
     int t, gid, msg_pad;
     int i, stop, mmod;
     uint ulen;
-    uint W[16], temp, A,B,C,D,E;
+	uint4 W[16], temp, A,B,C,D,E;
     uint num_keys = data_info[1];
     
-    gid = get_global_id(0);
-    msg_pad = gid * data_info[0];
+	gid = 4 * get_global_id(0);
 
     A = H1;
     B = H2;
@@ -46,50 +48,56 @@ __kernel void sha1_crypt_kernel(__global uint *data_info,__global char *plain_ke
     for (t = 1; t < 15; t++){
 	W[t] = 0x00000000;
     }
-    for(i = 0; i < data_info[0] && ((uchar) plain_key[msg_pad + i]) != 0x0 ; i++){
-    }
 
-    stop = i / 4 ;
-    for (t = 0 ; t < stop ; t++){
-        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
-        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
-        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
-        W[t] |= (uchar)  plain_key[msg_pad + t * 4 + 3];
-    }
-    mmod = i % 4;
-    if ( mmod == 3){
-        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
-        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
-        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
-        W[t] |=  ((uchar) 0x80) ;
-    } else if (mmod == 2) {
-        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
-        W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
-        W[t] |=  0x8000 ;
-    } else if (mmod == 1) {
-        W[t] = ((uchar)  plain_key[msg_pad + t * 4]) << 24;
-        W[t] |=  0x800000 ;
-    } else if (mmod == 0){
-        W[t] =  0x80000000 ;
-    }
-    ulen = (i * 8) & 0xFFFFFFFF;
-    W[15] =  ulen ;   
+#define P(j) \
+	msg_pad = (gid + j) * data_info[0]; \
+	for(i = 0; i < data_info[0] && (plain_key[msg_pad + i]) != 0x0; i++); \
+		stop = i / 4;\
+		for (t = 0; t < stop; t++){\
+			W[t].s##j = (plain_key[msg_pad + t * 4]) << 24;\
+			W[t].s##j |= (plain_key[msg_pad + t * 4 + 1]) << 16;\
+			W[t].s##j |= (plain_key[msg_pad + t * 4 + 2]) << 8;\
+			W[t].s##j |= plain_key[msg_pad + t * 4 + 3];\
+		}\
+		mmod = i % 4;\
+		if ( mmod == 3){\
+			W[t].s##j = (plain_key[msg_pad + t * 4]) << 24;\
+			W[t].s##j |= (plain_key[msg_pad + t * 4 + 1]) << 16;\
+			W[t].s##j |= (plain_key[msg_pad + t * 4 + 2]) << 8;\
+			W[t].s##j |=  ((uchar) 0x80);\
+		} else if (mmod == 2) {\
+			W[t].s##j = (plain_key[msg_pad + t * 4]) << 24;\
+			W[t].s##j |= (plain_key[msg_pad + t * 4 + 1]) << 16;\
+			W[t].s##j |=  0x8000;\
+		} else if (mmod == 1) {\
+			W[t].s##j = (plain_key[msg_pad + t * 4]) << 24;\
+			W[t].s##j |=  0x800000;\
+		} else if (mmod == 0){\
+			W[t].s##j =  0x80000000;\
+		}\
+		ulen = (i * 8) & 0xFFFFFFFF;\
+		W[15].s##j = ulen;
+	P(0);
+	P(1);
+	P(2);
+	P(3);
 
 #undef R
 #define R(t)                                              \
 (                                                         \
     temp = W[(t -  3) & 0x0F] ^ W[(t - 8) & 0x0F] ^       \
            W[(t - 14) & 0x0F] ^ W[ t      & 0x0F],        \
-    ( W[t & 0x0F] = rotate((int)temp,1) )                 \
+		( W[t & 0x0F] = rotate(temp,1) ) \
 )
 
 #undef P
 #define P(a,b,c,d,e,x)                                    \
 {                                                         \
-    e += rotate((int)a,5) + F(b,c,d) + K + x; b = rotate((int)b,30);\
+		e += rotate(a,5) + F(b,c,d) + K + x; b = rotate(b,30); \
 }
 
-#define F(x,y,z) (z ^ (x & (y ^ z)))
+//#define F(x,y,z) (z ^ (x & (y ^ z)))
+#define F(x, y, z) bitselect((z), (y), (x))
 #define K 0x5A827999
   
   P( A, B, C, D, E, W[0]  );
@@ -196,10 +204,16 @@ __kernel void sha1_crypt_kernel(__global uint *data_info,__global char *plain_ke
 
 #undef K
 #undef F
-  digest[gid] = as_uint(as_uchar4(A + H1).wzyx);
-  digest[gid+1*num_keys] = as_uint(as_uchar4(B + H2).wzyx);
-  digest[gid+2*num_keys] = as_uint(as_uchar4(C + H3).wzyx);
-  digest[gid+3*num_keys] = as_uint(as_uchar4(D + H4).wzyx);
-  digest[gid+4*num_keys] = as_uint(as_uchar4(E + H5).wzyx);
 
+#undef P
+#define P(j) \
+	digest[gid+j] = (A + H1).s##j; \
+	digest[(gid+j)+1*num_keys] = (B + H2).s##j; \
+	digest[(gid+j)+2*num_keys] = (C + H3).s##j;\
+	digest[(gid+j)+3*num_keys] = (D + H4).s##j;\
+	digest[(gid+j)+4*num_keys] = (E + H5).s##j
+	P(0);
+	P(1);
+	P(2);
+	P(3);
 }
diff --git a/src/opencl_rawsha1_fmt.c b/src/opencl_rawsha1_fmt.c
index 6b88ec8..4b25d75 100644
--- a/src/opencl_rawsha1_fmt.c
+++ b/src/opencl_rawsha1_fmt.c
@@ -52,7 +52,7 @@ cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys, data_i
 static cl_uint *partial_hashes;
 static cl_uint *res_hashes;
 static char *saved_plain;
-static size_t global_work_size = SHA_NUM_KEYS;
+static size_t global_work_size = SHA_NUM_KEYS / 4;
 static unsigned int datai[2];
 static int have_full_hashes;
 
@@ -151,7 +151,7 @@ static void create_clobj(int kpc){
 
 	datai[0] = PLAINTEXT_LENGTH;
 	datai[1] = kpc;
-	global_work_size = kpc;
+	global_work_size = kpc / 4;
 }
 
 static void release_clobj(void){
@@ -277,6 +277,10 @@ static void *binary(char *ciphertext){
 		    atoi16[ARCH_INDEX(ciphertext[i * 2])] * 16 +
 		    atoi16[ARCH_INDEX(ciphertext[i * 2 + 1])];
 	}
+	/* We swap once here, and avoid billions of
+	   swaps in the inner loop. */
+	alter_endianity(realcipher, BINARY_SIZE);
+
 	return (void *) realcipher;
 }
 
-- 
1.7.5.4


Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ