>From 748ca68e96c22374771f789a70118189ba3c1a24 Mon Sep 17 00:00:00 2001 From: magnum Date: Tue, 24 Apr 2012 02:14:03 +0200 Subject: [PATCH 1/3] Raw-sha1-openssl vectorized (just for getting the hang of it). Not (much) optimised yet, just textbook example of how to vectorize. --- src/opencl/sha1_kernel.cl | 100 +++++++++++++++++++++++++------------------- src/opencl_rawsha1_fmt.c | 8 +++- 2 files changed, 63 insertions(+), 45 deletions(-) diff --git a/src/opencl/sha1_kernel.cl b/src/opencl/sha1_kernel.cl index 9a1ac9d..5baef73 100644 --- a/src/opencl/sha1_kernel.cl +++ b/src/opencl/sha1_kernel.cl @@ -9,8 +9,16 @@ This is free software, and you are welcome to redistribute it under certain conditions; as expressed here http://www.gnu.org/licenses/gpl-2.0.html + + vectorized by magnum in 2012 just for fun */ +// Just to make sure we don't accidentally add byte store +// uchar4 is not byte store! +#ifdef cl_khr_byte_addressable_store +#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : disable +#endif + #define K0 0x5A827999 #define K1 0x6ED9EBA1 #define K2 0x8F1BBCDC @@ -22,20 +30,14 @@ #define H4 0x10325476 #define H5 0xC3D2E1F0 -#ifndef uint32_t -#define uint32_t unsigned int -#endif - - __kernel void sha1_crypt_kernel(__global uint *data_info,__global char *plain_key, __global uint *digest){ int t, gid, msg_pad; int i, stop, mmod; uint ulen; - uint W[16], temp, A,B,C,D,E; + uint4 W[16], temp, A,B,C,D,E; uint num_keys = data_info[1]; - gid = get_global_id(0); - msg_pad = gid * data_info[0]; + gid = 4 * get_global_id(0); A = H1; B = H2; @@ -46,50 +48,56 @@ __kernel void sha1_crypt_kernel(__global uint *data_info,__global char *plain_ke for (t = 1; t < 15; t++){ W[t] = 0x00000000; } - for(i = 0; i < data_info[0] && ((uchar) plain_key[msg_pad + i]) != 0x0 ; i++){ - } - stop = i / 4 ; - for (t = 0 ; t < stop ; t++){ - W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24; - W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16; - W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8; - W[t] |= (uchar) plain_key[msg_pad + t * 4 + 3]; - } - mmod = i % 4; - if ( mmod == 3){ - W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24; - W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16; - W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8; - W[t] |= ((uchar) 0x80) ; - } else if (mmod == 2) { - W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24; - W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16; - W[t] |= 0x8000 ; - } else if (mmod == 1) { - W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24; - W[t] |= 0x800000 ; - } else if (mmod == 0){ - W[t] = 0x80000000 ; - } - ulen = (i * 8) & 0xFFFFFFFF; - W[15] = ulen ; +#define P(j) \ + msg_pad = (gid + j) * data_info[0]; \ + for(i = 0; i < data_info[0] && (plain_key[msg_pad + i]) != 0x0; i++); \ + stop = i / 4;\ + for (t = 0; t < stop; t++){\ + W[t].s##j = (plain_key[msg_pad + t * 4]) << 24;\ + W[t].s##j |= (plain_key[msg_pad + t * 4 + 1]) << 16;\ + W[t].s##j |= (plain_key[msg_pad + t * 4 + 2]) << 8;\ + W[t].s##j |= plain_key[msg_pad + t * 4 + 3];\ + }\ + mmod = i % 4;\ + if ( mmod == 3){\ + W[t].s##j = (plain_key[msg_pad + t * 4]) << 24;\ + W[t].s##j |= (plain_key[msg_pad + t * 4 + 1]) << 16;\ + W[t].s##j |= (plain_key[msg_pad + t * 4 + 2]) << 8;\ + W[t].s##j |= ((uchar) 0x80);\ + } else if (mmod == 2) {\ + W[t].s##j = (plain_key[msg_pad + t * 4]) << 24;\ + W[t].s##j |= (plain_key[msg_pad + t * 4 + 1]) << 16;\ + W[t].s##j |= 0x8000;\ + } else if (mmod == 1) {\ + W[t].s##j = (plain_key[msg_pad + t * 4]) << 24;\ + W[t].s##j |= 0x800000;\ + } else if (mmod == 0){\ + W[t].s##j = 0x80000000;\ + }\ + ulen = (i * 8) & 0xFFFFFFFF;\ + W[15].s##j = ulen; + P(0); + P(1); + P(2); + P(3); #undef R #define R(t) \ ( \ temp = W[(t - 3) & 0x0F] ^ W[(t - 8) & 0x0F] ^ \ W[(t - 14) & 0x0F] ^ W[ t & 0x0F], \ - ( W[t & 0x0F] = rotate((int)temp,1) ) \ + ( W[t & 0x0F] = rotate(temp,1) ) \ ) #undef P #define P(a,b,c,d,e,x) \ { \ - e += rotate((int)a,5) + F(b,c,d) + K + x; b = rotate((int)b,30);\ + e += rotate(a,5) + F(b,c,d) + K + x; b = rotate(b,30); \ } -#define F(x,y,z) (z ^ (x & (y ^ z))) +//#define F(x,y,z) (z ^ (x & (y ^ z))) +#define F(x, y, z) bitselect((z), (y), (x)) #define K 0x5A827999 P( A, B, C, D, E, W[0] ); @@ -196,10 +204,16 @@ __kernel void sha1_crypt_kernel(__global uint *data_info,__global char *plain_ke #undef K #undef F - digest[gid] = as_uint(as_uchar4(A + H1).wzyx); - digest[gid+1*num_keys] = as_uint(as_uchar4(B + H2).wzyx); - digest[gid+2*num_keys] = as_uint(as_uchar4(C + H3).wzyx); - digest[gid+3*num_keys] = as_uint(as_uchar4(D + H4).wzyx); - digest[gid+4*num_keys] = as_uint(as_uchar4(E + H5).wzyx); +#undef P +#define P(j) \ + digest[gid+j] = (A + H1).s##j; \ + digest[(gid+j)+1*num_keys] = (B + H2).s##j; \ + digest[(gid+j)+2*num_keys] = (C + H3).s##j;\ + digest[(gid+j)+3*num_keys] = (D + H4).s##j;\ + digest[(gid+j)+4*num_keys] = (E + H5).s##j + P(0); + P(1); + P(2); + P(3); } diff --git a/src/opencl_rawsha1_fmt.c b/src/opencl_rawsha1_fmt.c index 6b88ec8..4b25d75 100644 --- a/src/opencl_rawsha1_fmt.c +++ b/src/opencl_rawsha1_fmt.c @@ -52,7 +52,7 @@ cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys, data_i static cl_uint *partial_hashes; static cl_uint *res_hashes; static char *saved_plain; -static size_t global_work_size = SHA_NUM_KEYS; +static size_t global_work_size = SHA_NUM_KEYS / 4; static unsigned int datai[2]; static int have_full_hashes; @@ -151,7 +151,7 @@ static void create_clobj(int kpc){ datai[0] = PLAINTEXT_LENGTH; datai[1] = kpc; - global_work_size = kpc; + global_work_size = kpc / 4; } static void release_clobj(void){ @@ -277,6 +277,10 @@ static void *binary(char *ciphertext){ atoi16[ARCH_INDEX(ciphertext[i * 2])] * 16 + atoi16[ARCH_INDEX(ciphertext[i * 2 + 1])]; } + /* We swap once here, and avoid billions of + swaps in the inner loop. */ + alter_endianity(realcipher, BINARY_SIZE); + return (void *) realcipher; } -- 1.7.5.4