Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date: Sun, 18 Mar 2012 08:11:54 -0300
From: Claudio André <claudioandre.br@...il.com>
To: john-dev@...ts.openwall.com
Subject: New patch for OpenCL SHA-512

Hi, attached.

It uses john.conf for LWS e KPC (if available), fix the format and 
algorithm name, etc.
It also uses fast memory to keep temporary buffers (improve performance).

Numbers here:
=> CPU
Benchmarking: crypt SHA-512 (rounds=5000) [OpenSSL 64/64]... DONE
Raw:    440 c/s real, 440 c/s virtual

=> OMP
Benchmarking: crypt SHA-512 (rounds=5000) [OpenSSL 64/64]... (6xOMP) DONE
Raw:    2254 c/s real, 378 c/s virtual

=> OpenCL CPU
Benchmarking: crypt SHA-512 (rounds=5000) [OpenCL]... DONE
Raw:    1422 c/s real, 237 c/s virtual

=> OpenCL GPU
Local work size (LWS) 64, Keys per crypt (KPC) 65536
Benchmarking: crypt SHA-512 (rounds=5000) [OpenCL]... DONE
Raw:    1228 c/s real, 936228 c/s virtual


Claudio


>From e5462c8b0a9e8c895900dad4d43e21d51512cb51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Claudio=20Andr=C3=A9?= <claudio.andre@...reios.net.br>
Date: Sun, 18 Mar 2012 07:58:35 -0300
Subject: [PATCH] Improvement: get the LWS and KPC from john's config file.
 Improvement: check if group size is valid. Cosmetic:
 changes in benchmark comment. See commit: 9d3fe1414e.
 Performance: put all working variables in __local memory
 address space. Fixed: allow temp buffer to use all the
 available local memory. Fixed: use small KPC values when
 running on CPU. On real cracking tasks, big values behaves
 badly. Fixed: format and algorithm names. Fixed: copyright
 information.

---
 run/john.conf                    |    4 +
 src/common-opencl.c              |   41 +++++++
 src/common-opencl.h              |    5 +
 src/opencl/cryptsha512_kernel.cl |  233 ++++++++++++++++++++------------------
 src/opencl_cryptsha512.h         |   35 +++++-
 src/opencl_cryptsha512_fmt.c     |  189 +++++++++++++++++++------------
 6 files changed, 320 insertions(+), 187 deletions(-)

diff --git a/run/john.conf b/run/john.conf
index 0304c5b..eda8ccf 100644
--- a/run/john.conf
+++ b/run/john.conf
@@ -81,6 +81,10 @@ Device = 0
 #ssha_LWS = 512
 #ssha_KPC = 8192
 
+# For Crypt sha-512.
+cryptsha512_LWS = 64
+cryptsha512_KPC = 8192
+
 
 # A user defined character class is named with a single digit, ie. 0..9. After
 # the equal-sign, just list all characters that this class should match. You
diff --git a/src/common-opencl.c b/src/common-opencl.c
index 96dfec2..76e3048 100644
--- a/src/common-opencl.c
+++ b/src/common-opencl.c
@@ -127,6 +127,47 @@ void opencl_init(char *kernel_filename, unsigned int dev_id,
 	build_kernel(dev_id);
 }
 
+cl_ulong get_local_memory_size(int dev_id)
+{    
+        cl_ulong size;        
+        HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_LOCAL_MEM_SIZE,
+                sizeof(cl_ulong), &size, NULL),
+                "Error querying CL_DEVICE_LOCAL_MEM_SIZE");    
+        
+        return size;
+}
+
+size_t get_max_work_group_size(int dev_id)
+{          
+        size_t max_group_size;
+
+        HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_MAX_WORK_GROUP_SIZE, 
+                       sizeof (max_group_size), &max_group_size, NULL),
+                       "Error querying CL_DEVICE_MAX_WORK_GROUP_SIZE");  
+        
+        return max_group_size;
+}
+
+cl_uint get_max_compute_units(int dev_id)
+{    
+        cl_uint size;        
+        HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_MAX_COMPUTE_UNITS,
+                sizeof(cl_uint), &size, NULL),
+                "Error querying CL_DEVICE_LOCAL_MEM_SIZE");    
+        
+        return size;
+}
+
+cl_device_type get_device_type(int dev_id)
+{    
+        cl_device_type type;        
+        HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_TYPE,
+                sizeof(cl_device_type), &type, NULL),
+                "Error querying CL_DEVICE_LOCAL_MEM_SIZE");    
+        
+        return type;
+}
+
 char *get_error_name(cl_int cl_error)
 {
 	static char *err_1[] =
diff --git a/src/common-opencl.h b/src/common-opencl.h
index 44d408e..1ba03d0 100644
--- a/src/common-opencl.h
+++ b/src/common-opencl.h
@@ -36,6 +36,11 @@ size_t max_group_size;
 void opencl_init(char *kernel_filename, unsigned int dev_id,
                  unsigned int platform_id);
 
+cl_ulong get_local_memory_size(int dev_id);
+size_t get_max_work_group_size(int dev_id);
+cl_uint get_max_compute_units(int dev_id);
+cl_device_type get_device_type(int dev_id);
+
 char *get_error_name(cl_int cl_error);
 
 void handle_clerror(cl_int cl_error, const char *message, const char *file, int line);
diff --git a/src/opencl/cryptsha512_kernel.cl b/src/opencl/cryptsha512_kernel.cl
index 95e1f12..c8a0c86 100644
--- a/src/opencl/cryptsha512_kernel.cl
+++ b/src/opencl/cryptsha512_kernel.cl
@@ -1,11 +1,17 @@
 /*
-* This software is Copyright (c) 2011 Lukas Odzioba <lukas dot odzioba at gmail dot com> 
-* and it is hereby released to the general public under the following terms:
-* Redistribution and use in source and binary forms, with or without modification, are permitted.
-*/
+ * Developed by Claudio André <claudio.andre at correios.net.br> in 2012   
+ * Based on source code provided by Lukas Odzioba
+ *
+ * This software is:
+ * Copyright (c) 2011 Lukas Odzioba <lukas dot odzioba at gmail dot com> 
+ * Copyright (c) 2012 Claudio André <claudio.andre at correios.net.br>
+ * and it is hereby released to the general public under the following terms:
+ * Redistribution and use in source and binary forms, with or without modification, are permitted.
+ * 
+ * This program comes with ABSOLUTELY NO WARRANTY; express or implied .
+ */
 
 #include "opencl_cryptsha512.h"
-//#pragma OPENCL EXTENSION cl_amd_printf : enable
 
 __constant uint64_t k[] = {
     0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, 0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL,
@@ -30,7 +36,7 @@ __constant uint64_t k[] = {
     0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL, 0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL,
 };
 
-void init_ctx(sha512_ctx * ctx) {
+void init_ctx(__local sha512_ctx * ctx) {
     ctx->H[0] = 0x6a09e667f3bcc908UL;
     ctx->H[1] = 0xbb67ae8584caa73bUL;
     ctx->H[2] = 0x3c6ef372fe94f82bUL;
@@ -43,24 +49,27 @@ void init_ctx(sha512_ctx * ctx) {
     ctx->buflen = 0;
 }
 
-void memcpy_1(uint8_t * dest, const uint8_t * src, const size_t n) {
+void memcpy_08(__local uint8_t * dest, __local const uint8_t * src, const size_t n) {
     for (int i = 0; i < n; i++)
         dest[i] = src[i];
 }
 
-void memcpy (uint8_t * dest, buffer_64 * src, const size_t n) {
+void memcpy_64(__local uint8_t * dest, __local buffer_64 * src, const size_t n) {
     for (int i = 0; i < n; i++)
         dest[i] = src->mem_08[i];
 }
 
-void insert_to_buffer(sha512_ctx * ctx, const uint8_t * string,
+void insert_to_buffer(__local sha512_ctx * ctx, 
+                      __local const uint8_t * string,
                       const uint8_t len) {
-    uint8_t *d = ctx->buffer->mem_08 + ctx->buflen;  //Position ctx->buffer[buflen] (in char size)
-    memcpy_1(d, string, len);
+    __local uint8_t *d;
+    d = ctx->buffer->mem_08 + ctx->buflen;  //ctx->buffer[buflen] (in char size)
+
+    memcpy_08(d, string, len);
     ctx->buflen += len;
 }
 
-void sha512_block(sha512_ctx * ctx) {
+void sha512_block(__local sha512_ctx * ctx) {
     int i;
     uint64_t a = ctx->H[0];
     uint64_t b = ctx->H[1];
@@ -73,13 +82,14 @@ void sha512_block(sha512_ctx * ctx) {
 
     uint64_t w[16];
 
-    uint64_t *data = ctx->buffer->mem_64;  //The same as buffer[0]
-    //#pragma unroll 16
+    __local uint64_t *data = ctx->buffer->mem_64;  //The same as buffer[0]
+
+    #pragma unroll 16
     for (i = 0; i < 16; i++)
         w[i] = SWAP64(data[i]);
 
     uint64_t t1, t2;
-    //#pragma unroll 16
+    #pragma unroll 16
     for (i = 0; i < 16; i++) {
         t1 = k[i] + w[i] + h + Sigma1(e) + Ch(e, f, g);
         t2 = Maj(a, b, c) + Sigma0(a);
@@ -94,7 +104,7 @@ void sha512_block(sha512_ctx * ctx) {
         a = t1 + t2;
     }
 
-
+    #pragma unroll 64
     for (i = 16; i < 80; i++) {
         w[i & 15] = sigma1(w[(i - 2) & 15]) + sigma0(w[(i - 15) & 15]) + w[(i - 16) & 15] + w[(i - 7) & 15];
         t1 = k[i] + w[i & 15] + h + Sigma1(e) + Ch(e, f, g);
@@ -120,51 +130,36 @@ void sha512_block(sha512_ctx * ctx) {
     ctx->H[7] += h;
 }
 
-void ctx_append_1(sha512_ctx * ctx) {
+void ctx_append_1(__local sha512_ctx * ctx) {
     uint32_t length = ctx->buflen;
     int i = 127 - length;
-    uint8_t *d = ctx->buffer->mem_08 + length;
+    __local uint8_t *d = ctx->buffer->mem_08 + length;
     *d++ = 0x80;
 
     while (i--) {
         d[i] = 0;
     }
-
-/* TODO: 
-     while(  length%4!=0)
-160     {  *d  =0;
-161     i--;
-162     }
-163     x=(uint32_t*)d;
-164     while(i>0)
-165     {  i-=4;
-166         *x  =0;
-167     }
-*/
 }
 
-void ctx_add_length(sha512_ctx * ctx) {
-    uint64_t *blocks = ctx->buffer->mem_64;
+void ctx_add_length(__local sha512_ctx * ctx) {
+    __local uint64_t *blocks = ctx->buffer->mem_64;
     blocks[15] = SWAP64((uint64_t) (ctx->total * 8));
 }
 
-void finish_ctx(sha512_ctx * ctx) {
+void finish_ctx(__local sha512_ctx * ctx) {
     ctx_append_1(ctx);
     ctx_add_length(ctx);
     ctx->buflen = 0;
 }
 
-void ctx_update(sha512_ctx * ctx, uint8_t *string, uint8_t len) {
+void ctx_update(__local sha512_ctx * ctx, __local uint8_t *string, uint8_t len) {
+
     ctx->total += len;
     uint8_t startpos = ctx->buflen;
-    uint8_t partsize;
-    if (startpos + len <= 128) {
-        partsize = len;
-    } else
-        partsize = 128 - startpos;
-
-    insert_to_buffer(ctx, string, partsize);
-    if (ctx->buflen == 128) {
+
+    insert_to_buffer(ctx, string, (startpos + len <= 128 ? len : 128 - startpos));
+
+    if (ctx->buflen == 128) {  //Branching.
         uint8_t offset = 128 - startpos;
         sha512_block(ctx);
         ctx->buflen = 0;
@@ -172,23 +167,25 @@ void ctx_update(sha512_ctx * ctx, uint8_t *string, uint8_t len) {
     }
 }
 
-void clear_ctx_buffer(sha512_ctx * ctx) {
+void clear_ctx_buffer(__local sha512_ctx * ctx) {
 
-    uint32_t *w = ctx->buffer->mem_32;
-    //#pragma unroll 30
-    for (int i = 0; i < 30; i++) //TODO: why 30? Not 32?
+    __local uint32_t *w = ctx->buffer->mem_32;
+
+    #pragma unroll 32
+    for (int i = 0; i < 32; i++)
         w[i] = 0;
 
     ctx->buflen = 0;
 }
 
-void sha512_digest(sha512_ctx * ctx, uint64_t * result) {
-    uint8_t i;
+void sha512_digest(__local sha512_ctx * ctx, __local uint64_t * result) {
+
     if (ctx->buflen <= 111) { //data+0x80+datasize fits in one 1024bit block
         finish_ctx(ctx);
-        sha512_block(ctx);
+
     } else {
         uint8_t moved = 1;
+
         if (ctx->buflen < 128) { //data and 0x80 fits in one block
             ctx_append_1(ctx);
             moved = 0;
@@ -198,118 +195,138 @@ void sha512_digest(sha512_ctx * ctx, uint64_t * result) {
         if (moved)
             ctx->buffer->mem_08[0] = 0x80; //append 1,the rest is already clean
         ctx_add_length(ctx);
-        sha512_block(ctx);
     }
-    //#pragma unroll 8
-    for (i = 0; i < 8; i++)
+    sha512_block(ctx);
+
+    #pragma unroll 8
+    for (int i = 0; i < 8; i++)
         result[i] = SWAP64(ctx->H[i]);
 }
 
-void sha512crypt(uint8_t *pass, uint8_t passlength,
-                 crypt_sha512_salt cuda_salt, 
+void sha512crypt(__local working_memory * tmp_working,
                  __global crypt_sha512_hash * output) {
 
-    buffer_64 alt_result[8], temp_result[8];
-    int i;
-    sha512_ctx ctx;
+#define pass        tmp_working->pass_info.v
+#define passlength  tmp_working->pass_info.length
+#define salt        tmp_working->salt_info.salt
+#define saltlen     tmp_working->salt_info.saltlen
+#define rounds      tmp_working->salt_info.rounds
+#define alt_result  tmp_working->alt_result
+#define temp_result tmp_working->temp_result
+#define s_sequence  tmp_working->s_sequence
+#define p_sequence  tmp_working->p_sequence
+#define ctx         tmp_working->ctx_info
+
     init_ctx(&ctx);
 
     ctx_update(&ctx, pass, passlength);
-    ctx_update(&ctx, cuda_salt.salt, cuda_salt.saltlen);
+    ctx_update(&ctx, salt, saltlen);
     ctx_update(&ctx, pass, passlength);
 
     sha512_digest(&ctx, alt_result->mem_64);
     init_ctx(&ctx);
 
     ctx_update(&ctx, pass, passlength);
-    ctx_update(&ctx, cuda_salt.salt, cuda_salt.saltlen);
+    ctx_update(&ctx, salt, saltlen);
     ctx_update(&ctx, alt_result->mem_08, passlength);
 
-    for (i = passlength; i > 0; i >>= 1) {
-        if ((i & 1) != 0)
-            ctx_update(&ctx, alt_result->mem_08, 64);
-        else
-            ctx_update(&ctx, pass, passlength);
+    for (int i = passlength; i > 0; i >>= 1) {
+        ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : pass),
+                         ((i & 1) != 0 ? 64 :                 passlength));
     }
     sha512_digest(&ctx, alt_result->mem_64);
     init_ctx(&ctx);
 
-    for (i = 0; i < passlength; i++)
+    for (int i = 0; i < passlength; i++)
         ctx_update(&ctx, pass, passlength);
 
     sha512_digest(&ctx, temp_result->mem_64);
-
-    uint8_t sp_sequence[16 + 4];
-    uint8_t *p_sequence = sp_sequence;
-    memcpy(p_sequence, temp_result, passlength);
+    memcpy_64(p_sequence, temp_result, passlength);
 
     init_ctx(&ctx);
     
     /* For every character in the password add the entire password.  */
-    for (i = 0; i < 16 + (alt_result->mem_08)[0]; i++)  //Analyse, TÁ CERTO?###
-        ctx_update(&ctx, cuda_salt.salt, cuda_salt.saltlen);
+    for (int i = 0; i < 16 + (alt_result->mem_08)[0]; i++)
+        ctx_update(&ctx, salt, saltlen);
 
     /* Finish the digest.  */
     sha512_digest(&ctx, temp_result->mem_64);
-
-    uint8_t saltlength = cuda_salt.saltlen;
-
-    uint8_t ss_sequence[16 + 4];
-    uint8_t *s_sequence = ss_sequence;
-    memcpy(s_sequence, temp_result, saltlength);
+    memcpy_64(s_sequence, temp_result, saltlen);
 
     /* Repeatedly run the collected hash value through SHA512 to
        burn CPU cycles.  */
-    for (i = 0; i < cuda_salt.rounds; i++) {
+    for (int i = 0; i < rounds; i++) {
         init_ctx(&ctx);
 
-        if ((i & 1) != 0)
-            ctx_update(&ctx, p_sequence, passlength);
-        else
-            ctx_update(&ctx, alt_result->mem_08, 64);  
+        ctx_update(&ctx, ((i & 1) != 0 ? p_sequence : alt_result->mem_08),
+                         ((i & 1) != 0 ? passlength : 64)); 
 
         if ((i % 3) != 0)
-            ctx_update(&ctx, s_sequence, saltlength);
+            ctx_update(&ctx, s_sequence, saltlen);
 
         if ((i % 7) != 0)
             ctx_update(&ctx, p_sequence, passlength);
 
-        if ((i & 1) != 0)
-            ctx_update(&ctx, alt_result->mem_08, 64);  
-        else
-            ctx_update(&ctx, p_sequence, passlength);
-
+        ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : p_sequence),
+                         ((i & 1) != 0 ? 64 :                 passlength));
         sha512_digest(&ctx, alt_result->mem_64);
     }
     //Send results to the host.
-    //#pragma unroll 8
-    for (i = 0; i < 8; i++)
-        output->v[i] = alt_result[i].mem_64[0];
+    #pragma unroll 8
+    for (int i = 0; i < 8; i++)
+        output->v[i] = alt_result[i].mem_64[0];  
 }
+#undef salt       
+#undef saltlen    
+#undef rounds   
 
 __kernel void kernel_crypt(__constant crypt_sha512_salt * hsalt,
-                           __constant crypt_sha512_password * inbuffer,
-                           __global   crypt_sha512_hash * outbuffer) {
-
-    uint8_t pass[PLAINTEXT_LENGTH];
-    crypt_sha512_salt salt_data;
+                           __global   crypt_sha512_password * inbuffer,
+                           __global   crypt_sha512_hash * outbuffer,
+                           __local    working_memory * tmp_memory) {
 
     //Get the task to be done
-    uint32_t idx = get_global_id(0);
+    uint32_t gid = get_global_id(0);
+    uint32_t lid = get_local_id(0);
 
-    //Use fast memory.
+    //Transfer data to faster memory
+    //Password information
+    tmp_memory[lid].pass_info.length = inbuffer[gid].length;
 
-    //Get password information, put in faster memory.
-    for (int i = 0; i < inbuffer[idx].length; i++)
-        pass[i] = inbuffer[idx].v[i]; 
-    
-    //Get salt information, put in faster memory.
-    salt_data.saltlen = hsalt->saltlen;
-    salt_data.rounds = hsalt->rounds;
+    #pragma unroll PLAINTEXT_LENGTH
+    for (int i = 0; i < PLAINTEXT_LENGTH; i++)
+        tmp_memory[lid].pass_info.v[i] = inbuffer[gid].v[i]; 
+ 
+    //Salt information. 
+    tmp_memory[lid].salt_info.saltlen = hsalt->saltlen;  
+    tmp_memory[lid].salt_info.rounds = hsalt->rounds;
 
-    for (int i = 0; i < salt_data.saltlen; i++)
-	salt_data.salt[i] = hsalt->salt[i];
+    #pragma unroll SALT_SIZE
+    for (int i = 0; i < SALT_SIZE; i++)
+	tmp_memory[lid].salt_info.salt[i] = hsalt->salt[i];
 
     //Do the job
-    sha512crypt(pass, inbuffer[idx].length, salt_data, &outbuffer[idx]);
+    sha512crypt(&tmp_memory[lid], &outbuffer[gid]);
 }
+
+/***
+*    To improve performance, it uses __local memory to keep working variables 
+* (password, temp buffers, etc). In SHA 512 it means about 350 bytes per 
+* "thread". It improves performance a lot, but, local memory is a scarce 
+* resource. 
+*    It means the max group size allowed in OpenCL SHA 512 is going to be 
+* 128 (hardware depended).
+*
+* Gain   Optimizations
+*  --    Basic version, private and global variables only.
+*        Transfer all the working variables to local memory.
+* -10%   Move salt to constant memory space. Keep others in local (saves memory).IGNORED.
+*  25%   Unrool main loops.
+*   5%   Unrool other loops.
+* -INF   Do the compare task on GPU.
+*
+* Conclusions
+* - Compare on GPU: CPU is more efficient for now.
+* - Salt on constant memory is not good enought.
+* - No register spilling happens after optimization.
+***/
diff --git a/src/opencl_cryptsha512.h b/src/opencl_cryptsha512.h
index f8f1852..c0b5f6a 100644
--- a/src/opencl_cryptsha512.h
+++ b/src/opencl_cryptsha512.h
@@ -1,12 +1,20 @@
 /*
-* This software is Copyright (c) 2011 Lukas Odzioba <lukas dot odzioba at gmail dot com> 
-* and it is hereby released to the general public under the following terms:
-* Redistribution and use in source and binary forms, with or without modification, are permitted.
-*/
+ * Developed by Claudio André <claudio.andre at correios.net.br> in 2012   
+ * Based on source code provided by Lukas Odzioba
+ *
+ * This software is:
+ * Copyright (c) 2011 Lukas Odzioba <lukas dot odzioba at gmail dot com> 
+ * Copyright (c) 2012 Claudio André <claudio.andre at correios.net.br>
+ * and it is hereby released to the general public under the following terms:
+ * Redistribution and use in source and binary forms, with or without modification, are permitted.
+ * 
+ * This program comes with ABSOLUTELY NO WARRANTY; express or implied .
+ */
+
 #ifndef _CRYPTSHA512_H 
 #define _CRYPTSHA512_H
 
-//Type names definition. ///TODO: move to a new file and share this new file where needed.
+//Type names definition. 
 #define uint8_t  unsigned char
 #define uint16_t unsigned short
 #define uint32_t unsigned int
@@ -22,7 +30,12 @@
 
 #define SALT_SIZE               16
 #define PLAINTEXT_LENGTH        16     
-#define KEYS_PER_CRYPT          1024*2048
+#define BINARY_SIZE             (3+16+86)       ///TODO: Magic number?
+
+#define KEYS_PER_CORE_CPU       512
+#define KEYS_PER_CORE_GPU       1024
+#define MIN_KEYS_PER_CRYPT	128
+#define MAX_KEYS_PER_CRYPT	2048*2048*128
 
 #define rol(x,n)                ((x << n) | (x >> (64-n)))
 #define ror(x,n)                ((x >> n) | (x << (64-n)))
@@ -76,4 +89,14 @@ typedef struct {
 	uint64_t v[8];		//512 bits
 } crypt_sha512_hash;
 
+typedef struct {
+        crypt_sha512_password  pass_info;
+        crypt_sha512_salt      salt_info;
+        sha512_ctx             ctx_info;
+        buffer_64              alt_result[8];
+        buffer_64              temp_result[8];
+        uint8_t                s_sequence[SALT_SIZE];
+        uint8_t                p_sequence[PLAINTEXT_LENGTH];
+} working_memory;
+
 #endif
\ No newline at end of file
diff --git a/src/opencl_cryptsha512_fmt.c b/src/opencl_cryptsha512_fmt.c
index dfaa949..2f49260 100644
--- a/src/opencl_cryptsha512_fmt.c
+++ b/src/opencl_cryptsha512_fmt.c
@@ -1,8 +1,10 @@
 /*
- * Copyright (c) 2011 Samuele Giovanni Tonon
- * samu at linuxasylum dot net
- * This program comes with ABSOLUTELY NO WARRANTY; express or
- * implied .
+ * Developed by Claudio André <claudio.andre at correios.net.br> in 2012   
+ * Based on source code provided by Samuele Giovanni Tonon
+ *
+ * Copyright (c) 2011 Samuele Giovanni Tonon <samu at linuxasylum dot net>
+ * Copyright (c) 2012 Claudio André <claudio.andre at correios.net.br>
+ * This program comes with ABSOLUTELY NO WARRANTY; express or implied .
  * This is free software, and you are welcome to redistribute it
  * under certain conditions; as expressed here 
  * http://www.gnu.org/licenses/gpl-2.0.html
@@ -10,24 +12,19 @@
 
 #include <string.h>
 #include "common-opencl.h"  
+#include "config.h"
 #include "opencl_cryptsha512.h"
 
-#define FORMAT_LABEL			"cryptsha512-opencl"
-#define FORMAT_NAME			"crypt SHA-512 OpenCL"
+#define FORMAT_LABEL			"cryptsha512-opencl" 
+#define FORMAT_NAME			"crypt SHA-512"
+#define ALGORITHM_NAME			"OpenCL"
+#define SHA_TYPE                        "SHA512"
 
-#if ARCH_BITS >= 64
-#define ALGORITHM_NAME			"OpenSSL 64/" ARCH_BITS_STR
-#else
-#define ALGORITHM_NAME			"OpenSSL 32/" ARCH_BITS_STR
-#endif
-
-#define BENCHMARK_COMMENT		" rounds=5000"
+#define BENCHMARK_COMMENT		" (rounds=5000)"
 #define BENCHMARK_LENGTH		-1
 
-#define BINARY_SIZE                     (3+16+86)       ///TODO: Magic number?
-
-#define MIN_KEYS_PER_CRYPT		1024            
-#define MAX_KEYS_PER_CRYPT		KEYS_PER_CRYPT
+#define LWS_CONFIG			"cryptsha512_LWS"
+#define KPC_CONFIG			"cryptsha512_KPC"
 
 static crypt_sha512_password            *plaintext;     // plaintext ciphertexts
 static crypt_sha512_hash                *out_hashes;    // calculated hashes
@@ -41,7 +38,7 @@ cl_mem pinned_saved_keys, pinned_partial_hashes;
 cl_command_queue queue_prof;
 cl_kernel crypt_kernel;
 
-static size_t max_keys_per_crypt = KEYS_PER_CRYPT;
+static size_t max_keys_per_crypt; //TODO: move to common-opencl? local_work_size is there.
 
 static struct fmt_tests tests[] = {
     {"$6$saltstring$svn8UoSVapNtMuq1ukKS4tPQd8iKwSMHWjl/O817G3uBnIFNjnQJuesI68u4OTLiBFdcbYEdFCoEOfaS35inz1", "Hello world!"},
@@ -52,6 +49,27 @@ static struct fmt_tests tests[] = {
     {NULL}
 }; 
 
+/* ------- Helper functions ------- */
+uint get_task_max_work_group_size(){
+    uint max_available;
+    max_available = get_local_memory_size(gpu_id) / sizeof(working_memory);
+    
+    if (max_available > get_max_work_group_size(gpu_id))
+        return get_max_work_group_size(gpu_id);
+    
+    return max_available;
+}
+
+uint get_task_max_size(){ 
+    uint max_available;
+    max_available = get_max_compute_units(gpu_id);
+
+    if (get_device_type(gpu_id) == CL_DEVICE_TYPE_CPU)
+        return max_available * KEYS_PER_CORE_CPU;
+    
+    return max_available * KEYS_PER_CORE_GPU;
+}
+
 /* ------- Create and destroy necessary objects ------- */
 static void create_clobj(int kpc) {           
     pinned_saved_keys = clCreateBuffer(context[gpu_id], 
@@ -78,7 +96,7 @@ static void create_clobj(int kpc) {
     salt_info = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, 
             sizeof(crypt_sha512_salt), NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating data_info out argument");
-    
+     
     buffer_in = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY,
             sizeof(crypt_sha512_password) * kpc, NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_keys");
@@ -93,8 +111,11 @@ static void create_clobj(int kpc) {
     HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof (cl_mem),
             (void *) &buffer_in), "Error setting argument 1");
     HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof (cl_mem),
-            (void *) &buffer_out), "Error setting argument 2");
-    
+            (void *) &buffer_out), "Error setting argument 2");     
+    HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 3,   //Fast working memory.
+            sizeof (working_memory) * local_work_size,
+            NULL), "Error setting argument 3");   
+ 
     memset(plaintext, '\0', sizeof(crypt_sha512_password) * kpc);
     salt_data.saltlen = 0;
     salt_data.rounds = 0;
@@ -143,7 +164,12 @@ static char *get_key(int index) {
 /* ------- Try to find the best configuration ------- */
 /* --
   This function could be used to calculated the best num
-  of keys per crypt for the given format
+  for the workgroup
+  Work-items that make up a work-group (also referred to 
+  as the size of the work-group) 
+  LWS should never be a big number since every work-item
+  uses about 400 bytes of local memory. Local memory 
+  is usually 32 KB
 -- */
 static void find_best_workgroup(void) {
     cl_event myEvent;
@@ -153,24 +179,25 @@ static void find_best_workgroup(void) {
     int i;
     size_t max_group_size;
 
-    clGetDeviceInfo(devices[gpu_id], CL_DEVICE_MAX_WORK_GROUP_SIZE, 
-            sizeof (max_group_size), &max_group_size, NULL);
+    max_group_size = get_max_work_group_size(gpu_id);
     queue_prof = clCreateCommandQueue(context[gpu_id], devices[gpu_id], 
             CL_QUEUE_PROFILING_ENABLE, &ret_code);
     printf("Max Group Work Size %d ", (int) max_group_size);
     local_work_size = 1;
 
     // Set keys
-    for (i = 0; i < KEYS_PER_CRYPT; i++) {
+    for (i = 0; i < get_task_max_size(); i++) {
         set_key("aaabaabaaa", i);
     }
     clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_TRUE, 0,
             sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL);
     clEnqueueWriteBuffer(queue_prof, buffer_in, CL_TRUE, 0, 
-            sizeof (crypt_sha512_password) * KEYS_PER_CRYPT, plaintext, 0, NULL, NULL);
+            sizeof (crypt_sha512_password) * get_task_max_size(), 
+            plaintext, 0, NULL, NULL);
 
     // Find minimum time
-    for (my_work_group = 1; (int) my_work_group <= (int) max_group_size; my_work_group *= 2) {
+    for (my_work_group = 1; (int) my_work_group <= (int) get_task_max_work_group_size(); 
+         my_work_group *= 2) {
         ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 
                 1, NULL, &max_keys_per_crypt, &my_work_group, 0, NULL, &myEvent);
         clFinish(queue_prof);
@@ -184,14 +211,17 @@ static void find_best_workgroup(void) {
                 sizeof (cl_ulong), &startTime, NULL);
         clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, 
                 sizeof (cl_ulong), &endTime, NULL);
-
+        clReleaseEvent (myEvent);
+        
         if ((endTime - startTime) < kernelExecTimeNs) {
             kernelExecTimeNs = endTime - startTime;
             local_work_size = my_work_group;
         }
     }
     printf("Optimal local work size %d\n", (int) local_work_size);
-    printf("(to avoid this test on next run do export LWS=%d)\n", (int) local_work_size);
+    printf("(to avoid this test on next run, put \""
+        LWS_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS
+        SUBSECTION_OPENCL "])\n", (int)local_work_size);    
     clReleaseCommandQueue(queue_prof);
 }
 
@@ -200,10 +230,10 @@ static void find_best_workgroup(void) {
   of keys per crypt for the given format
 -- */
 static void find_best_kpc(void) {
-    int num;
+    size_t num;
     cl_event myEvent;
     cl_ulong startTime, endTime, tmpTime;
-    int kernelExecTimeNs = 6969;
+    cl_ulong kernelExecTimeNs = CL_ULONG_MAX;
     cl_int ret_code;
     int optimal_kpc = MIN_KEYS_PER_CRYPT;
     int i;
@@ -211,26 +241,28 @@ static void find_best_kpc(void) {
 
     printf("Calculating best keys per crypt, this will take a while ");
     
-    for (num = MAX_KEYS_PER_CRYPT; num > MIN_KEYS_PER_CRYPT; num -= 4096) {
+    for (num = get_task_max_size(); (int) num > MIN_KEYS_PER_CRYPT; num -= 4096) {
         release_clobj();
         create_clobj(num);
         advance_cursor();
+        tmpbuffer = malloc(sizeof (crypt_sha512_hash) * num);
         queue_prof = clCreateCommandQueue(context[gpu_id], devices[gpu_id], 
                 CL_QUEUE_PROFILING_ENABLE, &ret_code);
-      
+
         // Set keys
         for (i = 0; i < num; i++) {
             set_key("aaabaabaaa", i);
         }
         clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_FALSE, 0,
                 sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL);
-        clEnqueueWriteBuffer(queue_prof, buffer_in, CL_TRUE, 0, 
-                sizeof (crypt_sha512_password) * num, plaintext, 0, NULL, NULL);
-           
+        clEnqueueWriteBuffer(queue_prof, buffer_in, CL_FALSE, 0, 
+                sizeof (crypt_sha512_password) * num, plaintext, 0, NULL, NULL); 
         ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 
-                1, NULL, &max_keys_per_crypt, &local_work_size, 0, NULL, &myEvent);
+                1, NULL, &num, &local_work_size, 0, NULL, &myEvent);
+        clEnqueueReadBuffer(queue_prof, buffer_out, CL_FALSE, 0,
+                sizeof (crypt_sha512_hash) * num, tmpbuffer, 0, NULL, NULL);
         clFinish(queue_prof);
-                
+            
         if (ret_code != CL_SUCCESS) {
             printf("Error %d\n", ret_code);
             continue;
@@ -240,17 +272,9 @@ static void find_best_kpc(void) {
         clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, 
                 sizeof (cl_ulong), &endTime, NULL);
         
+        clReleaseEvent (myEvent);
         tmpTime = endTime - startTime;
-        tmpbuffer = malloc(sizeof (cl_uint) * num);
-        
-        clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, 
-                sizeof (cl_uint) * num, tmpbuffer, 0, NULL, &myEvent);
-        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, 
-                sizeof (cl_ulong), &startTime, NULL);
-        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, 
-                sizeof (cl_ulong), &endTime, NULL);
-        tmpTime = tmpTime + (endTime - startTime);
-        
+
         if (((int) (((float) (tmpTime) / num) * 10)) <= kernelExecTimeNs) {
             kernelExecTimeNs = ((int) (((float) (tmpTime) / num) * 10));
             optimal_kpc = num;
@@ -259,8 +283,9 @@ static void find_best_kpc(void) {
         clReleaseCommandQueue(queue_prof);
     }
     printf("Optimal keys per crypt %d\n", optimal_kpc);
-    printf("(to avoid this test on next run do \"export KPC=%d\")\n", optimal_kpc);
-
+    printf("to avoid this test on next run, put \""
+        KPC_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS
+        SUBSECTION_OPENCL "])\n", optimal_kpc);
     max_keys_per_crypt = optimal_kpc;
     release_clobj();
     create_clobj(optimal_kpc);
@@ -268,36 +293,54 @@ static void find_best_kpc(void) {
 
 /* ------- Initialization  ------- */
 static void init(struct fmt_main *pFmt) {
-    char *kpc;
+    char *tmp_value;
     opencl_init("$JOHN/cryptsha512_kernel.cl", gpu_id, platform_id);
+    max_keys_per_crypt = get_task_max_size();
+    local_work_size = 0;
 
     // create kernel to execute
     crypt_kernel = clCreateKernel(program[gpu_id], "kernel_crypt", &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
 
-    if (((kpc = getenv("LWS")) == NULL) || (atoi(kpc) == 0)) {
-        create_clobj(KEYS_PER_CRYPT);
+    if ((tmp_value = cfg_get_param(SECTION_OPTIONS,
+                                   SUBSECTION_OPENCL, LWS_CONFIG)))
+        local_work_size = atoi(tmp_value);
+    
+    if ((tmp_value = getenv("LWS")))
+        local_work_size = atoi(tmp_value);
+
+    //Check if local_work_size is a valid number.
+    if (local_work_size > get_task_max_work_group_size()){
+        printf("Error: invalid local work size (LWS). Max value allowed is: %u\n" ,
+               get_task_max_work_group_size());
+        local_work_size = 0; //Force find a valid number.
+    }
+    
+    if (!local_work_size) {
+        local_work_size = get_task_max_work_group_size();
+        create_clobj(max_keys_per_crypt);
         find_best_workgroup();
         release_clobj();
-    } else {
-        local_work_size = atoi(kpc);
     }
-    if ((kpc = getenv("KPC")) == NULL) {
-        max_keys_per_crypt = KEYS_PER_CRYPT;
-        create_clobj(KEYS_PER_CRYPT);
-    } else {
-        if (atoi(kpc) == 0) {
-            //user chose to die of boredom
-            max_keys_per_crypt = KEYS_PER_CRYPT;
-            create_clobj(KEYS_PER_CRYPT); 
-            find_best_kpc();
-        } else {
-            max_keys_per_crypt = atoi(kpc);
-            create_clobj(max_keys_per_crypt);
-        }
+
+    if ((tmp_value = cfg_get_param(SECTION_OPTIONS, 
+                                   SUBSECTION_OPENCL, KPC_CONFIG)))
+        max_keys_per_crypt = atoi(tmp_value);
+
+    if ((tmp_value = getenv("KPC")))
+        max_keys_per_crypt = atoi(tmp_value);
+    
+    if (max_keys_per_crypt)
+        create_clobj(max_keys_per_crypt);
+
+    else {
+        //user chose to die of boredom
+        max_keys_per_crypt = get_task_max_size();
+        create_clobj(max_keys_per_crypt);
+        find_best_kpc();
     }
     printf("Local work size (LWS) %d, Keys per crypt (KPC) %Zd\n", 
-            (int) local_work_size, max_keys_per_crypt);   
+           (int) local_work_size, max_keys_per_crypt);   
     pFmt->params.max_keys_per_crypt = max_keys_per_crypt;
 }
 
@@ -365,7 +408,7 @@ static void set_salt(void *salt) {
         }
         offset = endp - currentsalt;
     }
-    memcpy(salt_data.salt, currentsalt + offset, 16);
+    memcpy(salt_data.salt, currentsalt + offset, SALT_SIZE);
     salt_data.saltlen = strlen((char *) salt_data.salt);
 }
 
@@ -469,7 +512,7 @@ static void crypt_all(int count) {
 
     //Read back hashes
     HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_out, CL_FALSE, 0,
-            sizeof (crypt_sha512_hash) * max_keys_per_crypt, out_hashes, 0, NULL, NULL),
+            sizeof(crypt_sha512_hash) * max_keys_per_crypt, out_hashes, 0, NULL, NULL),
             "failed in reading data back");
  
     //Do the work
@@ -477,7 +520,7 @@ static void crypt_all(int count) {
 }
 
 /* ------- Binary Hash functions group ------- */
-static int binary_hash_0(void * binary) { return *(ARCH_WORD_32 *) binary & 0xF; }
+static int binary_hash_0(void * binary) { return *(ARCH_WORD_32 *) binary & 0xF; } 
 static int binary_hash_1(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFF; }
 static int binary_hash_2(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFFF; }
 static int binary_hash_3(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFFFF; }
@@ -545,4 +588,4 @@ struct fmt_main fmt_opencl_cryptsha512 = {
         cmp_one,
         cmp_exact
     }
-};
+};
\ No newline at end of file
-- 
1.7.5.4


Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ