Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date: Wed, 18 Apr 2012 11:45:32 -0300
From: Claudio André <claudioandre.br@...il.com>
To: john-dev@...ts.openwall.com
Subject: Re: SHA-512 OpenCL (was: OpenCL tests on HD 7970)

Hi, i can't reproduce here the errors Solar is getting.

Someone (i see Solar message) has hardware and time to help? We will 
make some tests in order to find out what is wrong in these boards.

Thanks.


1. i played a lot with the source code here. I agree it is slow, but it 
works on CPU and GPU, on self-tests and real cracking.
2. attached a new patch (it includes the new method of getWorkGroupSize 
information) as discussed in the list.
3. i'm more interested in new 7xxx AMD boards and Nvidia ones. Seems to 
be the best place to look for errors.

PS: magnum, do you have NVIDIA, right? Does it work in your hardware?


>From 1fb1b372eee0a37799cdc5e6511402be4f562267 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Claudio=20Andr=C3=A9?= <claudio.andre@...reios.net.br>
Date: Wed, 18 Apr 2012 11:29:13 -0300
Subject: [PATCH] Minor improvements

---
 src/opencl/cryptsha512_kernel.cl |  230 +++++++++++++++++++++++---------------
 src/opencl_cryptsha512.h         |   57 +++++-----
 src/opencl_cryptsha512_fmt.c     |  144 +++++++++++++-----------
 3 files changed, 248 insertions(+), 183 deletions(-)

diff --git a/src/opencl/cryptsha512_kernel.cl b/src/opencl/cryptsha512_kernel.cl
index c3e7ca9..be0ddcd 100644
--- a/src/opencl/cryptsha512_kernel.cl
+++ b/src/opencl/cryptsha512_kernel.cl
@@ -49,23 +49,67 @@ void init_ctx(__local sha512_ctx * ctx) {
     ctx->buflen = 0;
 }
 
-inline void memcpy_08(__local uint8_t * dest, __local const uint8_t * src, const size_t n) {
+inline void memcpy(__local uint8_t       * dest, 
+                   __local const uint8_t * src, const size_t n) {
     for (int i = 0; i < n; i++)
         dest[i] = src[i];
 }
 
-inline void memcpy_64(__local uint8_t * dest, __local buffer_64 * src, const size_t n) {
-    for (int i = 0; i < n; i++)
-        dest[i] = src->mem_08[i];
+inline bool is_not_divisible_by_3(int n) {
+#ifndef SLOW_MODULO
+    return ((n % 3) != 0);
+
+#else
+    int sum;
+
+    do
+    {
+        sum = 0;
+
+        while(n)
+        {
+            sum += n & 3;
+            n = n >> 2;
+        }
+        n = sum;
+    } while(sum > 3);
+
+    //Result to send back.
+    return !(sum == 3 || sum == 0);
+#endif
 }
 
-void insert_to_buffer(__local sha512_ctx * ctx, 
+inline bool is_not_divisible_by_7(int n) {
+#ifndef SLOW_MODULO
+    return ((n % 7) != 0);
+
+#else
+    int sum;
+
+    do
+    {
+        sum = 0;
+
+        while(n)
+        {
+            sum += n & 7;
+            n = n >> 3;
+        }
+        n = sum;
+    } while(sum > 7);
+
+    //Result to send back.
+    return !(sum == 7 || sum == 0);
+#endif
+}
+
+void insert_to_buffer(__local sha512_ctx    * ctx, 
                       __local const uint8_t * string,
                       const uint8_t len) {
     __local uint8_t *d;
     d = ctx->buffer->mem_08 + ctx->buflen;  //ctx->buffer[buflen] (in char size)
 
-    memcpy_08(d, string, len);
+    memcpy(d, string, len);
     ctx->buflen += len;
 }
 
@@ -77,36 +121,29 @@ void sha512_block(__local sha512_ctx * ctx) {
     uint64_t e = ctx->H[4];
     uint64_t f = ctx->H[5];
     uint64_t g = ctx->H[6];
-    uint64_t h = ctx->H[7];
-
+    uint64_t h = ctx->H[7];    
+    uint64_t t1, t2;
     uint64_t w[16];
 
+#ifdef DEVICE_IS_CPU
     #pragma unroll 16
     for (int i = 0; i < 16; i++)
         w[i] = SWAP64(ctx->buffer->mem_64[i]);
-
-    uint64_t t1, t2;
-    #pragma unroll 16
-    for (int i = 0; i < 16; i++) {
-        t1 = k[i] + w[i] + h + Sigma1(e) + Ch(e, f, g);
-        t2 = Maj(a, b, c) + Sigma0(a);
-
-        h = g;
-        g = f;
-        f = e;
-        e = d + t1;
-        d = c;
-        c = b;
-        b = a;
-        a = t1 + t2;
-    }
-
-    #pragma unroll 64
-    for (int i = 16; i < 80; i++) {
-        w[i & 15] = sigma1(w[(i - 2) & 15]) + sigma0(w[(i - 15) & 15]) + w[(i - 16) & 15] + w[(i - 7) & 15];
+#else
+    ulong16  w_vector;
+    w_vector = vload16(0, ctx->buffer->mem_64); 
+    w_vector = SWAP64(w_vector);
+    vstore16(w_vector, 0, w);
+#endif
+
+    #pragma unroll 80
+    for (int i = 0; i < 80; i++) {
+
+        if (i > 15) {
+            w[i & 15] = sigma1(w[(i - 2) & 15]) + sigma0(w[(i - 15) & 15]) + w[(i - 16) & 15] + w[(i - 7) & 15];
+        }
         t1 = k[i] + w[i & 15] + h + Sigma1(e) + Ch(e, f, g);
         t2 = Maj(a, b, c) + Sigma0(a);
-
         h = g;
         g = f;
         f = e;
@@ -128,9 +165,9 @@ void sha512_block(__local sha512_ctx * ctx) {
 }
 
 void ctx_append_1(__local sha512_ctx * ctx) {
-    uint32_t length = ctx->buflen;
-    int i = 127 - length;
-    __local uint8_t *d = ctx->buffer->mem_08 + length;
+    int i = 127 - ctx->buflen;
+    __local uint8_t * d = ctx->buffer->mem_08 + ctx->buflen;
+
     *d++ = 0x80;
 
     while (i--) {
@@ -149,7 +186,8 @@ void finish_ctx(__local sha512_ctx * ctx) {
     ctx->buflen = 0;
 }
 
-void ctx_update(__local sha512_ctx * ctx, __local uint8_t *string, uint8_t len) {
+void ctx_update(__local sha512_ctx * ctx, 
+                __local uint8_t    * string, uint8_t len) {
 
     ctx->total += len;
     uint8_t startpos = ctx->buflen;
@@ -175,20 +213,22 @@ void clear_ctx_buffer(__local sha512_ctx * ctx) {
     ctx->buflen = 0;
 }
 
-void sha512_digest(__local sha512_ctx * ctx, __local uint64_t * result) {
+void sha512_digest(__local  sha512_ctx * ctx, 
+                   __local  uint64_t   * result) {
 
     if (ctx->buflen <= 111) { //data+0x80+datasize fits in one 1024bit block
         finish_ctx(ctx);
 
     } else {
-        uint8_t moved = 1;
+        bool moved = true;
 
         if (ctx->buflen < 128) { //data and 0x80 fits in one block
             ctx_append_1(ctx);
-            moved = 0;
+            moved = false;
         }
         sha512_block(ctx);
         clear_ctx_buffer(ctx);
+
         if (moved)
             ctx->buffer->mem_08[0] = 0x80; //append 1,the rest is already clean
         ctx_add_length(ctx);
@@ -200,72 +240,69 @@ void sha512_digest(__local sha512_ctx * ctx, __local uint64_t * result) {
         result[i] = SWAP64(ctx->H[i]);
 }
 
-void sha512crypt(__local working_memory * tmp_working,
+void sha512crypt(__local  working_memory    * fast_tmp_memory, 
+                 __local  crypt_sha512_salt * salt_data,                  
                  __global crypt_sha512_hash * output) {
 
-#define pass        tmp_working->pass_info.v
-#define passlength  tmp_working->pass_info.length
-#define salt        tmp_working->salt_info.salt
-#define saltlen     tmp_working->salt_info.saltlen
-#define rounds      tmp_working->salt_info.rounds
-#define alt_result  tmp_working->alt_result
-#define temp_result tmp_working->temp_result
-#define s_sequence  tmp_working->s_sequence
-#define p_sequence  tmp_working->p_sequence
-#define ctx         tmp_working->ctx_info
+#define pass        fast_tmp_memory->pass_data.pass
+#define passlen     fast_tmp_memory->pass_data.length
+#define salt        salt_data->salt
+#define saltlen     salt_data->length
+#define rounds      salt_data->rounds
+#define alt_result  fast_tmp_memory->alt_result
+#define temp_result fast_tmp_memory->temp_result
+#define p_sequence  fast_tmp_memory->p_sequence
+#define ctx         fast_tmp_memory->ctx_data
 
     init_ctx(&ctx);
 
-    ctx_update(&ctx, pass, passlength);
+    ctx_update(&ctx, pass, passlen);
     ctx_update(&ctx, salt, saltlen);
-    ctx_update(&ctx, pass, passlength);
+    ctx_update(&ctx, pass, passlen);
 
     sha512_digest(&ctx, alt_result->mem_64);
     init_ctx(&ctx);
 
-    ctx_update(&ctx, pass, passlength);
+    ctx_update(&ctx, pass, passlen);
     ctx_update(&ctx, salt, saltlen);
-    ctx_update(&ctx, alt_result->mem_08, passlength);
+    ctx_update(&ctx, alt_result->mem_08, passlen);
 
-    for (int i = passlength; i > 0; i >>= 1) {
+    for (int i = passlen; i > 0; i >>= 1) {
         ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : pass),
-                         ((i & 1) != 0 ? 64 :                 passlength));
+                         ((i & 1) != 0 ? 64 :                 passlen));
     }
     sha512_digest(&ctx, alt_result->mem_64);
     init_ctx(&ctx);
 
-    for (int i = 0; i < passlength; i++)
-        ctx_update(&ctx, pass, passlength);
-
-    sha512_digest(&ctx, temp_result->mem_64);
-    memcpy_64(p_sequence, temp_result, passlength);
+    for (int i = 0; i < passlen; i++)
+        ctx_update(&ctx, pass, passlen);
 
+    sha512_digest(&ctx, p_sequence->mem_64);
     init_ctx(&ctx);
     
-    /* For every character in the password add the entire password.  */
+    /* For every character in the password add the entire password. */
     for (int i = 0; i < 16 + (alt_result->mem_08)[0]; i++)
         ctx_update(&ctx, salt, saltlen);
 
     /* Finish the digest.  */
     sha512_digest(&ctx, temp_result->mem_64);
-    memcpy_64(s_sequence, temp_result, saltlen);
 
     /* Repeatedly run the collected hash value through SHA512 to
        burn CPU cycles.  */
     for (int i = 0; i < rounds; i++) {
         init_ctx(&ctx);
 
-        ctx_update(&ctx, ((i & 1) != 0 ? p_sequence : alt_result->mem_08),
-                         ((i & 1) != 0 ? passlength : 64)); 
+        ctx_update(&ctx, ((i & 1) != 0 ? p_sequence->mem_08 : alt_result->mem_08),
+                         ((i & 1) != 0 ? passlen : 64)); 
 
-        if ((i % 3) != 0)
-            ctx_update(&ctx, s_sequence, saltlen);
+        if (is_not_divisible_by_3(i))
+            ctx_update(&ctx, temp_result->mem_08, saltlen);
 
-        if ((i % 7) != 0)
-            ctx_update(&ctx, p_sequence, passlength);
+        if (is_not_divisible_by_7(i))
+            ctx_update(&ctx, p_sequence->mem_08, passlen);
 
-        ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : p_sequence),
-                         ((i & 1) != 0 ? 64 :                 passlength));
+        ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : p_sequence->mem_08),
+                          ((i & 1) != 0 ? 64 :                 passlen));
         sha512_digest(&ctx, alt_result->mem_64);
     }
     //Send results to the host.
@@ -276,54 +313,67 @@ void sha512crypt(__local working_memory * tmp_working,
 #undef salt       
 #undef saltlen    
 #undef rounds   
+#undef pass
 
-__kernel void kernel_crypt(__constant crypt_sha512_salt * hsalt,
-                           __global   crypt_sha512_password * inbuffer,
-                           __global   crypt_sha512_hash * outbuffer,
-                           __local    working_memory * tmp_memory) {
+__kernel void kernel_crypt(__constant crypt_sha512_salt     * informed_salt,
+                           __global   crypt_sha512_password * pass_data,
+                           __global   crypt_sha512_hash   * out_buffer,
+                           __local    crypt_sha512_salt   * salt_data,
+                           __local    working_memory      * fast_tmp_memory) {
 
     //Get the task to be done
-    uint32_t gid = get_global_id(0);
-    uint32_t lid = get_local_id(0);
+    size_t gid = get_global_id(0);
+    size_t lid = get_local_id(0);
 
     //Transfer data to faster memory
     //Password information
-    tmp_memory[lid].pass_info.length = inbuffer[gid].length;
+    fast_tmp_memory[lid].pass_data.length = pass_data[gid].length;
 
     #pragma unroll PLAINTEXT_LENGTH
     for (int i = 0; i < PLAINTEXT_LENGTH; i++)
-        tmp_memory[lid].pass_info.v[i] = inbuffer[gid].v[i]; 
+        fast_tmp_memory[lid].pass_data.pass[i] = pass_data[gid].pass[i]; 
  
-    //Salt information. 
-    tmp_memory[lid].salt_info.saltlen = hsalt->saltlen;  
-    tmp_memory[lid].salt_info.rounds = hsalt->rounds;
-
-    #pragma unroll SALT_SIZE
-    for (int i = 0; i < SALT_SIZE; i++)
-	tmp_memory[lid].salt_info.salt[i] = hsalt->salt[i];
+    if (lid == 0){
+        //Copy salt information to fast local memory. Only once in a group.
+        salt_data->length = informed_salt->length;  
+        salt_data->rounds = informed_salt->rounds;
+
+        #pragma unroll SALT_SIZE
+        for (int i = 0; i < SALT_SIZE; i++)
+            salt_data->salt[i] = informed_salt->salt[i];
+    }
 
     //Do the job
-    sha512crypt(&tmp_memory[lid], &outbuffer[gid]);
+    sha512crypt(&fast_tmp_memory[lid], salt_data, &out_buffer[gid]);
 }
 
 /***
 *    To improve performance, it uses __local memory to keep working variables 
 * (password, temp buffers, etc). In SHA 512 it means about 350 bytes per 
-* "thread". It improves performance a lot, but, local memory is a scarce 
+* "thread". It improves performance, but, local memory is a scarce 
 * resource. 
 *    It means the max group size allowed in OpenCL SHA 512 is going to be 
-* 128 (hardware depended).
+* 64 (it depends on hardware local memory size).
 *
 * Gain   Optimizations
 *  --    Basic version, private and global variables only.
 *        Transfer all the working variables to local memory.
-* -10%   Move salt to constant memory space. Keep others in local (saves memory).IGNORED.
+* -10%   Move salt to constant memory space. Keep others in local (saves memory). BAD.
 *  25%   Unrool main loops.
 *   5%   Unrool other loops.
-* -INF   Do the compare task on GPU.
+*  ###   Do the compare task on GPU.
+*   5%   Remove some unecessary code.
+*  ###   Move almost everything to global and local memory. BAD.
+*   1%   Use vector types in SHA_Block in some variables. 
 *
 * Conclusions
 * - Compare on GPU: CPU is more efficient for now.
 * - Salt on constant memory is not good enought.
-* - No register spilling happens after optimization.
-***/
+* - No register spilling happens after optimization. Although, might need to use less registers.
+* - Tried to use "only" local and global memory. Got register spilling again.
+* - Vectorized do not give better performance, but result in less instructions.
+*   In reality, I'm not doing vector operations (doing the same thing in n bytes), 
+*   so should not expect big gains anyway.
+*   If i have a lot of memory, i might solve more than one hash at once 
+*   (and use more vectors). But it is not possible (at least for a while).
+***/
\ No newline at end of file
diff --git a/src/opencl_cryptsha512.h b/src/opencl_cryptsha512.h
index e920c6e..7799c0e 100644
--- a/src/opencl_cryptsha512.h
+++ b/src/opencl_cryptsha512.h
@@ -18,7 +18,7 @@
 #define uint8_t  unsigned char
 #define uint16_t unsigned short
 #define uint32_t unsigned int
-#define uint64_t ulong  //Tip: unsigned long long int failed on compile (AMD).
+#define uint64_t unsigned long  //Tip: unsigned long long int failed on compile (AMD).
 
 //Functions.
 #define MAX(x,y)                ((x) > (y) ? (x) : (y))
@@ -29,7 +29,7 @@
 #define ROUNDS_MAX              999999999
 
 #define SALT_SIZE               16
-#define PLAINTEXT_LENGTH        16     
+#define PLAINTEXT_LENGTH        16
 #define BINARY_SIZE             (3+16+86)       ///TODO: Magic number?
 
 #define KEYS_PER_CORE_CPU       512
@@ -37,8 +37,8 @@
 #define MIN_KEYS_PER_CRYPT	128
 #define MAX_KEYS_PER_CRYPT	2048*2048*128
 
-#define rol(x,n)                rotate(x,n) 
-#define ror(x,n)                rotate(x, (ulong) 64-n)
+#define rol(x,n)                rotate(x, n) 
+#define ror(x,n)                rotate(x, (uint64_t) 64-n)
 #define Ch(x,y,z)               ((x & y) ^ ( (~x) & z))
 #define Maj(x,y,z)              ((x & y) ^ (x & z) ^ (y & z))
 #define Sigma0(x)               ((ror(x,28)) ^ (ror(x,34)) ^ (ror(x,39)))
@@ -46,7 +46,7 @@
 #define sigma0(x)               ((ror(x,1))  ^ (ror(x,8))  ^ (x>>7))
 #define sigma1(x)               ((ror(x,19)) ^ (ror(x,61)) ^ (x>>6))
 
-# define SWAP64(n) \
+#define SWAP64(n) \
   (((n) << 56)					\
    | (((n) & 0xff00) << 40)			\
    | (((n) & 0xff0000) << 24)			\
@@ -58,42 +58,39 @@
 
 //Data types.
 typedef union {
-    uint8_t  mem_08[8];
-    uint16_t mem_16[4];
-    uint32_t mem_32[2];
-    uint64_t mem_64[1];
+    uint8_t             mem_08[8];
+    uint16_t            mem_16[4];
+    uint32_t            mem_32[2];
+    uint64_t            mem_64[1];
 } buffer_64;
 
 typedef struct {
-	uint64_t  H[8];          //512 bits
-	uint32_t  total;
-	uint32_t  buflen;
-	buffer_64 buffer[16];	//1024bits
-} sha512_ctx;
-
-typedef struct {
-	uint32_t rounds;
-	uint8_t  saltlen;
-	uint8_t  salt[SALT_SIZE];
+	uint32_t        rounds;
+	uint32_t        length;
+	uint8_t         salt[SALT_SIZE];
 } crypt_sha512_salt;
 
 typedef struct {
-	uint8_t length;
-	uint8_t v[PLAINTEXT_LENGTH];
+	uint32_t        length;
+	uint8_t         pass[PLAINTEXT_LENGTH];
 } crypt_sha512_password;
 
 typedef struct {
-	uint64_t v[8];		//512 bits
+	uint64_t        v[8];		//512 bits
 } crypt_sha512_hash;
 
 typedef struct {
-        crypt_sha512_password  pass_info;
-        crypt_sha512_salt      salt_info;
-        sha512_ctx             ctx_info;
-        buffer_64              alt_result[8];
-        buffer_64              temp_result[8];
-        uint8_t                s_sequence[SALT_SIZE];
-        uint8_t                p_sequence[PLAINTEXT_LENGTH];
-} working_memory;
+	uint64_t        H[8];           //512 bits
+	uint32_t        total;
+	uint32_t        buflen;
+	buffer_64       buffer[16];	//1024bits  
+} sha512_ctx;
 
+typedef struct {
+        sha512_ctx              ctx_data;
+        crypt_sha512_password   pass_data;
+        buffer_64               alt_result[8];
+        buffer_64               temp_result[8];
+        buffer_64               p_sequence[8];
+} working_memory;
 #endif
\ No newline at end of file
diff --git a/src/opencl_cryptsha512_fmt.c b/src/opencl_cryptsha512_fmt.c
index a26c6ea..125263b 100644
--- a/src/opencl_cryptsha512_fmt.c
+++ b/src/opencl_cryptsha512_fmt.c
@@ -26,13 +26,13 @@
 #define LWS_CONFIG			"cryptsha512_LWS"
 #define KPC_CONFIG			"cryptsha512_KPC"
 
-static crypt_sha512_password            *plaintext;     // plaintext ciphertexts
-static crypt_sha512_hash                *out_hashes;    // calculated hashes
-static crypt_sha512_salt                salt_data;
+static crypt_sha512_salt                salt;
+static crypt_sha512_password            *plaintext;        // plaintext ciphertexts
+static crypt_sha512_hash                *calculated_hash;  // calculated hashes
 
-cl_mem salt_info;       //Salt information.
-cl_mem buffer_in;       //Plaintext buffer.
-cl_mem buffer_out;      //Hash keys (output)
+cl_mem salt_buffer;        //Salt information.
+cl_mem pass_buffer;        //Plaintext buffer.
+cl_mem hash_buffer;        //Hash keys (output)
 cl_mem pinned_saved_keys, pinned_partial_hashes;
 
 cl_command_queue queue_prof;
@@ -50,13 +50,26 @@ static struct fmt_tests tests[] = {
     {NULL}
 }; 
 
+size_t get_current_work_group_size(int dev_id)
+{
+        size_t max_group_size;
+
+        HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[dev_id],
+                       CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_group_size), 
+                       &max_group_size, NULL),
+                       "Error querying clGetKernelWorkGroupInfo");
+
+        return max_group_size;
+}
+
 /* ------- Helper functions ------- */
 uint get_task_max_work_group_size(){
     uint max_available;
-    max_available = get_local_memory_size(gpu_id) / sizeof(working_memory);
-    
-    if (max_available > get_max_work_group_size(gpu_id))
-        return get_max_work_group_size(gpu_id);
+    max_available = get_local_memory_size(gpu_id) /
+            (sizeof(working_memory) + sizeof(crypt_sha512_salt));
+
+    if (max_available > get_current_work_group_size(gpu_id))
+        return get_current_work_group_size(gpu_id);
     
     return max_available;
 }
@@ -88,38 +101,42 @@ static void create_clobj(int kpc) {
             sizeof(crypt_sha512_hash) * kpc, NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating page-locked memory pinned_partial_hashes");
 
-    out_hashes = (crypt_sha512_hash *) clEnqueueMapBuffer(queue[gpu_id],
+    calculated_hash = (crypt_sha512_hash *) clEnqueueMapBuffer(queue[gpu_id],
             pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, 
             sizeof(crypt_sha512_hash) * kpc, 0, NULL, NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error mapping page-locked memory out_hashes");
 
     // create arguments (buffers)
-    salt_info = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, 
+    salt_buffer = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, 
             sizeof(crypt_sha512_salt), NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating data_info out argument");
      
-    buffer_in = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY,
+    pass_buffer = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY,
             sizeof(crypt_sha512_password) * kpc, NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_keys");
 
-    buffer_out = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY,
+    hash_buffer = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY,
             sizeof(crypt_sha512_hash) * kpc, NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_out");
 
     //Set kernel arguments
     HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 0, sizeof (cl_mem),
-            (void *) &salt_info), "Error setting argument 0");
+            (void *) &salt_buffer), "Error setting argument 0");
     HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof (cl_mem),
-            (void *) &buffer_in), "Error setting argument 1");
+            (void *) &pass_buffer), "Error setting argument 1");
     HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof (cl_mem),
-            (void *) &buffer_out), "Error setting argument 2");     
+            (void *) &hash_buffer), "Error setting argument 2");
     HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 3,   //Fast working memory.
+            sizeof (crypt_sha512_salt),
+            NULL), "Error setting argument 3");
+    HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 4,   //Fast working memory.
             sizeof (working_memory) * local_work_size,
-            NULL), "Error setting argument 3");   
- 
+            NULL), "Error setting argument 4");
+        
     memset(plaintext, '\0', sizeof(crypt_sha512_password) * kpc);
-    salt_data.saltlen = 0;
-    salt_data.rounds = 0;
+    memset(salt.salt, '\0', SALT_SIZE);
+    salt.length = 0;
+    salt.rounds = 0;
     max_keys_per_crypt = kpc;
 }
 
@@ -127,18 +144,18 @@ static void release_clobj(void) {
     cl_int ret_code;
 
     ret_code = clEnqueueUnmapMemObject(queue[gpu_id], pinned_partial_hashes,
-            out_hashes, 0, NULL, NULL);
+            calculated_hash, 0, NULL, NULL);
     HANDLE_CLERROR(ret_code, "Error Ummapping out_hashes");
     
     ret_code = clEnqueueUnmapMemObject(queue[gpu_id], pinned_saved_keys,
             plaintext, 0, NULL, NULL);
     HANDLE_CLERROR(ret_code, "Error Ummapping saved_plain");
     
-    ret_code = clReleaseMemObject(salt_info);
+    ret_code = clReleaseMemObject(salt_buffer);
     HANDLE_CLERROR(ret_code, "Error Releasing data_info");
-    ret_code = clReleaseMemObject(buffer_in);
+    ret_code = clReleaseMemObject(pass_buffer);
     HANDLE_CLERROR(ret_code, "Error Releasing buffer_keys");
-    ret_code = clReleaseMemObject(buffer_out);
+    ret_code = clReleaseMemObject(hash_buffer);
     HANDLE_CLERROR(ret_code, "Error Releasing buffer_out");
     
     ret_code = clReleaseMemObject(pinned_saved_keys);
@@ -152,13 +169,13 @@ static void release_clobj(void) {
 static void set_key(char *key, int index) {
     int len = strlen(key);
     plaintext[index].length = len;
-    memcpy(plaintext[index].v, key, len);
+    memcpy(plaintext[index].pass, key, len);
     new_keys = 1;
 }
 
 static char *get_key(int index) {
     static char ret[PLAINTEXT_LENGTH + 1];
-    memcpy(ret, plaintext[index].v, PLAINTEXT_LENGTH);
+    memcpy(ret, plaintext[index].pass, PLAINTEXT_LENGTH);
     ret[plaintext[index].length] = '\0';
     return ret;
 }
@@ -192,10 +209,10 @@ static void find_best_workgroup(void) {
     for (i = 0; i < get_task_max_size(); i++) {
         set_key("aaabaabaaa", i);
     }
-    HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_TRUE, 0,
-            sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL),
+    HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_TRUE, 0,
+            sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL),
             "Failed in clEnqueueWriteBuffer I");
-    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, buffer_in, CL_TRUE, 0, 
+    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_TRUE, 0, 
             sizeof (crypt_sha512_password) * get_task_max_size(), 
             plaintext, 0, NULL, NULL),
             "Failed in clEnqueueWriteBuffer II");
@@ -256,7 +273,7 @@ static void find_best_kpc(void) {
         tmpbuffer = malloc(sizeof (crypt_sha512_hash) * num);
         
         if (tmpbuffer == NULL) {
-            printf ("Failed in malloc inside find_best_kpc\n");
+            printf ("Malloc failure in find_best_kpc\n");
             exit (EXIT_FAILURE);
         }
         
@@ -268,15 +285,15 @@ static void find_best_kpc(void) {
         for (i = 0; i < num; i++) {
             set_key("aaabaabaaa", i);
         }
-        HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_FALSE, 0,
-                sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL),
+        HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_FALSE, 0,
+                sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL),
                 "Failed in clEnqueueWriteBuffer I");
-        HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, buffer_in, CL_FALSE, 0, 
+        HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_FALSE, 0, 
                 sizeof (crypt_sha512_password) * num, plaintext, 0, NULL, NULL),
                 "Failed in clEnqueueWriteBuffer II");
         ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 
                 1, NULL, &num, &local_work_size, 0, NULL, &myEvent);
-        HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, buffer_out, CL_FALSE, 0,
+        HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, hash_buffer, CL_FALSE, 0,
                 sizeof (crypt_sha512_hash) * num, tmpbuffer, 0, NULL, NULL),
                 "Failed in clEnqueueReadBuffer");
         HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
@@ -354,15 +371,15 @@ static void init(struct fmt_main *pFmt) {
     if (max_keys_per_crypt)
         create_clobj(max_keys_per_crypt);
 
-    else {
+    else { 
         //user chose to die of boredom
         max_keys_per_crypt = get_task_max_size();
         create_clobj(max_keys_per_crypt);
         find_best_kpc();
     }
-    printf("Local work size (LWS) %d, Keys per crypt (KPC) %Zd\n", 
+    printf("Local work size (LWS) %d, Keys per crypt (KPC) %Zd\n",
            (int) local_work_size, max_keys_per_crypt);   
-    pFmt->params.max_keys_per_crypt = max_keys_per_crypt;
+    pFmt->params.max_keys_per_crypt = max_keys_per_crypt;     
 }
 
 /* ------- Check if the ciphertext if a valid SHA-512 crypt ------- */
@@ -406,13 +423,13 @@ static void *get_salt(char *ciphertext) {
     return (void *) ret;
 }
 
-static void set_salt(void *salt) {
-    unsigned char *s = salt;
-    int len = strlen(salt);
-    static char currentsalt[64];
-    memcpy(currentsalt, s, len + 1);
+static void set_salt(void *salt_info) {    
+    int len = strlen(salt_info);
     unsigned char offset = 0;
-    salt_data.rounds = ROUNDS_DEFAULT;
+    static char currentsalt[64];
+    
+    memcpy(currentsalt, (char *) salt_info, len + 1);
+    salt.rounds = ROUNDS_DEFAULT;
 
     if (strncmp((char *) "$6$", (char *) currentsalt, 3) == 0)
         offset += 3;
@@ -424,13 +441,14 @@ static void set_salt(void *salt) {
 
         if (*endp == '$') {
             endp += 1;
-            salt_data.rounds =
+            salt.rounds =
                     MAX(ROUNDS_MIN, MIN(srounds, ROUNDS_MAX));
         }
         offset = endp - currentsalt;
     }
-    memcpy(salt_data.salt, currentsalt + offset, SALT_SIZE);
-    salt_data.saltlen = strlen((char *) salt_data.salt);
+    memcpy(salt.salt, currentsalt + offset, SALT_SIZE);
+    salt.length = strlen((char *) salt.salt);
+    salt.length = (salt.length > SALT_SIZE ? SALT_SIZE : salt.length);
 }
 
 /* ------- To binary functions ------- */
@@ -496,7 +514,7 @@ static int cmp_all(void *binary, int count) {
     uint64_t b = ((uint64_t *) binary)[0];
 
     for (i = 0; i < count; i++)
-        if (b == out_hashes[i].v[0])
+        if (b == calculated_hash[i].v[0])
             return 1;
     return 0;
 }
@@ -506,7 +524,7 @@ static int cmp_one(void *binary, int index) {
     uint64_t *t = (uint64_t *) binary;
     
     for (i = 0; i < 8; i++) {
-        if (t[i] != out_hashes[index].v[i])
+        if (t[i] != calculated_hash[index].v[i])
             return 0;
     }
     return 1;
@@ -519,11 +537,11 @@ static int cmp_exact(char *source, int count) {
 /* ------- Crypt function ------- */
 static void crypt_all(int count) {
     //Send data to the dispositive
-    HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_FALSE, 0,
-            sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL),
+    HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_FALSE, 0,
+            sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL),
             "failed in clEnqueueWriteBuffer data_info");
     if (new_keys)
-        HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_in, CL_FALSE, 0,
+        HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], pass_buffer, CL_FALSE, 0,
                 sizeof(crypt_sha512_password) * max_keys_per_crypt, plaintext, 0, NULL, NULL),
                 "failed in clEnqueueWriteBuffer buffer_in");
 
@@ -531,10 +549,10 @@ static void crypt_all(int count) {
     HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL,
             &max_keys_per_crypt, &local_work_size, 0, NULL, NULL),
             "failed in clEnqueueNDRangeKernel");
-
+    
     //Read back hashes
-    HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_out, CL_FALSE, 0,
-            sizeof(crypt_sha512_hash) * max_keys_per_crypt, out_hashes, 0, NULL, NULL),
+    HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], hash_buffer, CL_FALSE, 0,
+            sizeof(crypt_sha512_hash) * max_keys_per_crypt, calculated_hash, 0, NULL, NULL),
             "failed in reading data back");
  
     //Do the work
@@ -552,13 +570,13 @@ static int binary_hash_5(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFF
 static int binary_hash_6(void * binary) { return *(ARCH_WORD_32 *) binary & 0x7FFFFFF; }
 
 //Get Hash functions group.
-static int get_hash_0(int index) { return out_hashes[index].v[0] & 0xF; }
-static int get_hash_1(int index) { return out_hashes[index].v[0] & 0xFF; }
-static int get_hash_2(int index) { return out_hashes[index].v[0] & 0xFFF; }
-static int get_hash_3(int index) { return out_hashes[index].v[0] & 0xFFFF; }
-static int get_hash_4(int index) { return out_hashes[index].v[0] & 0xFFFFF; }
-static int get_hash_5(int index) { return out_hashes[index].v[0] & 0xFFFFFF; }
-static int get_hash_6(int index) { return out_hashes[index].v[0] & 0x7FFFFFF; }
+static int get_hash_0(int index) { return calculated_hash[index].v[0] & 0xF; }
+static int get_hash_1(int index) { return calculated_hash[index].v[0] & 0xFF; }
+static int get_hash_2(int index) { return calculated_hash[index].v[0] & 0xFFF; }
+static int get_hash_3(int index) { return calculated_hash[index].v[0] & 0xFFFF; }
+static int get_hash_4(int index) { return calculated_hash[index].v[0] & 0xFFFFF; }
+static int get_hash_5(int index) { return calculated_hash[index].v[0] & 0xFFFFFF; }
+static int get_hash_6(int index) { return calculated_hash[index].v[0] & 0x7FFFFFF; }
 
 /* ------- Format structure ------- */
 struct fmt_main fmt_opencl_cryptsha512 = {
@@ -611,4 +629,4 @@ struct fmt_main fmt_opencl_cryptsha512 = {
         cmp_one,
         cmp_exact
     }
-};
\ No newline at end of file
+};
-- 
1.7.5.4


Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ