>From 1fb1b372eee0a37799cdc5e6511402be4f562267 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Claudio=20Andr=C3=A9?= <claudio.andre@correios.net.br>
Date: Wed, 18 Apr 2012 11:29:13 -0300
Subject: [PATCH] Minor improvements

---
 src/opencl/cryptsha512_kernel.cl |  230 +++++++++++++++++++++++---------------
 src/opencl_cryptsha512.h         |   57 +++++-----
 src/opencl_cryptsha512_fmt.c     |  144 +++++++++++++-----------
 3 files changed, 248 insertions(+), 183 deletions(-)

diff --git a/src/opencl/cryptsha512_kernel.cl b/src/opencl/cryptsha512_kernel.cl
index c3e7ca9..be0ddcd 100644
--- a/src/opencl/cryptsha512_kernel.cl
+++ b/src/opencl/cryptsha512_kernel.cl
@@ -49,23 +49,67 @@ void init_ctx(__local sha512_ctx * ctx) {
     ctx->buflen = 0;
 }
 
-inline void memcpy_08(__local uint8_t * dest, __local const uint8_t * src, const size_t n) {
+inline void memcpy(__local uint8_t       * dest, 
+                   __local const uint8_t * src, const size_t n) {
     for (int i = 0; i < n; i++)
         dest[i] = src[i];
 }
 
-inline void memcpy_64(__local uint8_t * dest, __local buffer_64 * src, const size_t n) {
-    for (int i = 0; i < n; i++)
-        dest[i] = src->mem_08[i];
+inline bool is_not_divisible_by_3(int n) {
+#ifndef SLOW_MODULO
+    return ((n % 3) != 0);
+
+#else
+    int sum;
+
+    do
+    {
+        sum = 0;
+
+        while(n)
+        {
+            sum += n & 3;
+            n = n >> 2;
+        }
+        n = sum;
+    } while(sum > 3);
+
+    //Result to send back.
+    return !(sum == 3 || sum == 0);
+#endif
 }
 
-void insert_to_buffer(__local sha512_ctx * ctx, 
+inline bool is_not_divisible_by_7(int n) {
+#ifndef SLOW_MODULO
+    return ((n % 7) != 0);
+
+#else
+    int sum;
+
+    do
+    {
+        sum = 0;
+
+        while(n)
+        {
+            sum += n & 7;
+            n = n >> 3;
+        }
+        n = sum;
+    } while(sum > 7);
+
+    //Result to send back.
+    return !(sum == 7 || sum == 0);
+#endif
+}
+
+void insert_to_buffer(__local sha512_ctx    * ctx, 
                       __local const uint8_t * string,
                       const uint8_t len) {
     __local uint8_t *d;
     d = ctx->buffer->mem_08 + ctx->buflen;  //ctx->buffer[buflen] (in char size)
 
-    memcpy_08(d, string, len);
+    memcpy(d, string, len);
     ctx->buflen += len;
 }
 
@@ -77,36 +121,29 @@ void sha512_block(__local sha512_ctx * ctx) {
     uint64_t e = ctx->H[4];
     uint64_t f = ctx->H[5];
     uint64_t g = ctx->H[6];
-    uint64_t h = ctx->H[7];
-
+    uint64_t h = ctx->H[7];    
+    uint64_t t1, t2;
     uint64_t w[16];
 
+#ifdef DEVICE_IS_CPU
     #pragma unroll 16
     for (int i = 0; i < 16; i++)
         w[i] = SWAP64(ctx->buffer->mem_64[i]);
-
-    uint64_t t1, t2;
-    #pragma unroll 16
-    for (int i = 0; i < 16; i++) {
-        t1 = k[i] + w[i] + h + Sigma1(e) + Ch(e, f, g);
-        t2 = Maj(a, b, c) + Sigma0(a);
-
-        h = g;
-        g = f;
-        f = e;
-        e = d + t1;
-        d = c;
-        c = b;
-        b = a;
-        a = t1 + t2;
-    }
-
-    #pragma unroll 64
-    for (int i = 16; i < 80; i++) {
-        w[i & 15] = sigma1(w[(i - 2) & 15]) + sigma0(w[(i - 15) & 15]) + w[(i - 16) & 15] + w[(i - 7) & 15];
+#else
+    ulong16  w_vector;
+    w_vector = vload16(0, ctx->buffer->mem_64); 
+    w_vector = SWAP64(w_vector);
+    vstore16(w_vector, 0, w);
+#endif
+
+    #pragma unroll 80
+    for (int i = 0; i < 80; i++) {
+
+        if (i > 15) {
+            w[i & 15] = sigma1(w[(i - 2) & 15]) + sigma0(w[(i - 15) & 15]) + w[(i - 16) & 15] + w[(i - 7) & 15];
+        }
         t1 = k[i] + w[i & 15] + h + Sigma1(e) + Ch(e, f, g);
         t2 = Maj(a, b, c) + Sigma0(a);
-
         h = g;
         g = f;
         f = e;
@@ -128,9 +165,9 @@ void sha512_block(__local sha512_ctx * ctx) {
 }
 
 void ctx_append_1(__local sha512_ctx * ctx) {
-    uint32_t length = ctx->buflen;
-    int i = 127 - length;
-    __local uint8_t *d = ctx->buffer->mem_08 + length;
+    int i = 127 - ctx->buflen;
+    __local uint8_t * d = ctx->buffer->mem_08 + ctx->buflen;
+
     *d++ = 0x80;
 
     while (i--) {
@@ -149,7 +186,8 @@ void finish_ctx(__local sha512_ctx * ctx) {
     ctx->buflen = 0;
 }
 
-void ctx_update(__local sha512_ctx * ctx, __local uint8_t *string, uint8_t len) {
+void ctx_update(__local sha512_ctx * ctx, 
+                __local uint8_t    * string, uint8_t len) {
 
     ctx->total += len;
     uint8_t startpos = ctx->buflen;
@@ -175,20 +213,22 @@ void clear_ctx_buffer(__local sha512_ctx * ctx) {
     ctx->buflen = 0;
 }
 
-void sha512_digest(__local sha512_ctx * ctx, __local uint64_t * result) {
+void sha512_digest(__local  sha512_ctx * ctx, 
+                   __local  uint64_t   * result) {
 
     if (ctx->buflen <= 111) { //data+0x80+datasize fits in one 1024bit block
         finish_ctx(ctx);
 
     } else {
-        uint8_t moved = 1;
+        bool moved = true;
 
         if (ctx->buflen < 128) { //data and 0x80 fits in one block
             ctx_append_1(ctx);
-            moved = 0;
+            moved = false;
         }
         sha512_block(ctx);
         clear_ctx_buffer(ctx);
+
         if (moved)
             ctx->buffer->mem_08[0] = 0x80; //append 1,the rest is already clean
         ctx_add_length(ctx);
@@ -200,72 +240,69 @@ void sha512_digest(__local sha512_ctx * ctx, __local uint64_t * result) {
         result[i] = SWAP64(ctx->H[i]);
 }
 
-void sha512crypt(__local working_memory * tmp_working,
+void sha512crypt(__local  working_memory    * fast_tmp_memory, 
+                 __local  crypt_sha512_salt * salt_data,                  
                  __global crypt_sha512_hash * output) {
 
-#define pass        tmp_working->pass_info.v
-#define passlength  tmp_working->pass_info.length
-#define salt        tmp_working->salt_info.salt
-#define saltlen     tmp_working->salt_info.saltlen
-#define rounds      tmp_working->salt_info.rounds
-#define alt_result  tmp_working->alt_result
-#define temp_result tmp_working->temp_result
-#define s_sequence  tmp_working->s_sequence
-#define p_sequence  tmp_working->p_sequence
-#define ctx         tmp_working->ctx_info
+#define pass        fast_tmp_memory->pass_data.pass
+#define passlen     fast_tmp_memory->pass_data.length
+#define salt        salt_data->salt
+#define saltlen     salt_data->length
+#define rounds      salt_data->rounds
+#define alt_result  fast_tmp_memory->alt_result
+#define temp_result fast_tmp_memory->temp_result
+#define p_sequence  fast_tmp_memory->p_sequence
+#define ctx         fast_tmp_memory->ctx_data
 
     init_ctx(&ctx);
 
-    ctx_update(&ctx, pass, passlength);
+    ctx_update(&ctx, pass, passlen);
     ctx_update(&ctx, salt, saltlen);
-    ctx_update(&ctx, pass, passlength);
+    ctx_update(&ctx, pass, passlen);
 
     sha512_digest(&ctx, alt_result->mem_64);
     init_ctx(&ctx);
 
-    ctx_update(&ctx, pass, passlength);
+    ctx_update(&ctx, pass, passlen);
     ctx_update(&ctx, salt, saltlen);
-    ctx_update(&ctx, alt_result->mem_08, passlength);
+    ctx_update(&ctx, alt_result->mem_08, passlen);
 
-    for (int i = passlength; i > 0; i >>= 1) {
+    for (int i = passlen; i > 0; i >>= 1) {
         ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : pass),
-                         ((i & 1) != 0 ? 64 :                 passlength));
+                         ((i & 1) != 0 ? 64 :                 passlen));
     }
     sha512_digest(&ctx, alt_result->mem_64);
     init_ctx(&ctx);
 
-    for (int i = 0; i < passlength; i++)
-        ctx_update(&ctx, pass, passlength);
-
-    sha512_digest(&ctx, temp_result->mem_64);
-    memcpy_64(p_sequence, temp_result, passlength);
+    for (int i = 0; i < passlen; i++)
+        ctx_update(&ctx, pass, passlen);
 
+    sha512_digest(&ctx, p_sequence->mem_64);
     init_ctx(&ctx);
     
-    /* For every character in the password add the entire password.  */
+    /* For every character in the password add the entire password. */
     for (int i = 0; i < 16 + (alt_result->mem_08)[0]; i++)
         ctx_update(&ctx, salt, saltlen);
 
     /* Finish the digest.  */
     sha512_digest(&ctx, temp_result->mem_64);
-    memcpy_64(s_sequence, temp_result, saltlen);
 
     /* Repeatedly run the collected hash value through SHA512 to
        burn CPU cycles.  */
     for (int i = 0; i < rounds; i++) {
         init_ctx(&ctx);
 
-        ctx_update(&ctx, ((i & 1) != 0 ? p_sequence : alt_result->mem_08),
-                         ((i & 1) != 0 ? passlength : 64)); 
+        ctx_update(&ctx, ((i & 1) != 0 ? p_sequence->mem_08 : alt_result->mem_08),
+                         ((i & 1) != 0 ? passlen : 64)); 
 
-        if ((i % 3) != 0)
-            ctx_update(&ctx, s_sequence, saltlen);
+        if (is_not_divisible_by_3(i))
+            ctx_update(&ctx, temp_result->mem_08, saltlen);
 
-        if ((i % 7) != 0)
-            ctx_update(&ctx, p_sequence, passlength);
+        if (is_not_divisible_by_7(i))
+            ctx_update(&ctx, p_sequence->mem_08, passlen);
 
-        ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : p_sequence),
-                         ((i & 1) != 0 ? 64 :                 passlength));
+        ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : p_sequence->mem_08),
+                          ((i & 1) != 0 ? 64 :                 passlen));
         sha512_digest(&ctx, alt_result->mem_64);
     }
     //Send results to the host.
@@ -276,54 +313,67 @@ void sha512crypt(__local working_memory * tmp_working,
 #undef salt       
 #undef saltlen    
 #undef rounds   
+#undef pass
 
-__kernel void kernel_crypt(__constant crypt_sha512_salt * hsalt,
-                           __global   crypt_sha512_password * inbuffer,
-                           __global   crypt_sha512_hash * outbuffer,
-                           __local    working_memory * tmp_memory) {
+__kernel void kernel_crypt(__constant crypt_sha512_salt     * informed_salt,
+                           __global   crypt_sha512_password * pass_data,
+                           __global   crypt_sha512_hash   * out_buffer,
+                           __local    crypt_sha512_salt   * salt_data,
+                           __local    working_memory      * fast_tmp_memory) {
 
     //Get the task to be done
-    uint32_t gid = get_global_id(0);
-    uint32_t lid = get_local_id(0);
+    size_t gid = get_global_id(0);
+    size_t lid = get_local_id(0);
 
     //Transfer data to faster memory
     //Password information
-    tmp_memory[lid].pass_info.length = inbuffer[gid].length;
+    fast_tmp_memory[lid].pass_data.length = pass_data[gid].length;
 
     #pragma unroll PLAINTEXT_LENGTH
     for (int i = 0; i < PLAINTEXT_LENGTH; i++)
-        tmp_memory[lid].pass_info.v[i] = inbuffer[gid].v[i]; 
+        fast_tmp_memory[lid].pass_data.pass[i] = pass_data[gid].pass[i]; 
  
-    //Salt information. 
-    tmp_memory[lid].salt_info.saltlen = hsalt->saltlen;  
-    tmp_memory[lid].salt_info.rounds = hsalt->rounds;
-
-    #pragma unroll SALT_SIZE
-    for (int i = 0; i < SALT_SIZE; i++)
-	tmp_memory[lid].salt_info.salt[i] = hsalt->salt[i];
+    if (lid == 0){
+        //Copy salt information to fast local memory. Only once in a group.
+        salt_data->length = informed_salt->length;  
+        salt_data->rounds = informed_salt->rounds;
+
+        #pragma unroll SALT_SIZE
+        for (int i = 0; i < SALT_SIZE; i++)
+            salt_data->salt[i] = informed_salt->salt[i];
+    }
 
     //Do the job
-    sha512crypt(&tmp_memory[lid], &outbuffer[gid]);
+    sha512crypt(&fast_tmp_memory[lid], salt_data, &out_buffer[gid]);
 }
 
 /***
 *    To improve performance, it uses __local memory to keep working variables 
 * (password, temp buffers, etc). In SHA 512 it means about 350 bytes per 
-* "thread". It improves performance a lot, but, local memory is a scarce 
+* "thread". It improves performance, but, local memory is a scarce 
 * resource. 
 *    It means the max group size allowed in OpenCL SHA 512 is going to be 
-* 128 (hardware depended).
+* 64 (it depends on hardware local memory size).
 *
 * Gain   Optimizations
 *  --    Basic version, private and global variables only.
 *        Transfer all the working variables to local memory.
-* -10%   Move salt to constant memory space. Keep others in local (saves memory).IGNORED.
+* -10%   Move salt to constant memory space. Keep others in local (saves memory). BAD.
 *  25%   Unrool main loops.
 *   5%   Unrool other loops.
-* -INF   Do the compare task on GPU.
+*  ###   Do the compare task on GPU.
+*   5%   Remove some unecessary code.
+*  ###   Move almost everything to global and local memory. BAD.
+*   1%   Use vector types in SHA_Block in some variables. 
 *
 * Conclusions
 * - Compare on GPU: CPU is more efficient for now.
 * - Salt on constant memory is not good enought.
-* - No register spilling happens after optimization.
-***/
+* - No register spilling happens after optimization. Although, might need to use less registers.
+* - Tried to use "only" local and global memory. Got register spilling again.
+* - Vectorized do not give better performance, but result in less instructions.
+*   In reality, I'm not doing vector operations (doing the same thing in n bytes), 
+*   so should not expect big gains anyway.
+*   If i have a lot of memory, i might solve more than one hash at once 
+*   (and use more vectors). But it is not possible (at least for a while).
+***/
\ No newline at end of file
diff --git a/src/opencl_cryptsha512.h b/src/opencl_cryptsha512.h
index e920c6e..7799c0e 100644
--- a/src/opencl_cryptsha512.h
+++ b/src/opencl_cryptsha512.h
@@ -18,7 +18,7 @@
 #define uint8_t  unsigned char
 #define uint16_t unsigned short
 #define uint32_t unsigned int
-#define uint64_t ulong  //Tip: unsigned long long int failed on compile (AMD).
+#define uint64_t unsigned long  //Tip: unsigned long long int failed on compile (AMD).
 
 //Functions.
 #define MAX(x,y)                ((x) > (y) ? (x) : (y))
@@ -29,7 +29,7 @@
 #define ROUNDS_MAX              999999999
 
 #define SALT_SIZE               16
-#define PLAINTEXT_LENGTH        16     
+#define PLAINTEXT_LENGTH        16
 #define BINARY_SIZE             (3+16+86)       ///TODO: Magic number?
 
 #define KEYS_PER_CORE_CPU       512
@@ -37,8 +37,8 @@
 #define MIN_KEYS_PER_CRYPT	128
 #define MAX_KEYS_PER_CRYPT	2048*2048*128
 
-#define rol(x,n)                rotate(x,n) 
-#define ror(x,n)                rotate(x, (ulong) 64-n)
+#define rol(x,n)                rotate(x, n) 
+#define ror(x,n)                rotate(x, (uint64_t) 64-n)
 #define Ch(x,y,z)               ((x & y) ^ ( (~x) & z))
 #define Maj(x,y,z)              ((x & y) ^ (x & z) ^ (y & z))
 #define Sigma0(x)               ((ror(x,28)) ^ (ror(x,34)) ^ (ror(x,39)))
@@ -46,7 +46,7 @@
 #define sigma0(x)               ((ror(x,1))  ^ (ror(x,8))  ^ (x>>7))
 #define sigma1(x)               ((ror(x,19)) ^ (ror(x,61)) ^ (x>>6))
 
-# define SWAP64(n) \
+#define SWAP64(n) \
   (((n) << 56)					\
    | (((n) & 0xff00) << 40)			\
    | (((n) & 0xff0000) << 24)			\
@@ -58,42 +58,39 @@
 
 //Data types.
 typedef union {
-    uint8_t  mem_08[8];
-    uint16_t mem_16[4];
-    uint32_t mem_32[2];
-    uint64_t mem_64[1];
+    uint8_t             mem_08[8];
+    uint16_t            mem_16[4];
+    uint32_t            mem_32[2];
+    uint64_t            mem_64[1];
 } buffer_64;
 
 typedef struct {
-	uint64_t  H[8];          //512 bits
-	uint32_t  total;
-	uint32_t  buflen;
-	buffer_64 buffer[16];	//1024bits
-} sha512_ctx;
-
-typedef struct {
-	uint32_t rounds;
-	uint8_t  saltlen;
-	uint8_t  salt[SALT_SIZE];
+	uint32_t        rounds;
+	uint32_t        length;
+	uint8_t         salt[SALT_SIZE];
 } crypt_sha512_salt;
 
 typedef struct {
-	uint8_t length;
-	uint8_t v[PLAINTEXT_LENGTH];
+	uint32_t        length;
+	uint8_t         pass[PLAINTEXT_LENGTH];
 } crypt_sha512_password;
 
 typedef struct {
-	uint64_t v[8];		//512 bits
+	uint64_t        v[8];		//512 bits
 } crypt_sha512_hash;
 
 typedef struct {
-        crypt_sha512_password  pass_info;
-        crypt_sha512_salt      salt_info;
-        sha512_ctx             ctx_info;
-        buffer_64              alt_result[8];
-        buffer_64              temp_result[8];
-        uint8_t                s_sequence[SALT_SIZE];
-        uint8_t                p_sequence[PLAINTEXT_LENGTH];
-} working_memory;
+	uint64_t        H[8];           //512 bits
+	uint32_t        total;
+	uint32_t        buflen;
+	buffer_64       buffer[16];	//1024bits  
+} sha512_ctx;
 
+typedef struct {
+        sha512_ctx              ctx_data;
+        crypt_sha512_password   pass_data;
+        buffer_64               alt_result[8];
+        buffer_64               temp_result[8];
+        buffer_64               p_sequence[8];
+} working_memory;
 #endif
\ No newline at end of file
diff --git a/src/opencl_cryptsha512_fmt.c b/src/opencl_cryptsha512_fmt.c
index a26c6ea..125263b 100644
--- a/src/opencl_cryptsha512_fmt.c
+++ b/src/opencl_cryptsha512_fmt.c
@@ -26,13 +26,13 @@
 #define LWS_CONFIG			"cryptsha512_LWS"
 #define KPC_CONFIG			"cryptsha512_KPC"
 
-static crypt_sha512_password            *plaintext;     // plaintext ciphertexts
-static crypt_sha512_hash                *out_hashes;    // calculated hashes
-static crypt_sha512_salt                salt_data;
+static crypt_sha512_salt                salt;
+static crypt_sha512_password            *plaintext;        // plaintext ciphertexts
+static crypt_sha512_hash                *calculated_hash;  // calculated hashes
 
-cl_mem salt_info;       //Salt information.
-cl_mem buffer_in;       //Plaintext buffer.
-cl_mem buffer_out;      //Hash keys (output)
+cl_mem salt_buffer;        //Salt information.
+cl_mem pass_buffer;        //Plaintext buffer.
+cl_mem hash_buffer;        //Hash keys (output)
 cl_mem pinned_saved_keys, pinned_partial_hashes;
 
 cl_command_queue queue_prof;
@@ -50,13 +50,26 @@ static struct fmt_tests tests[] = {
     {NULL}
 }; 
 
+size_t get_current_work_group_size(int dev_id)
+{
+        size_t max_group_size;
+
+        HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[dev_id],
+                       CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_group_size), 
+                       &max_group_size, NULL),
+                       "Error querying clGetKernelWorkGroupInfo");
+
+        return max_group_size;
+}
+
 /* ------- Helper functions ------- */
 uint get_task_max_work_group_size(){
     uint max_available;
-    max_available = get_local_memory_size(gpu_id) / sizeof(working_memory);
-    
-    if (max_available > get_max_work_group_size(gpu_id))
-        return get_max_work_group_size(gpu_id);
+    max_available = get_local_memory_size(gpu_id) /
+            (sizeof(working_memory) + sizeof(crypt_sha512_salt));
+
+    if (max_available > get_current_work_group_size(gpu_id))
+        return get_current_work_group_size(gpu_id);
     
     return max_available;
 }
@@ -88,38 +101,42 @@ static void create_clobj(int kpc) {
             sizeof(crypt_sha512_hash) * kpc, NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating page-locked memory pinned_partial_hashes");
 
-    out_hashes = (crypt_sha512_hash *) clEnqueueMapBuffer(queue[gpu_id],
+    calculated_hash = (crypt_sha512_hash *) clEnqueueMapBuffer(queue[gpu_id],
             pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, 
             sizeof(crypt_sha512_hash) * kpc, 0, NULL, NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error mapping page-locked memory out_hashes");
 
     // create arguments (buffers)
-    salt_info = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, 
+    salt_buffer = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, 
             sizeof(crypt_sha512_salt), NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating data_info out argument");
      
-    buffer_in = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY,
+    pass_buffer = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY,
             sizeof(crypt_sha512_password) * kpc, NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_keys");
 
-    buffer_out = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY,
+    hash_buffer = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY,
             sizeof(crypt_sha512_hash) * kpc, NULL, &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_out");
 
     //Set kernel arguments
     HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 0, sizeof (cl_mem),
-            (void *) &salt_info), "Error setting argument 0");
+            (void *) &salt_buffer), "Error setting argument 0");
     HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof (cl_mem),
-            (void *) &buffer_in), "Error setting argument 1");
+            (void *) &pass_buffer), "Error setting argument 1");
     HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof (cl_mem),
-            (void *) &buffer_out), "Error setting argument 2");     
+            (void *) &hash_buffer), "Error setting argument 2");
     HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 3,   //Fast working memory.
+            sizeof (crypt_sha512_salt),
+            NULL), "Error setting argument 3");
+    HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 4,   //Fast working memory.
             sizeof (working_memory) * local_work_size,
-            NULL), "Error setting argument 3");   
- 
+            NULL), "Error setting argument 4");
+        
     memset(plaintext, '\0', sizeof(crypt_sha512_password) * kpc);
-    salt_data.saltlen = 0;
-    salt_data.rounds = 0;
+    memset(salt.salt, '\0', SALT_SIZE);
+    salt.length = 0;
+    salt.rounds = 0;
     max_keys_per_crypt = kpc;
 }
 
@@ -127,18 +144,18 @@ static void release_clobj(void) {
     cl_int ret_code;
 
     ret_code = clEnqueueUnmapMemObject(queue[gpu_id], pinned_partial_hashes,
-            out_hashes, 0, NULL, NULL);
+            calculated_hash, 0, NULL, NULL);
     HANDLE_CLERROR(ret_code, "Error Ummapping out_hashes");
     
     ret_code = clEnqueueUnmapMemObject(queue[gpu_id], pinned_saved_keys,
             plaintext, 0, NULL, NULL);
     HANDLE_CLERROR(ret_code, "Error Ummapping saved_plain");
     
-    ret_code = clReleaseMemObject(salt_info);
+    ret_code = clReleaseMemObject(salt_buffer);
     HANDLE_CLERROR(ret_code, "Error Releasing data_info");
-    ret_code = clReleaseMemObject(buffer_in);
+    ret_code = clReleaseMemObject(pass_buffer);
     HANDLE_CLERROR(ret_code, "Error Releasing buffer_keys");
-    ret_code = clReleaseMemObject(buffer_out);
+    ret_code = clReleaseMemObject(hash_buffer);
     HANDLE_CLERROR(ret_code, "Error Releasing buffer_out");
     
     ret_code = clReleaseMemObject(pinned_saved_keys);
@@ -152,13 +169,13 @@ static void release_clobj(void) {
 static void set_key(char *key, int index) {
     int len = strlen(key);
     plaintext[index].length = len;
-    memcpy(plaintext[index].v, key, len);
+    memcpy(plaintext[index].pass, key, len);
     new_keys = 1;
 }
 
 static char *get_key(int index) {
     static char ret[PLAINTEXT_LENGTH + 1];
-    memcpy(ret, plaintext[index].v, PLAINTEXT_LENGTH);
+    memcpy(ret, plaintext[index].pass, PLAINTEXT_LENGTH);
     ret[plaintext[index].length] = '\0';
     return ret;
 }
@@ -192,10 +209,10 @@ static void find_best_workgroup(void) {
     for (i = 0; i < get_task_max_size(); i++) {
         set_key("aaabaabaaa", i);
     }
-    HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_TRUE, 0,
-            sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL),
+    HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_TRUE, 0,
+            sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL),
             "Failed in clEnqueueWriteBuffer I");
-    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, buffer_in, CL_TRUE, 0, 
+    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_TRUE, 0, 
             sizeof (crypt_sha512_password) * get_task_max_size(), 
             plaintext, 0, NULL, NULL),
             "Failed in clEnqueueWriteBuffer II");
@@ -256,7 +273,7 @@ static void find_best_kpc(void) {
         tmpbuffer = malloc(sizeof (crypt_sha512_hash) * num);
         
         if (tmpbuffer == NULL) {
-            printf ("Failed in malloc inside find_best_kpc\n");
+            printf ("Malloc failure in find_best_kpc\n");
             exit (EXIT_FAILURE);
         }
         
@@ -268,15 +285,15 @@ static void find_best_kpc(void) {
         for (i = 0; i < num; i++) {
             set_key("aaabaabaaa", i);
         }
-        HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_FALSE, 0,
-                sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL),
+        HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_FALSE, 0,
+                sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL),
                 "Failed in clEnqueueWriteBuffer I");
-        HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, buffer_in, CL_FALSE, 0, 
+        HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_FALSE, 0, 
                 sizeof (crypt_sha512_password) * num, plaintext, 0, NULL, NULL),
                 "Failed in clEnqueueWriteBuffer II");
         ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 
                 1, NULL, &num, &local_work_size, 0, NULL, &myEvent);
-        HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, buffer_out, CL_FALSE, 0,
+        HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, hash_buffer, CL_FALSE, 0,
                 sizeof (crypt_sha512_hash) * num, tmpbuffer, 0, NULL, NULL),
                 "Failed in clEnqueueReadBuffer");
         HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
@@ -354,15 +371,15 @@ static void init(struct fmt_main *pFmt) {
     if (max_keys_per_crypt)
         create_clobj(max_keys_per_crypt);
 
-    else {
+    else { 
         //user chose to die of boredom
         max_keys_per_crypt = get_task_max_size();
         create_clobj(max_keys_per_crypt);
         find_best_kpc();
     }
-    printf("Local work size (LWS) %d, Keys per crypt (KPC) %Zd\n", 
+    printf("Local work size (LWS) %d, Keys per crypt (KPC) %Zd\n",
            (int) local_work_size, max_keys_per_crypt);   
-    pFmt->params.max_keys_per_crypt = max_keys_per_crypt;
+    pFmt->params.max_keys_per_crypt = max_keys_per_crypt;     
 }
 
 /* ------- Check if the ciphertext if a valid SHA-512 crypt ------- */
@@ -406,13 +423,13 @@ static void *get_salt(char *ciphertext) {
     return (void *) ret;
 }
 
-static void set_salt(void *salt) {
-    unsigned char *s = salt;
-    int len = strlen(salt);
-    static char currentsalt[64];
-    memcpy(currentsalt, s, len + 1);
+static void set_salt(void *salt_info) {    
+    int len = strlen(salt_info);
     unsigned char offset = 0;
-    salt_data.rounds = ROUNDS_DEFAULT;
+    static char currentsalt[64];
+    
+    memcpy(currentsalt, (char *) salt_info, len + 1);
+    salt.rounds = ROUNDS_DEFAULT;
 
     if (strncmp((char *) "$6$", (char *) currentsalt, 3) == 0)
         offset += 3;
@@ -424,13 +441,14 @@ static void set_salt(void *salt) {
 
         if (*endp == '$') {
             endp += 1;
-            salt_data.rounds =
+            salt.rounds =
                     MAX(ROUNDS_MIN, MIN(srounds, ROUNDS_MAX));
         }
         offset = endp - currentsalt;
     }
-    memcpy(salt_data.salt, currentsalt + offset, SALT_SIZE);
-    salt_data.saltlen = strlen((char *) salt_data.salt);
+    memcpy(salt.salt, currentsalt + offset, SALT_SIZE);
+    salt.length = strlen((char *) salt.salt);
+    salt.length = (salt.length > SALT_SIZE ? SALT_SIZE : salt.length);
 }
 
 /* ------- To binary functions ------- */
@@ -496,7 +514,7 @@ static int cmp_all(void *binary, int count) {
     uint64_t b = ((uint64_t *) binary)[0];
 
     for (i = 0; i < count; i++)
-        if (b == out_hashes[i].v[0])
+        if (b == calculated_hash[i].v[0])
             return 1;
     return 0;
 }
@@ -506,7 +524,7 @@ static int cmp_one(void *binary, int index) {
     uint64_t *t = (uint64_t *) binary;
     
     for (i = 0; i < 8; i++) {
-        if (t[i] != out_hashes[index].v[i])
+        if (t[i] != calculated_hash[index].v[i])
             return 0;
     }
     return 1;
@@ -519,11 +537,11 @@ static int cmp_exact(char *source, int count) {
 /* ------- Crypt function ------- */
 static void crypt_all(int count) {
     //Send data to the dispositive
-    HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_FALSE, 0,
-            sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL),
+    HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_FALSE, 0,
+            sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL),
             "failed in clEnqueueWriteBuffer data_info");
     if (new_keys)
-        HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_in, CL_FALSE, 0,
+        HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], pass_buffer, CL_FALSE, 0,
                 sizeof(crypt_sha512_password) * max_keys_per_crypt, plaintext, 0, NULL, NULL),
                 "failed in clEnqueueWriteBuffer buffer_in");
 
@@ -531,10 +549,10 @@ static void crypt_all(int count) {
     HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL,
             &max_keys_per_crypt, &local_work_size, 0, NULL, NULL),
             "failed in clEnqueueNDRangeKernel");
-
+    
     //Read back hashes
-    HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_out, CL_FALSE, 0,
-            sizeof(crypt_sha512_hash) * max_keys_per_crypt, out_hashes, 0, NULL, NULL),
+    HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], hash_buffer, CL_FALSE, 0,
+            sizeof(crypt_sha512_hash) * max_keys_per_crypt, calculated_hash, 0, NULL, NULL),
             "failed in reading data back");
  
     //Do the work
@@ -552,13 +570,13 @@ static int binary_hash_5(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFF
 static int binary_hash_6(void * binary) { return *(ARCH_WORD_32 *) binary & 0x7FFFFFF; }
 
 //Get Hash functions group.
-static int get_hash_0(int index) { return out_hashes[index].v[0] & 0xF; }
-static int get_hash_1(int index) { return out_hashes[index].v[0] & 0xFF; }
-static int get_hash_2(int index) { return out_hashes[index].v[0] & 0xFFF; }
-static int get_hash_3(int index) { return out_hashes[index].v[0] & 0xFFFF; }
-static int get_hash_4(int index) { return out_hashes[index].v[0] & 0xFFFFF; }
-static int get_hash_5(int index) { return out_hashes[index].v[0] & 0xFFFFFF; }
-static int get_hash_6(int index) { return out_hashes[index].v[0] & 0x7FFFFFF; }
+static int get_hash_0(int index) { return calculated_hash[index].v[0] & 0xF; }
+static int get_hash_1(int index) { return calculated_hash[index].v[0] & 0xFF; }
+static int get_hash_2(int index) { return calculated_hash[index].v[0] & 0xFFF; }
+static int get_hash_3(int index) { return calculated_hash[index].v[0] & 0xFFFF; }
+static int get_hash_4(int index) { return calculated_hash[index].v[0] & 0xFFFFF; }
+static int get_hash_5(int index) { return calculated_hash[index].v[0] & 0xFFFFFF; }
+static int get_hash_6(int index) { return calculated_hash[index].v[0] & 0x7FFFFFF; }
 
 /* ------- Format structure ------- */
 struct fmt_main fmt_opencl_cryptsha512 = {
@@ -611,4 +629,4 @@ struct fmt_main fmt_opencl_cryptsha512 = {
         cmp_one,
         cmp_exact
     }
-};
\ No newline at end of file
+};
-- 
1.7.5.4