>From 1fb1b372eee0a37799cdc5e6511402be4f562267 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Claudio=20Andr=C3=A9?= Date: Wed, 18 Apr 2012 11:29:13 -0300 Subject: [PATCH] Minor improvements --- src/opencl/cryptsha512_kernel.cl | 230 +++++++++++++++++++++++--------------- src/opencl_cryptsha512.h | 57 +++++----- src/opencl_cryptsha512_fmt.c | 144 +++++++++++++----------- 3 files changed, 248 insertions(+), 183 deletions(-) diff --git a/src/opencl/cryptsha512_kernel.cl b/src/opencl/cryptsha512_kernel.cl index c3e7ca9..be0ddcd 100644 --- a/src/opencl/cryptsha512_kernel.cl +++ b/src/opencl/cryptsha512_kernel.cl @@ -49,23 +49,67 @@ void init_ctx(__local sha512_ctx * ctx) { ctx->buflen = 0; } -inline void memcpy_08(__local uint8_t * dest, __local const uint8_t * src, const size_t n) { +inline void memcpy(__local uint8_t * dest, + __local const uint8_t * src, const size_t n) { for (int i = 0; i < n; i++) dest[i] = src[i]; } -inline void memcpy_64(__local uint8_t * dest, __local buffer_64 * src, const size_t n) { - for (int i = 0; i < n; i++) - dest[i] = src->mem_08[i]; +inline bool is_not_divisible_by_3(int n) { +#ifndef SLOW_MODULO + return ((n % 3) != 0); + +#else + int sum; + + do + { + sum = 0; + + while(n) + { + sum += n & 3; + n = n >> 2; + } + n = sum; + } while(sum > 3); + + //Result to send back. + return !(sum == 3 || sum == 0); +#endif } -void insert_to_buffer(__local sha512_ctx * ctx, +inline bool is_not_divisible_by_7(int n) { +#ifndef SLOW_MODULO + return ((n % 7) != 0); + +#else + int sum; + + do + { + sum = 0; + + while(n) + { + sum += n & 7; + n = n >> 3; + } + n = sum; + } while(sum > 7); + + //Result to send back. + return !(sum == 7 || sum == 0); +#endif +} + +void insert_to_buffer(__local sha512_ctx * ctx, __local const uint8_t * string, const uint8_t len) { __local uint8_t *d; d = ctx->buffer->mem_08 + ctx->buflen; //ctx->buffer[buflen] (in char size) - memcpy_08(d, string, len); + memcpy(d, string, len); ctx->buflen += len; } @@ -77,36 +121,29 @@ void sha512_block(__local sha512_ctx * ctx) { uint64_t e = ctx->H[4]; uint64_t f = ctx->H[5]; uint64_t g = ctx->H[6]; - uint64_t h = ctx->H[7]; - + uint64_t h = ctx->H[7]; + uint64_t t1, t2; uint64_t w[16]; +#ifdef DEVICE_IS_CPU #pragma unroll 16 for (int i = 0; i < 16; i++) w[i] = SWAP64(ctx->buffer->mem_64[i]); - - uint64_t t1, t2; - #pragma unroll 16 - for (int i = 0; i < 16; i++) { - t1 = k[i] + w[i] + h + Sigma1(e) + Ch(e, f, g); - t2 = Maj(a, b, c) + Sigma0(a); - - h = g; - g = f; - f = e; - e = d + t1; - d = c; - c = b; - b = a; - a = t1 + t2; - } - - #pragma unroll 64 - for (int i = 16; i < 80; i++) { - w[i & 15] = sigma1(w[(i - 2) & 15]) + sigma0(w[(i - 15) & 15]) + w[(i - 16) & 15] + w[(i - 7) & 15]; +#else + ulong16 w_vector; + w_vector = vload16(0, ctx->buffer->mem_64); + w_vector = SWAP64(w_vector); + vstore16(w_vector, 0, w); +#endif + + #pragma unroll 80 + for (int i = 0; i < 80; i++) { + + if (i > 15) { + w[i & 15] = sigma1(w[(i - 2) & 15]) + sigma0(w[(i - 15) & 15]) + w[(i - 16) & 15] + w[(i - 7) & 15]; + } t1 = k[i] + w[i & 15] + h + Sigma1(e) + Ch(e, f, g); t2 = Maj(a, b, c) + Sigma0(a); - h = g; g = f; f = e; @@ -128,9 +165,9 @@ void sha512_block(__local sha512_ctx * ctx) { } void ctx_append_1(__local sha512_ctx * ctx) { - uint32_t length = ctx->buflen; - int i = 127 - length; - __local uint8_t *d = ctx->buffer->mem_08 + length; + int i = 127 - ctx->buflen; + __local uint8_t * d = ctx->buffer->mem_08 + ctx->buflen; + *d++ = 0x80; while (i--) { @@ -149,7 +186,8 @@ void finish_ctx(__local sha512_ctx * ctx) { ctx->buflen = 0; } -void ctx_update(__local sha512_ctx * ctx, __local uint8_t *string, uint8_t len) { +void ctx_update(__local sha512_ctx * ctx, + __local uint8_t * string, uint8_t len) { ctx->total += len; uint8_t startpos = ctx->buflen; @@ -175,20 +213,22 @@ void clear_ctx_buffer(__local sha512_ctx * ctx) { ctx->buflen = 0; } -void sha512_digest(__local sha512_ctx * ctx, __local uint64_t * result) { +void sha512_digest(__local sha512_ctx * ctx, + __local uint64_t * result) { if (ctx->buflen <= 111) { //data+0x80+datasize fits in one 1024bit block finish_ctx(ctx); } else { - uint8_t moved = 1; + bool moved = true; if (ctx->buflen < 128) { //data and 0x80 fits in one block ctx_append_1(ctx); - moved = 0; + moved = false; } sha512_block(ctx); clear_ctx_buffer(ctx); + if (moved) ctx->buffer->mem_08[0] = 0x80; //append 1,the rest is already clean ctx_add_length(ctx); @@ -200,72 +240,69 @@ void sha512_digest(__local sha512_ctx * ctx, __local uint64_t * result) { result[i] = SWAP64(ctx->H[i]); } -void sha512crypt(__local working_memory * tmp_working, +void sha512crypt(__local working_memory * fast_tmp_memory, + __local crypt_sha512_salt * salt_data, __global crypt_sha512_hash * output) { -#define pass tmp_working->pass_info.v -#define passlength tmp_working->pass_info.length -#define salt tmp_working->salt_info.salt -#define saltlen tmp_working->salt_info.saltlen -#define rounds tmp_working->salt_info.rounds -#define alt_result tmp_working->alt_result -#define temp_result tmp_working->temp_result -#define s_sequence tmp_working->s_sequence -#define p_sequence tmp_working->p_sequence -#define ctx tmp_working->ctx_info +#define pass fast_tmp_memory->pass_data.pass +#define passlen fast_tmp_memory->pass_data.length +#define salt salt_data->salt +#define saltlen salt_data->length +#define rounds salt_data->rounds +#define alt_result fast_tmp_memory->alt_result +#define temp_result fast_tmp_memory->temp_result +#define p_sequence fast_tmp_memory->p_sequence +#define ctx fast_tmp_memory->ctx_data init_ctx(&ctx); - ctx_update(&ctx, pass, passlength); + ctx_update(&ctx, pass, passlen); ctx_update(&ctx, salt, saltlen); - ctx_update(&ctx, pass, passlength); + ctx_update(&ctx, pass, passlen); sha512_digest(&ctx, alt_result->mem_64); init_ctx(&ctx); - ctx_update(&ctx, pass, passlength); + ctx_update(&ctx, pass, passlen); ctx_update(&ctx, salt, saltlen); - ctx_update(&ctx, alt_result->mem_08, passlength); + ctx_update(&ctx, alt_result->mem_08, passlen); - for (int i = passlength; i > 0; i >>= 1) { + for (int i = passlen; i > 0; i >>= 1) { ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : pass), - ((i & 1) != 0 ? 64 : passlength)); + ((i & 1) != 0 ? 64 : passlen)); } sha512_digest(&ctx, alt_result->mem_64); init_ctx(&ctx); - for (int i = 0; i < passlength; i++) - ctx_update(&ctx, pass, passlength); - - sha512_digest(&ctx, temp_result->mem_64); - memcpy_64(p_sequence, temp_result, passlength); + for (int i = 0; i < passlen; i++) + ctx_update(&ctx, pass, passlen); + sha512_digest(&ctx, p_sequence->mem_64); init_ctx(&ctx); - /* For every character in the password add the entire password. */ + /* For every character in the password add the entire password. */ for (int i = 0; i < 16 + (alt_result->mem_08)[0]; i++) ctx_update(&ctx, salt, saltlen); /* Finish the digest. */ sha512_digest(&ctx, temp_result->mem_64); - memcpy_64(s_sequence, temp_result, saltlen); /* Repeatedly run the collected hash value through SHA512 to burn CPU cycles. */ for (int i = 0; i < rounds; i++) { init_ctx(&ctx); - ctx_update(&ctx, ((i & 1) != 0 ? p_sequence : alt_result->mem_08), - ((i & 1) != 0 ? passlength : 64)); + ctx_update(&ctx, ((i & 1) != 0 ? p_sequence->mem_08 : alt_result->mem_08), + ((i & 1) != 0 ? passlen : 64)); - if ((i % 3) != 0) - ctx_update(&ctx, s_sequence, saltlen); + if (is_not_divisible_by_3(i)) + ctx_update(&ctx, temp_result->mem_08, saltlen); - if ((i % 7) != 0) - ctx_update(&ctx, p_sequence, passlength); + if (is_not_divisible_by_7(i)) + ctx_update(&ctx, p_sequence->mem_08, passlen); - ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : p_sequence), - ((i & 1) != 0 ? 64 : passlength)); + ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : p_sequence->mem_08), + ((i & 1) != 0 ? 64 : passlen)); sha512_digest(&ctx, alt_result->mem_64); } //Send results to the host. @@ -276,54 +313,67 @@ void sha512crypt(__local working_memory * tmp_working, #undef salt #undef saltlen #undef rounds +#undef pass -__kernel void kernel_crypt(__constant crypt_sha512_salt * hsalt, - __global crypt_sha512_password * inbuffer, - __global crypt_sha512_hash * outbuffer, - __local working_memory * tmp_memory) { +__kernel void kernel_crypt(__constant crypt_sha512_salt * informed_salt, + __global crypt_sha512_password * pass_data, + __global crypt_sha512_hash * out_buffer, + __local crypt_sha512_salt * salt_data, + __local working_memory * fast_tmp_memory) { //Get the task to be done - uint32_t gid = get_global_id(0); - uint32_t lid = get_local_id(0); + size_t gid = get_global_id(0); + size_t lid = get_local_id(0); //Transfer data to faster memory //Password information - tmp_memory[lid].pass_info.length = inbuffer[gid].length; + fast_tmp_memory[lid].pass_data.length = pass_data[gid].length; #pragma unroll PLAINTEXT_LENGTH for (int i = 0; i < PLAINTEXT_LENGTH; i++) - tmp_memory[lid].pass_info.v[i] = inbuffer[gid].v[i]; + fast_tmp_memory[lid].pass_data.pass[i] = pass_data[gid].pass[i]; - //Salt information. - tmp_memory[lid].salt_info.saltlen = hsalt->saltlen; - tmp_memory[lid].salt_info.rounds = hsalt->rounds; - - #pragma unroll SALT_SIZE - for (int i = 0; i < SALT_SIZE; i++) - tmp_memory[lid].salt_info.salt[i] = hsalt->salt[i]; + if (lid == 0){ + //Copy salt information to fast local memory. Only once in a group. + salt_data->length = informed_salt->length; + salt_data->rounds = informed_salt->rounds; + + #pragma unroll SALT_SIZE + for (int i = 0; i < SALT_SIZE; i++) + salt_data->salt[i] = informed_salt->salt[i]; + } //Do the job - sha512crypt(&tmp_memory[lid], &outbuffer[gid]); + sha512crypt(&fast_tmp_memory[lid], salt_data, &out_buffer[gid]); } /*** * To improve performance, it uses __local memory to keep working variables * (password, temp buffers, etc). In SHA 512 it means about 350 bytes per -* "thread". It improves performance a lot, but, local memory is a scarce +* "thread". It improves performance, but, local memory is a scarce * resource. * It means the max group size allowed in OpenCL SHA 512 is going to be -* 128 (hardware depended). +* 64 (it depends on hardware local memory size). * * Gain Optimizations * -- Basic version, private and global variables only. * Transfer all the working variables to local memory. -* -10% Move salt to constant memory space. Keep others in local (saves memory).IGNORED. +* -10% Move salt to constant memory space. Keep others in local (saves memory). BAD. * 25% Unrool main loops. * 5% Unrool other loops. -* -INF Do the compare task on GPU. +* ### Do the compare task on GPU. +* 5% Remove some unecessary code. +* ### Move almost everything to global and local memory. BAD. +* 1% Use vector types in SHA_Block in some variables. * * Conclusions * - Compare on GPU: CPU is more efficient for now. * - Salt on constant memory is not good enought. -* - No register spilling happens after optimization. -***/ +* - No register spilling happens after optimization. Although, might need to use less registers. +* - Tried to use "only" local and global memory. Got register spilling again. +* - Vectorized do not give better performance, but result in less instructions. +* In reality, I'm not doing vector operations (doing the same thing in n bytes), +* so should not expect big gains anyway. +* If i have a lot of memory, i might solve more than one hash at once +* (and use more vectors). But it is not possible (at least for a while). +***/ \ No newline at end of file diff --git a/src/opencl_cryptsha512.h b/src/opencl_cryptsha512.h index e920c6e..7799c0e 100644 --- a/src/opencl_cryptsha512.h +++ b/src/opencl_cryptsha512.h @@ -18,7 +18,7 @@ #define uint8_t unsigned char #define uint16_t unsigned short #define uint32_t unsigned int -#define uint64_t ulong //Tip: unsigned long long int failed on compile (AMD). +#define uint64_t unsigned long //Tip: unsigned long long int failed on compile (AMD). //Functions. #define MAX(x,y) ((x) > (y) ? (x) : (y)) @@ -29,7 +29,7 @@ #define ROUNDS_MAX 999999999 #define SALT_SIZE 16 -#define PLAINTEXT_LENGTH 16 +#define PLAINTEXT_LENGTH 16 #define BINARY_SIZE (3+16+86) ///TODO: Magic number? #define KEYS_PER_CORE_CPU 512 @@ -37,8 +37,8 @@ #define MIN_KEYS_PER_CRYPT 128 #define MAX_KEYS_PER_CRYPT 2048*2048*128 -#define rol(x,n) rotate(x,n) -#define ror(x,n) rotate(x, (ulong) 64-n) +#define rol(x,n) rotate(x, n) +#define ror(x,n) rotate(x, (uint64_t) 64-n) #define Ch(x,y,z) ((x & y) ^ ( (~x) & z)) #define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z)) #define Sigma0(x) ((ror(x,28)) ^ (ror(x,34)) ^ (ror(x,39))) @@ -46,7 +46,7 @@ #define sigma0(x) ((ror(x,1)) ^ (ror(x,8)) ^ (x>>7)) #define sigma1(x) ((ror(x,19)) ^ (ror(x,61)) ^ (x>>6)) -# define SWAP64(n) \ +#define SWAP64(n) \ (((n) << 56) \ | (((n) & 0xff00) << 40) \ | (((n) & 0xff0000) << 24) \ @@ -58,42 +58,39 @@ //Data types. typedef union { - uint8_t mem_08[8]; - uint16_t mem_16[4]; - uint32_t mem_32[2]; - uint64_t mem_64[1]; + uint8_t mem_08[8]; + uint16_t mem_16[4]; + uint32_t mem_32[2]; + uint64_t mem_64[1]; } buffer_64; typedef struct { - uint64_t H[8]; //512 bits - uint32_t total; - uint32_t buflen; - buffer_64 buffer[16]; //1024bits -} sha512_ctx; - -typedef struct { - uint32_t rounds; - uint8_t saltlen; - uint8_t salt[SALT_SIZE]; + uint32_t rounds; + uint32_t length; + uint8_t salt[SALT_SIZE]; } crypt_sha512_salt; typedef struct { - uint8_t length; - uint8_t v[PLAINTEXT_LENGTH]; + uint32_t length; + uint8_t pass[PLAINTEXT_LENGTH]; } crypt_sha512_password; typedef struct { - uint64_t v[8]; //512 bits + uint64_t v[8]; //512 bits } crypt_sha512_hash; typedef struct { - crypt_sha512_password pass_info; - crypt_sha512_salt salt_info; - sha512_ctx ctx_info; - buffer_64 alt_result[8]; - buffer_64 temp_result[8]; - uint8_t s_sequence[SALT_SIZE]; - uint8_t p_sequence[PLAINTEXT_LENGTH]; -} working_memory; + uint64_t H[8]; //512 bits + uint32_t total; + uint32_t buflen; + buffer_64 buffer[16]; //1024bits +} sha512_ctx; +typedef struct { + sha512_ctx ctx_data; + crypt_sha512_password pass_data; + buffer_64 alt_result[8]; + buffer_64 temp_result[8]; + buffer_64 p_sequence[8]; +} working_memory; #endif \ No newline at end of file diff --git a/src/opencl_cryptsha512_fmt.c b/src/opencl_cryptsha512_fmt.c index a26c6ea..125263b 100644 --- a/src/opencl_cryptsha512_fmt.c +++ b/src/opencl_cryptsha512_fmt.c @@ -26,13 +26,13 @@ #define LWS_CONFIG "cryptsha512_LWS" #define KPC_CONFIG "cryptsha512_KPC" -static crypt_sha512_password *plaintext; // plaintext ciphertexts -static crypt_sha512_hash *out_hashes; // calculated hashes -static crypt_sha512_salt salt_data; +static crypt_sha512_salt salt; +static crypt_sha512_password *plaintext; // plaintext ciphertexts +static crypt_sha512_hash *calculated_hash; // calculated hashes -cl_mem salt_info; //Salt information. -cl_mem buffer_in; //Plaintext buffer. -cl_mem buffer_out; //Hash keys (output) +cl_mem salt_buffer; //Salt information. +cl_mem pass_buffer; //Plaintext buffer. +cl_mem hash_buffer; //Hash keys (output) cl_mem pinned_saved_keys, pinned_partial_hashes; cl_command_queue queue_prof; @@ -50,13 +50,26 @@ static struct fmt_tests tests[] = { {NULL} }; +size_t get_current_work_group_size(int dev_id) +{ + size_t max_group_size; + + HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[dev_id], + CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_group_size), + &max_group_size, NULL), + "Error querying clGetKernelWorkGroupInfo"); + + return max_group_size; +} + /* ------- Helper functions ------- */ uint get_task_max_work_group_size(){ uint max_available; - max_available = get_local_memory_size(gpu_id) / sizeof(working_memory); - - if (max_available > get_max_work_group_size(gpu_id)) - return get_max_work_group_size(gpu_id); + max_available = get_local_memory_size(gpu_id) / + (sizeof(working_memory) + sizeof(crypt_sha512_salt)); + + if (max_available > get_current_work_group_size(gpu_id)) + return get_current_work_group_size(gpu_id); return max_available; } @@ -88,38 +101,42 @@ static void create_clobj(int kpc) { sizeof(crypt_sha512_hash) * kpc, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating page-locked memory pinned_partial_hashes"); - out_hashes = (crypt_sha512_hash *) clEnqueueMapBuffer(queue[gpu_id], + calculated_hash = (crypt_sha512_hash *) clEnqueueMapBuffer(queue[gpu_id], pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, sizeof(crypt_sha512_hash) * kpc, 0, NULL, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error mapping page-locked memory out_hashes"); // create arguments (buffers) - salt_info = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, + salt_buffer = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, sizeof(crypt_sha512_salt), NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating data_info out argument"); - buffer_in = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, + pass_buffer = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, sizeof(crypt_sha512_password) * kpc, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_keys"); - buffer_out = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY, + hash_buffer = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY, sizeof(crypt_sha512_hash) * kpc, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_out"); //Set kernel arguments HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 0, sizeof (cl_mem), - (void *) &salt_info), "Error setting argument 0"); + (void *) &salt_buffer), "Error setting argument 0"); HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof (cl_mem), - (void *) &buffer_in), "Error setting argument 1"); + (void *) &pass_buffer), "Error setting argument 1"); HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof (cl_mem), - (void *) &buffer_out), "Error setting argument 2"); + (void *) &hash_buffer), "Error setting argument 2"); HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 3, //Fast working memory. + sizeof (crypt_sha512_salt), + NULL), "Error setting argument 3"); + HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 4, //Fast working memory. sizeof (working_memory) * local_work_size, - NULL), "Error setting argument 3"); - + NULL), "Error setting argument 4"); + memset(plaintext, '\0', sizeof(crypt_sha512_password) * kpc); - salt_data.saltlen = 0; - salt_data.rounds = 0; + memset(salt.salt, '\0', SALT_SIZE); + salt.length = 0; + salt.rounds = 0; max_keys_per_crypt = kpc; } @@ -127,18 +144,18 @@ static void release_clobj(void) { cl_int ret_code; ret_code = clEnqueueUnmapMemObject(queue[gpu_id], pinned_partial_hashes, - out_hashes, 0, NULL, NULL); + calculated_hash, 0, NULL, NULL); HANDLE_CLERROR(ret_code, "Error Ummapping out_hashes"); ret_code = clEnqueueUnmapMemObject(queue[gpu_id], pinned_saved_keys, plaintext, 0, NULL, NULL); HANDLE_CLERROR(ret_code, "Error Ummapping saved_plain"); - ret_code = clReleaseMemObject(salt_info); + ret_code = clReleaseMemObject(salt_buffer); HANDLE_CLERROR(ret_code, "Error Releasing data_info"); - ret_code = clReleaseMemObject(buffer_in); + ret_code = clReleaseMemObject(pass_buffer); HANDLE_CLERROR(ret_code, "Error Releasing buffer_keys"); - ret_code = clReleaseMemObject(buffer_out); + ret_code = clReleaseMemObject(hash_buffer); HANDLE_CLERROR(ret_code, "Error Releasing buffer_out"); ret_code = clReleaseMemObject(pinned_saved_keys); @@ -152,13 +169,13 @@ static void release_clobj(void) { static void set_key(char *key, int index) { int len = strlen(key); plaintext[index].length = len; - memcpy(plaintext[index].v, key, len); + memcpy(plaintext[index].pass, key, len); new_keys = 1; } static char *get_key(int index) { static char ret[PLAINTEXT_LENGTH + 1]; - memcpy(ret, plaintext[index].v, PLAINTEXT_LENGTH); + memcpy(ret, plaintext[index].pass, PLAINTEXT_LENGTH); ret[plaintext[index].length] = '\0'; return ret; } @@ -192,10 +209,10 @@ static void find_best_workgroup(void) { for (i = 0; i < get_task_max_size(); i++) { set_key("aaabaabaaa", i); } - HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_TRUE, 0, - sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL), + HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_TRUE, 0, + sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL), "Failed in clEnqueueWriteBuffer I"); - HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, buffer_in, CL_TRUE, 0, + HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_TRUE, 0, sizeof (crypt_sha512_password) * get_task_max_size(), plaintext, 0, NULL, NULL), "Failed in clEnqueueWriteBuffer II"); @@ -256,7 +273,7 @@ static void find_best_kpc(void) { tmpbuffer = malloc(sizeof (crypt_sha512_hash) * num); if (tmpbuffer == NULL) { - printf ("Failed in malloc inside find_best_kpc\n"); + printf ("Malloc failure in find_best_kpc\n"); exit (EXIT_FAILURE); } @@ -268,15 +285,15 @@ static void find_best_kpc(void) { for (i = 0; i < num; i++) { set_key("aaabaabaaa", i); } - HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_FALSE, 0, - sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL), + HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_FALSE, 0, + sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL), "Failed in clEnqueueWriteBuffer I"); - HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, buffer_in, CL_FALSE, 0, + HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_FALSE, 0, sizeof (crypt_sha512_password) * num, plaintext, 0, NULL, NULL), "Failed in clEnqueueWriteBuffer II"); ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &num, &local_work_size, 0, NULL, &myEvent); - HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, buffer_out, CL_FALSE, 0, + HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, hash_buffer, CL_FALSE, 0, sizeof (crypt_sha512_hash) * num, tmpbuffer, 0, NULL, NULL), "Failed in clEnqueueReadBuffer"); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); @@ -354,15 +371,15 @@ static void init(struct fmt_main *pFmt) { if (max_keys_per_crypt) create_clobj(max_keys_per_crypt); - else { + else { //user chose to die of boredom max_keys_per_crypt = get_task_max_size(); create_clobj(max_keys_per_crypt); find_best_kpc(); } - printf("Local work size (LWS) %d, Keys per crypt (KPC) %Zd\n", + printf("Local work size (LWS) %d, Keys per crypt (KPC) %Zd\n", (int) local_work_size, max_keys_per_crypt); - pFmt->params.max_keys_per_crypt = max_keys_per_crypt; + pFmt->params.max_keys_per_crypt = max_keys_per_crypt; } /* ------- Check if the ciphertext if a valid SHA-512 crypt ------- */ @@ -406,13 +423,13 @@ static void *get_salt(char *ciphertext) { return (void *) ret; } -static void set_salt(void *salt) { - unsigned char *s = salt; - int len = strlen(salt); - static char currentsalt[64]; - memcpy(currentsalt, s, len + 1); +static void set_salt(void *salt_info) { + int len = strlen(salt_info); unsigned char offset = 0; - salt_data.rounds = ROUNDS_DEFAULT; + static char currentsalt[64]; + + memcpy(currentsalt, (char *) salt_info, len + 1); + salt.rounds = ROUNDS_DEFAULT; if (strncmp((char *) "$6$", (char *) currentsalt, 3) == 0) offset += 3; @@ -424,13 +441,14 @@ static void set_salt(void *salt) { if (*endp == '$') { endp += 1; - salt_data.rounds = + salt.rounds = MAX(ROUNDS_MIN, MIN(srounds, ROUNDS_MAX)); } offset = endp - currentsalt; } - memcpy(salt_data.salt, currentsalt + offset, SALT_SIZE); - salt_data.saltlen = strlen((char *) salt_data.salt); + memcpy(salt.salt, currentsalt + offset, SALT_SIZE); + salt.length = strlen((char *) salt.salt); + salt.length = (salt.length > SALT_SIZE ? SALT_SIZE : salt.length); } /* ------- To binary functions ------- */ @@ -496,7 +514,7 @@ static int cmp_all(void *binary, int count) { uint64_t b = ((uint64_t *) binary)[0]; for (i = 0; i < count; i++) - if (b == out_hashes[i].v[0]) + if (b == calculated_hash[i].v[0]) return 1; return 0; } @@ -506,7 +524,7 @@ static int cmp_one(void *binary, int index) { uint64_t *t = (uint64_t *) binary; for (i = 0; i < 8; i++) { - if (t[i] != out_hashes[index].v[i]) + if (t[i] != calculated_hash[index].v[i]) return 0; } return 1; @@ -519,11 +537,11 @@ static int cmp_exact(char *source, int count) { /* ------- Crypt function ------- */ static void crypt_all(int count) { //Send data to the dispositive - HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_FALSE, 0, - sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL), + HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_FALSE, 0, + sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL), "failed in clEnqueueWriteBuffer data_info"); if (new_keys) - HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_in, CL_FALSE, 0, + HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], pass_buffer, CL_FALSE, 0, sizeof(crypt_sha512_password) * max_keys_per_crypt, plaintext, 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_in"); @@ -531,10 +549,10 @@ static void crypt_all(int count) { HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL, &max_keys_per_crypt, &local_work_size, 0, NULL, NULL), "failed in clEnqueueNDRangeKernel"); - + //Read back hashes - HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_out, CL_FALSE, 0, - sizeof(crypt_sha512_hash) * max_keys_per_crypt, out_hashes, 0, NULL, NULL), + HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], hash_buffer, CL_FALSE, 0, + sizeof(crypt_sha512_hash) * max_keys_per_crypt, calculated_hash, 0, NULL, NULL), "failed in reading data back"); //Do the work @@ -552,13 +570,13 @@ static int binary_hash_5(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFF static int binary_hash_6(void * binary) { return *(ARCH_WORD_32 *) binary & 0x7FFFFFF; } //Get Hash functions group. -static int get_hash_0(int index) { return out_hashes[index].v[0] & 0xF; } -static int get_hash_1(int index) { return out_hashes[index].v[0] & 0xFF; } -static int get_hash_2(int index) { return out_hashes[index].v[0] & 0xFFF; } -static int get_hash_3(int index) { return out_hashes[index].v[0] & 0xFFFF; } -static int get_hash_4(int index) { return out_hashes[index].v[0] & 0xFFFFF; } -static int get_hash_5(int index) { return out_hashes[index].v[0] & 0xFFFFFF; } -static int get_hash_6(int index) { return out_hashes[index].v[0] & 0x7FFFFFF; } +static int get_hash_0(int index) { return calculated_hash[index].v[0] & 0xF; } +static int get_hash_1(int index) { return calculated_hash[index].v[0] & 0xFF; } +static int get_hash_2(int index) { return calculated_hash[index].v[0] & 0xFFF; } +static int get_hash_3(int index) { return calculated_hash[index].v[0] & 0xFFFF; } +static int get_hash_4(int index) { return calculated_hash[index].v[0] & 0xFFFFF; } +static int get_hash_5(int index) { return calculated_hash[index].v[0] & 0xFFFFFF; } +static int get_hash_6(int index) { return calculated_hash[index].v[0] & 0x7FFFFFF; } /* ------- Format structure ------- */ struct fmt_main fmt_opencl_cryptsha512 = { @@ -611,4 +629,4 @@ struct fmt_main fmt_opencl_cryptsha512 = { cmp_one, cmp_exact } -}; \ No newline at end of file +}; -- 1.7.5.4