>From e5462c8b0a9e8c895900dad4d43e21d51512cb51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Claudio=20Andr=C3=A9?= Date: Sun, 18 Mar 2012 07:58:35 -0300 Subject: [PATCH] Improvement: get the LWS and KPC from john's config file. Improvement: check if group size is valid. Cosmetic: changes in benchmark comment. See commit: 9d3fe1414e. Performance: put all working variables in __local memory address space. Fixed: allow temp buffer to use all the available local memory. Fixed: use small KPC values when running on CPU. On real cracking tasks, big values behaves badly. Fixed: format and algorithm names. Fixed: copyright information. --- run/john.conf | 4 + src/common-opencl.c | 41 +++++++ src/common-opencl.h | 5 + src/opencl/cryptsha512_kernel.cl | 233 ++++++++++++++++++++------------------ src/opencl_cryptsha512.h | 35 +++++- src/opencl_cryptsha512_fmt.c | 189 +++++++++++++++++++------------ 6 files changed, 320 insertions(+), 187 deletions(-) diff --git a/run/john.conf b/run/john.conf index 0304c5b..eda8ccf 100644 --- a/run/john.conf +++ b/run/john.conf @@ -81,6 +81,10 @@ Device = 0 #ssha_LWS = 512 #ssha_KPC = 8192 +# For Crypt sha-512. +cryptsha512_LWS = 64 +cryptsha512_KPC = 8192 + # A user defined character class is named with a single digit, ie. 0..9. After # the equal-sign, just list all characters that this class should match. You diff --git a/src/common-opencl.c b/src/common-opencl.c index 96dfec2..76e3048 100644 --- a/src/common-opencl.c +++ b/src/common-opencl.c @@ -127,6 +127,47 @@ void opencl_init(char *kernel_filename, unsigned int dev_id, build_kernel(dev_id); } +cl_ulong get_local_memory_size(int dev_id) +{ + cl_ulong size; + HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_LOCAL_MEM_SIZE, + sizeof(cl_ulong), &size, NULL), + "Error querying CL_DEVICE_LOCAL_MEM_SIZE"); + + return size; +} + +size_t get_max_work_group_size(int dev_id) +{ + size_t max_group_size; + + HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof (max_group_size), &max_group_size, NULL), + "Error querying CL_DEVICE_MAX_WORK_GROUP_SIZE"); + + return max_group_size; +} + +cl_uint get_max_compute_units(int dev_id) +{ + cl_uint size; + HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(cl_uint), &size, NULL), + "Error querying CL_DEVICE_LOCAL_MEM_SIZE"); + + return size; +} + +cl_device_type get_device_type(int dev_id) +{ + cl_device_type type; + HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_TYPE, + sizeof(cl_device_type), &type, NULL), + "Error querying CL_DEVICE_LOCAL_MEM_SIZE"); + + return type; +} + char *get_error_name(cl_int cl_error) { static char *err_1[] = diff --git a/src/common-opencl.h b/src/common-opencl.h index 44d408e..1ba03d0 100644 --- a/src/common-opencl.h +++ b/src/common-opencl.h @@ -36,6 +36,11 @@ size_t max_group_size; void opencl_init(char *kernel_filename, unsigned int dev_id, unsigned int platform_id); +cl_ulong get_local_memory_size(int dev_id); +size_t get_max_work_group_size(int dev_id); +cl_uint get_max_compute_units(int dev_id); +cl_device_type get_device_type(int dev_id); + char *get_error_name(cl_int cl_error); void handle_clerror(cl_int cl_error, const char *message, const char *file, int line); diff --git a/src/opencl/cryptsha512_kernel.cl b/src/opencl/cryptsha512_kernel.cl index 95e1f12..c8a0c86 100644 --- a/src/opencl/cryptsha512_kernel.cl +++ b/src/opencl/cryptsha512_kernel.cl @@ -1,11 +1,17 @@ /* -* This software is Copyright (c) 2011 Lukas Odzioba -* and it is hereby released to the general public under the following terms: -* Redistribution and use in source and binary forms, with or without modification, are permitted. -*/ + * Developed by Claudio André in 2012 + * Based on source code provided by Lukas Odzioba + * + * This software is: + * Copyright (c) 2011 Lukas Odzioba + * Copyright (c) 2012 Claudio André + * and it is hereby released to the general public under the following terms: + * Redistribution and use in source and binary forms, with or without modification, are permitted. + * + * This program comes with ABSOLUTELY NO WARRANTY; express or implied . + */ #include "opencl_cryptsha512.h" -//#pragma OPENCL EXTENSION cl_amd_printf : enable __constant uint64_t k[] = { 0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, 0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL, @@ -30,7 +36,7 @@ __constant uint64_t k[] = { 0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL, 0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL, }; -void init_ctx(sha512_ctx * ctx) { +void init_ctx(__local sha512_ctx * ctx) { ctx->H[0] = 0x6a09e667f3bcc908UL; ctx->H[1] = 0xbb67ae8584caa73bUL; ctx->H[2] = 0x3c6ef372fe94f82bUL; @@ -43,24 +49,27 @@ void init_ctx(sha512_ctx * ctx) { ctx->buflen = 0; } -void memcpy_1(uint8_t * dest, const uint8_t * src, const size_t n) { +void memcpy_08(__local uint8_t * dest, __local const uint8_t * src, const size_t n) { for (int i = 0; i < n; i++) dest[i] = src[i]; } -void memcpy (uint8_t * dest, buffer_64 * src, const size_t n) { +void memcpy_64(__local uint8_t * dest, __local buffer_64 * src, const size_t n) { for (int i = 0; i < n; i++) dest[i] = src->mem_08[i]; } -void insert_to_buffer(sha512_ctx * ctx, const uint8_t * string, +void insert_to_buffer(__local sha512_ctx * ctx, + __local const uint8_t * string, const uint8_t len) { - uint8_t *d = ctx->buffer->mem_08 + ctx->buflen; //Position ctx->buffer[buflen] (in char size) - memcpy_1(d, string, len); + __local uint8_t *d; + d = ctx->buffer->mem_08 + ctx->buflen; //ctx->buffer[buflen] (in char size) + + memcpy_08(d, string, len); ctx->buflen += len; } -void sha512_block(sha512_ctx * ctx) { +void sha512_block(__local sha512_ctx * ctx) { int i; uint64_t a = ctx->H[0]; uint64_t b = ctx->H[1]; @@ -73,13 +82,14 @@ void sha512_block(sha512_ctx * ctx) { uint64_t w[16]; - uint64_t *data = ctx->buffer->mem_64; //The same as buffer[0] - //#pragma unroll 16 + __local uint64_t *data = ctx->buffer->mem_64; //The same as buffer[0] + + #pragma unroll 16 for (i = 0; i < 16; i++) w[i] = SWAP64(data[i]); uint64_t t1, t2; - //#pragma unroll 16 + #pragma unroll 16 for (i = 0; i < 16; i++) { t1 = k[i] + w[i] + h + Sigma1(e) + Ch(e, f, g); t2 = Maj(a, b, c) + Sigma0(a); @@ -94,7 +104,7 @@ void sha512_block(sha512_ctx * ctx) { a = t1 + t2; } - + #pragma unroll 64 for (i = 16; i < 80; i++) { w[i & 15] = sigma1(w[(i - 2) & 15]) + sigma0(w[(i - 15) & 15]) + w[(i - 16) & 15] + w[(i - 7) & 15]; t1 = k[i] + w[i & 15] + h + Sigma1(e) + Ch(e, f, g); @@ -120,51 +130,36 @@ void sha512_block(sha512_ctx * ctx) { ctx->H[7] += h; } -void ctx_append_1(sha512_ctx * ctx) { +void ctx_append_1(__local sha512_ctx * ctx) { uint32_t length = ctx->buflen; int i = 127 - length; - uint8_t *d = ctx->buffer->mem_08 + length; + __local uint8_t *d = ctx->buffer->mem_08 + length; *d++ = 0x80; while (i--) { d[i] = 0; } - -/* TODO: - while( length%4!=0) -160 { *d =0; -161 i--; -162 } -163 x=(uint32_t*)d; -164 while(i>0) -165 { i-=4; -166 *x =0; -167 } -*/ } -void ctx_add_length(sha512_ctx * ctx) { - uint64_t *blocks = ctx->buffer->mem_64; +void ctx_add_length(__local sha512_ctx * ctx) { + __local uint64_t *blocks = ctx->buffer->mem_64; blocks[15] = SWAP64((uint64_t) (ctx->total * 8)); } -void finish_ctx(sha512_ctx * ctx) { +void finish_ctx(__local sha512_ctx * ctx) { ctx_append_1(ctx); ctx_add_length(ctx); ctx->buflen = 0; } -void ctx_update(sha512_ctx * ctx, uint8_t *string, uint8_t len) { +void ctx_update(__local sha512_ctx * ctx, __local uint8_t *string, uint8_t len) { + ctx->total += len; uint8_t startpos = ctx->buflen; - uint8_t partsize; - if (startpos + len <= 128) { - partsize = len; - } else - partsize = 128 - startpos; - - insert_to_buffer(ctx, string, partsize); - if (ctx->buflen == 128) { + + insert_to_buffer(ctx, string, (startpos + len <= 128 ? len : 128 - startpos)); + + if (ctx->buflen == 128) { //Branching. uint8_t offset = 128 - startpos; sha512_block(ctx); ctx->buflen = 0; @@ -172,23 +167,25 @@ void ctx_update(sha512_ctx * ctx, uint8_t *string, uint8_t len) { } } -void clear_ctx_buffer(sha512_ctx * ctx) { +void clear_ctx_buffer(__local sha512_ctx * ctx) { - uint32_t *w = ctx->buffer->mem_32; - //#pragma unroll 30 - for (int i = 0; i < 30; i++) //TODO: why 30? Not 32? + __local uint32_t *w = ctx->buffer->mem_32; + + #pragma unroll 32 + for (int i = 0; i < 32; i++) w[i] = 0; ctx->buflen = 0; } -void sha512_digest(sha512_ctx * ctx, uint64_t * result) { - uint8_t i; +void sha512_digest(__local sha512_ctx * ctx, __local uint64_t * result) { + if (ctx->buflen <= 111) { //data+0x80+datasize fits in one 1024bit block finish_ctx(ctx); - sha512_block(ctx); + } else { uint8_t moved = 1; + if (ctx->buflen < 128) { //data and 0x80 fits in one block ctx_append_1(ctx); moved = 0; @@ -198,118 +195,138 @@ void sha512_digest(sha512_ctx * ctx, uint64_t * result) { if (moved) ctx->buffer->mem_08[0] = 0x80; //append 1,the rest is already clean ctx_add_length(ctx); - sha512_block(ctx); } - //#pragma unroll 8 - for (i = 0; i < 8; i++) + sha512_block(ctx); + + #pragma unroll 8 + for (int i = 0; i < 8; i++) result[i] = SWAP64(ctx->H[i]); } -void sha512crypt(uint8_t *pass, uint8_t passlength, - crypt_sha512_salt cuda_salt, +void sha512crypt(__local working_memory * tmp_working, __global crypt_sha512_hash * output) { - buffer_64 alt_result[8], temp_result[8]; - int i; - sha512_ctx ctx; +#define pass tmp_working->pass_info.v +#define passlength tmp_working->pass_info.length +#define salt tmp_working->salt_info.salt +#define saltlen tmp_working->salt_info.saltlen +#define rounds tmp_working->salt_info.rounds +#define alt_result tmp_working->alt_result +#define temp_result tmp_working->temp_result +#define s_sequence tmp_working->s_sequence +#define p_sequence tmp_working->p_sequence +#define ctx tmp_working->ctx_info + init_ctx(&ctx); ctx_update(&ctx, pass, passlength); - ctx_update(&ctx, cuda_salt.salt, cuda_salt.saltlen); + ctx_update(&ctx, salt, saltlen); ctx_update(&ctx, pass, passlength); sha512_digest(&ctx, alt_result->mem_64); init_ctx(&ctx); ctx_update(&ctx, pass, passlength); - ctx_update(&ctx, cuda_salt.salt, cuda_salt.saltlen); + ctx_update(&ctx, salt, saltlen); ctx_update(&ctx, alt_result->mem_08, passlength); - for (i = passlength; i > 0; i >>= 1) { - if ((i & 1) != 0) - ctx_update(&ctx, alt_result->mem_08, 64); - else - ctx_update(&ctx, pass, passlength); + for (int i = passlength; i > 0; i >>= 1) { + ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : pass), + ((i & 1) != 0 ? 64 : passlength)); } sha512_digest(&ctx, alt_result->mem_64); init_ctx(&ctx); - for (i = 0; i < passlength; i++) + for (int i = 0; i < passlength; i++) ctx_update(&ctx, pass, passlength); sha512_digest(&ctx, temp_result->mem_64); - - uint8_t sp_sequence[16 + 4]; - uint8_t *p_sequence = sp_sequence; - memcpy(p_sequence, temp_result, passlength); + memcpy_64(p_sequence, temp_result, passlength); init_ctx(&ctx); /* For every character in the password add the entire password. */ - for (i = 0; i < 16 + (alt_result->mem_08)[0]; i++) //Analyse, TÁ CERTO?### - ctx_update(&ctx, cuda_salt.salt, cuda_salt.saltlen); + for (int i = 0; i < 16 + (alt_result->mem_08)[0]; i++) + ctx_update(&ctx, salt, saltlen); /* Finish the digest. */ sha512_digest(&ctx, temp_result->mem_64); - - uint8_t saltlength = cuda_salt.saltlen; - - uint8_t ss_sequence[16 + 4]; - uint8_t *s_sequence = ss_sequence; - memcpy(s_sequence, temp_result, saltlength); + memcpy_64(s_sequence, temp_result, saltlen); /* Repeatedly run the collected hash value through SHA512 to burn CPU cycles. */ - for (i = 0; i < cuda_salt.rounds; i++) { + for (int i = 0; i < rounds; i++) { init_ctx(&ctx); - if ((i & 1) != 0) - ctx_update(&ctx, p_sequence, passlength); - else - ctx_update(&ctx, alt_result->mem_08, 64); + ctx_update(&ctx, ((i & 1) != 0 ? p_sequence : alt_result->mem_08), + ((i & 1) != 0 ? passlength : 64)); if ((i % 3) != 0) - ctx_update(&ctx, s_sequence, saltlength); + ctx_update(&ctx, s_sequence, saltlen); if ((i % 7) != 0) ctx_update(&ctx, p_sequence, passlength); - if ((i & 1) != 0) - ctx_update(&ctx, alt_result->mem_08, 64); - else - ctx_update(&ctx, p_sequence, passlength); - + ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : p_sequence), + ((i & 1) != 0 ? 64 : passlength)); sha512_digest(&ctx, alt_result->mem_64); } //Send results to the host. - //#pragma unroll 8 - for (i = 0; i < 8; i++) - output->v[i] = alt_result[i].mem_64[0]; + #pragma unroll 8 + for (int i = 0; i < 8; i++) + output->v[i] = alt_result[i].mem_64[0]; } +#undef salt +#undef saltlen +#undef rounds __kernel void kernel_crypt(__constant crypt_sha512_salt * hsalt, - __constant crypt_sha512_password * inbuffer, - __global crypt_sha512_hash * outbuffer) { - - uint8_t pass[PLAINTEXT_LENGTH]; - crypt_sha512_salt salt_data; + __global crypt_sha512_password * inbuffer, + __global crypt_sha512_hash * outbuffer, + __local working_memory * tmp_memory) { //Get the task to be done - uint32_t idx = get_global_id(0); + uint32_t gid = get_global_id(0); + uint32_t lid = get_local_id(0); - //Use fast memory. + //Transfer data to faster memory + //Password information + tmp_memory[lid].pass_info.length = inbuffer[gid].length; - //Get password information, put in faster memory. - for (int i = 0; i < inbuffer[idx].length; i++) - pass[i] = inbuffer[idx].v[i]; - - //Get salt information, put in faster memory. - salt_data.saltlen = hsalt->saltlen; - salt_data.rounds = hsalt->rounds; + #pragma unroll PLAINTEXT_LENGTH + for (int i = 0; i < PLAINTEXT_LENGTH; i++) + tmp_memory[lid].pass_info.v[i] = inbuffer[gid].v[i]; + + //Salt information. + tmp_memory[lid].salt_info.saltlen = hsalt->saltlen; + tmp_memory[lid].salt_info.rounds = hsalt->rounds; - for (int i = 0; i < salt_data.saltlen; i++) - salt_data.salt[i] = hsalt->salt[i]; + #pragma unroll SALT_SIZE + for (int i = 0; i < SALT_SIZE; i++) + tmp_memory[lid].salt_info.salt[i] = hsalt->salt[i]; //Do the job - sha512crypt(pass, inbuffer[idx].length, salt_data, &outbuffer[idx]); + sha512crypt(&tmp_memory[lid], &outbuffer[gid]); } + +/*** +* To improve performance, it uses __local memory to keep working variables +* (password, temp buffers, etc). In SHA 512 it means about 350 bytes per +* "thread". It improves performance a lot, but, local memory is a scarce +* resource. +* It means the max group size allowed in OpenCL SHA 512 is going to be +* 128 (hardware depended). +* +* Gain Optimizations +* -- Basic version, private and global variables only. +* Transfer all the working variables to local memory. +* -10% Move salt to constant memory space. Keep others in local (saves memory).IGNORED. +* 25% Unrool main loops. +* 5% Unrool other loops. +* -INF Do the compare task on GPU. +* +* Conclusions +* - Compare on GPU: CPU is more efficient for now. +* - Salt on constant memory is not good enought. +* - No register spilling happens after optimization. +***/ diff --git a/src/opencl_cryptsha512.h b/src/opencl_cryptsha512.h index f8f1852..c0b5f6a 100644 --- a/src/opencl_cryptsha512.h +++ b/src/opencl_cryptsha512.h @@ -1,12 +1,20 @@ /* -* This software is Copyright (c) 2011 Lukas Odzioba -* and it is hereby released to the general public under the following terms: -* Redistribution and use in source and binary forms, with or without modification, are permitted. -*/ + * Developed by Claudio André in 2012 + * Based on source code provided by Lukas Odzioba + * + * This software is: + * Copyright (c) 2011 Lukas Odzioba + * Copyright (c) 2012 Claudio André + * and it is hereby released to the general public under the following terms: + * Redistribution and use in source and binary forms, with or without modification, are permitted. + * + * This program comes with ABSOLUTELY NO WARRANTY; express or implied . + */ + #ifndef _CRYPTSHA512_H #define _CRYPTSHA512_H -//Type names definition. ///TODO: move to a new file and share this new file where needed. +//Type names definition. #define uint8_t unsigned char #define uint16_t unsigned short #define uint32_t unsigned int @@ -22,7 +30,12 @@ #define SALT_SIZE 16 #define PLAINTEXT_LENGTH 16 -#define KEYS_PER_CRYPT 1024*2048 +#define BINARY_SIZE (3+16+86) ///TODO: Magic number? + +#define KEYS_PER_CORE_CPU 512 +#define KEYS_PER_CORE_GPU 1024 +#define MIN_KEYS_PER_CRYPT 128 +#define MAX_KEYS_PER_CRYPT 2048*2048*128 #define rol(x,n) ((x << n) | (x >> (64-n))) #define ror(x,n) ((x >> n) | (x << (64-n))) @@ -76,4 +89,14 @@ typedef struct { uint64_t v[8]; //512 bits } crypt_sha512_hash; +typedef struct { + crypt_sha512_password pass_info; + crypt_sha512_salt salt_info; + sha512_ctx ctx_info; + buffer_64 alt_result[8]; + buffer_64 temp_result[8]; + uint8_t s_sequence[SALT_SIZE]; + uint8_t p_sequence[PLAINTEXT_LENGTH]; +} working_memory; + #endif \ No newline at end of file diff --git a/src/opencl_cryptsha512_fmt.c b/src/opencl_cryptsha512_fmt.c index dfaa949..2f49260 100644 --- a/src/opencl_cryptsha512_fmt.c +++ b/src/opencl_cryptsha512_fmt.c @@ -1,8 +1,10 @@ /* - * Copyright (c) 2011 Samuele Giovanni Tonon - * samu at linuxasylum dot net - * This program comes with ABSOLUTELY NO WARRANTY; express or - * implied . + * Developed by Claudio André in 2012 + * Based on source code provided by Samuele Giovanni Tonon + * + * Copyright (c) 2011 Samuele Giovanni Tonon + * Copyright (c) 2012 Claudio André + * This program comes with ABSOLUTELY NO WARRANTY; express or implied . * This is free software, and you are welcome to redistribute it * under certain conditions; as expressed here * http://www.gnu.org/licenses/gpl-2.0.html @@ -10,24 +12,19 @@ #include #include "common-opencl.h" +#include "config.h" #include "opencl_cryptsha512.h" -#define FORMAT_LABEL "cryptsha512-opencl" -#define FORMAT_NAME "crypt SHA-512 OpenCL" +#define FORMAT_LABEL "cryptsha512-opencl" +#define FORMAT_NAME "crypt SHA-512" +#define ALGORITHM_NAME "OpenCL" +#define SHA_TYPE "SHA512" -#if ARCH_BITS >= 64 -#define ALGORITHM_NAME "OpenSSL 64/" ARCH_BITS_STR -#else -#define ALGORITHM_NAME "OpenSSL 32/" ARCH_BITS_STR -#endif - -#define BENCHMARK_COMMENT " rounds=5000" +#define BENCHMARK_COMMENT " (rounds=5000)" #define BENCHMARK_LENGTH -1 -#define BINARY_SIZE (3+16+86) ///TODO: Magic number? - -#define MIN_KEYS_PER_CRYPT 1024 -#define MAX_KEYS_PER_CRYPT KEYS_PER_CRYPT +#define LWS_CONFIG "cryptsha512_LWS" +#define KPC_CONFIG "cryptsha512_KPC" static crypt_sha512_password *plaintext; // plaintext ciphertexts static crypt_sha512_hash *out_hashes; // calculated hashes @@ -41,7 +38,7 @@ cl_mem pinned_saved_keys, pinned_partial_hashes; cl_command_queue queue_prof; cl_kernel crypt_kernel; -static size_t max_keys_per_crypt = KEYS_PER_CRYPT; +static size_t max_keys_per_crypt; //TODO: move to common-opencl? local_work_size is there. static struct fmt_tests tests[] = { {"$6$saltstring$svn8UoSVapNtMuq1ukKS4tPQd8iKwSMHWjl/O817G3uBnIFNjnQJuesI68u4OTLiBFdcbYEdFCoEOfaS35inz1", "Hello world!"}, @@ -52,6 +49,27 @@ static struct fmt_tests tests[] = { {NULL} }; +/* ------- Helper functions ------- */ +uint get_task_max_work_group_size(){ + uint max_available; + max_available = get_local_memory_size(gpu_id) / sizeof(working_memory); + + if (max_available > get_max_work_group_size(gpu_id)) + return get_max_work_group_size(gpu_id); + + return max_available; +} + +uint get_task_max_size(){ + uint max_available; + max_available = get_max_compute_units(gpu_id); + + if (get_device_type(gpu_id) == CL_DEVICE_TYPE_CPU) + return max_available * KEYS_PER_CORE_CPU; + + return max_available * KEYS_PER_CORE_GPU; +} + /* ------- Create and destroy necessary objects ------- */ static void create_clobj(int kpc) { pinned_saved_keys = clCreateBuffer(context[gpu_id], @@ -78,7 +96,7 @@ static void create_clobj(int kpc) { salt_info = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, sizeof(crypt_sha512_salt), NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating data_info out argument"); - + buffer_in = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, sizeof(crypt_sha512_password) * kpc, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_keys"); @@ -93,8 +111,11 @@ static void create_clobj(int kpc) { HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof (cl_mem), (void *) &buffer_in), "Error setting argument 1"); HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof (cl_mem), - (void *) &buffer_out), "Error setting argument 2"); - + (void *) &buffer_out), "Error setting argument 2"); + HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 3, //Fast working memory. + sizeof (working_memory) * local_work_size, + NULL), "Error setting argument 3"); + memset(plaintext, '\0', sizeof(crypt_sha512_password) * kpc); salt_data.saltlen = 0; salt_data.rounds = 0; @@ -143,7 +164,12 @@ static char *get_key(int index) { /* ------- Try to find the best configuration ------- */ /* -- This function could be used to calculated the best num - of keys per crypt for the given format + for the workgroup + Work-items that make up a work-group (also referred to + as the size of the work-group) + LWS should never be a big number since every work-item + uses about 400 bytes of local memory. Local memory + is usually 32 KB -- */ static void find_best_workgroup(void) { cl_event myEvent; @@ -153,24 +179,25 @@ static void find_best_workgroup(void) { int i; size_t max_group_size; - clGetDeviceInfo(devices[gpu_id], CL_DEVICE_MAX_WORK_GROUP_SIZE, - sizeof (max_group_size), &max_group_size, NULL); + max_group_size = get_max_work_group_size(gpu_id); queue_prof = clCreateCommandQueue(context[gpu_id], devices[gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code); printf("Max Group Work Size %d ", (int) max_group_size); local_work_size = 1; // Set keys - for (i = 0; i < KEYS_PER_CRYPT; i++) { + for (i = 0; i < get_task_max_size(); i++) { set_key("aaabaabaaa", i); } clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_TRUE, 0, sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL); clEnqueueWriteBuffer(queue_prof, buffer_in, CL_TRUE, 0, - sizeof (crypt_sha512_password) * KEYS_PER_CRYPT, plaintext, 0, NULL, NULL); + sizeof (crypt_sha512_password) * get_task_max_size(), + plaintext, 0, NULL, NULL); // Find minimum time - for (my_work_group = 1; (int) my_work_group <= (int) max_group_size; my_work_group *= 2) { + for (my_work_group = 1; (int) my_work_group <= (int) get_task_max_work_group_size(); + my_work_group *= 2) { ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &max_keys_per_crypt, &my_work_group, 0, NULL, &myEvent); clFinish(queue_prof); @@ -184,14 +211,17 @@ static void find_best_workgroup(void) { sizeof (cl_ulong), &startTime, NULL); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endTime, NULL); - + clReleaseEvent (myEvent); + if ((endTime - startTime) < kernelExecTimeNs) { kernelExecTimeNs = endTime - startTime; local_work_size = my_work_group; } } printf("Optimal local work size %d\n", (int) local_work_size); - printf("(to avoid this test on next run do export LWS=%d)\n", (int) local_work_size); + printf("(to avoid this test on next run, put \"" + LWS_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS + SUBSECTION_OPENCL "])\n", (int)local_work_size); clReleaseCommandQueue(queue_prof); } @@ -200,10 +230,10 @@ static void find_best_workgroup(void) { of keys per crypt for the given format -- */ static void find_best_kpc(void) { - int num; + size_t num; cl_event myEvent; cl_ulong startTime, endTime, tmpTime; - int kernelExecTimeNs = 6969; + cl_ulong kernelExecTimeNs = CL_ULONG_MAX; cl_int ret_code; int optimal_kpc = MIN_KEYS_PER_CRYPT; int i; @@ -211,26 +241,28 @@ static void find_best_kpc(void) { printf("Calculating best keys per crypt, this will take a while "); - for (num = MAX_KEYS_PER_CRYPT; num > MIN_KEYS_PER_CRYPT; num -= 4096) { + for (num = get_task_max_size(); (int) num > MIN_KEYS_PER_CRYPT; num -= 4096) { release_clobj(); create_clobj(num); advance_cursor(); + tmpbuffer = malloc(sizeof (crypt_sha512_hash) * num); queue_prof = clCreateCommandQueue(context[gpu_id], devices[gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code); - + // Set keys for (i = 0; i < num; i++) { set_key("aaabaabaaa", i); } clEnqueueWriteBuffer(queue[gpu_id], salt_info, CL_FALSE, 0, sizeof (crypt_sha512_salt), &salt_data, 0, NULL, NULL); - clEnqueueWriteBuffer(queue_prof, buffer_in, CL_TRUE, 0, - sizeof (crypt_sha512_password) * num, plaintext, 0, NULL, NULL); - + clEnqueueWriteBuffer(queue_prof, buffer_in, CL_FALSE, 0, + sizeof (crypt_sha512_password) * num, plaintext, 0, NULL, NULL); ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, - 1, NULL, &max_keys_per_crypt, &local_work_size, 0, NULL, &myEvent); + 1, NULL, &num, &local_work_size, 0, NULL, &myEvent); + clEnqueueReadBuffer(queue_prof, buffer_out, CL_FALSE, 0, + sizeof (crypt_sha512_hash) * num, tmpbuffer, 0, NULL, NULL); clFinish(queue_prof); - + if (ret_code != CL_SUCCESS) { printf("Error %d\n", ret_code); continue; @@ -240,17 +272,9 @@ static void find_best_kpc(void) { clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endTime, NULL); + clReleaseEvent (myEvent); tmpTime = endTime - startTime; - tmpbuffer = malloc(sizeof (cl_uint) * num); - - clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, - sizeof (cl_uint) * num, tmpbuffer, 0, NULL, &myEvent); - clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, - sizeof (cl_ulong), &startTime, NULL); - clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, - sizeof (cl_ulong), &endTime, NULL); - tmpTime = tmpTime + (endTime - startTime); - + if (((int) (((float) (tmpTime) / num) * 10)) <= kernelExecTimeNs) { kernelExecTimeNs = ((int) (((float) (tmpTime) / num) * 10)); optimal_kpc = num; @@ -259,8 +283,9 @@ static void find_best_kpc(void) { clReleaseCommandQueue(queue_prof); } printf("Optimal keys per crypt %d\n", optimal_kpc); - printf("(to avoid this test on next run do \"export KPC=%d\")\n", optimal_kpc); - + printf("to avoid this test on next run, put \"" + KPC_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS + SUBSECTION_OPENCL "])\n", optimal_kpc); max_keys_per_crypt = optimal_kpc; release_clobj(); create_clobj(optimal_kpc); @@ -268,36 +293,54 @@ static void find_best_kpc(void) { /* ------- Initialization ------- */ static void init(struct fmt_main *pFmt) { - char *kpc; + char *tmp_value; opencl_init("$JOHN/cryptsha512_kernel.cl", gpu_id, platform_id); + max_keys_per_crypt = get_task_max_size(); + local_work_size = 0; // create kernel to execute crypt_kernel = clCreateKernel(program[gpu_id], "kernel_crypt", &ret_code); HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?"); - if (((kpc = getenv("LWS")) == NULL) || (atoi(kpc) == 0)) { - create_clobj(KEYS_PER_CRYPT); + if ((tmp_value = cfg_get_param(SECTION_OPTIONS, + SUBSECTION_OPENCL, LWS_CONFIG))) + local_work_size = atoi(tmp_value); + + if ((tmp_value = getenv("LWS"))) + local_work_size = atoi(tmp_value); + + //Check if local_work_size is a valid number. + if (local_work_size > get_task_max_work_group_size()){ + printf("Error: invalid local work size (LWS). Max value allowed is: %u\n" , + get_task_max_work_group_size()); + local_work_size = 0; //Force find a valid number. + } + + if (!local_work_size) { + local_work_size = get_task_max_work_group_size(); + create_clobj(max_keys_per_crypt); find_best_workgroup(); release_clobj(); - } else { - local_work_size = atoi(kpc); } - if ((kpc = getenv("KPC")) == NULL) { - max_keys_per_crypt = KEYS_PER_CRYPT; - create_clobj(KEYS_PER_CRYPT); - } else { - if (atoi(kpc) == 0) { - //user chose to die of boredom - max_keys_per_crypt = KEYS_PER_CRYPT; - create_clobj(KEYS_PER_CRYPT); - find_best_kpc(); - } else { - max_keys_per_crypt = atoi(kpc); - create_clobj(max_keys_per_crypt); - } + + if ((tmp_value = cfg_get_param(SECTION_OPTIONS, + SUBSECTION_OPENCL, KPC_CONFIG))) + max_keys_per_crypt = atoi(tmp_value); + + if ((tmp_value = getenv("KPC"))) + max_keys_per_crypt = atoi(tmp_value); + + if (max_keys_per_crypt) + create_clobj(max_keys_per_crypt); + + else { + //user chose to die of boredom + max_keys_per_crypt = get_task_max_size(); + create_clobj(max_keys_per_crypt); + find_best_kpc(); } printf("Local work size (LWS) %d, Keys per crypt (KPC) %Zd\n", - (int) local_work_size, max_keys_per_crypt); + (int) local_work_size, max_keys_per_crypt); pFmt->params.max_keys_per_crypt = max_keys_per_crypt; } @@ -365,7 +408,7 @@ static void set_salt(void *salt) { } offset = endp - currentsalt; } - memcpy(salt_data.salt, currentsalt + offset, 16); + memcpy(salt_data.salt, currentsalt + offset, SALT_SIZE); salt_data.saltlen = strlen((char *) salt_data.salt); } @@ -469,7 +512,7 @@ static void crypt_all(int count) { //Read back hashes HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_out, CL_FALSE, 0, - sizeof (crypt_sha512_hash) * max_keys_per_crypt, out_hashes, 0, NULL, NULL), + sizeof(crypt_sha512_hash) * max_keys_per_crypt, out_hashes, 0, NULL, NULL), "failed in reading data back"); //Do the work @@ -477,7 +520,7 @@ static void crypt_all(int count) { } /* ------- Binary Hash functions group ------- */ -static int binary_hash_0(void * binary) { return *(ARCH_WORD_32 *) binary & 0xF; } +static int binary_hash_0(void * binary) { return *(ARCH_WORD_32 *) binary & 0xF; } static int binary_hash_1(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFF; } static int binary_hash_2(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFFF; } static int binary_hash_3(void * binary) { return *(ARCH_WORD_32 *) binary & 0xFFFF; } @@ -545,4 +588,4 @@ struct fmt_main fmt_opencl_cryptsha512 = { cmp_one, cmp_exact } -}; +}; \ No newline at end of file -- 1.7.5.4