diff --git a/run/john.conf b/run/john.conf index 23cec4c..7ed0ab7 100644 --- a/run/john.conf +++ b/run/john.conf @@ -86,8 +86,8 @@ Device = 0 #rar_KPC = 8192 # For Crypt sha-512. -cryptsha512_LWS = 64 -cryptsha512_KPC = 8192 +#cryptsha512_LWS = 64 +#cryptsha512_KPC = 5120 # A user defined character class is named with a single digit, ie. 0..9. After diff --git a/src/common-opencl.c b/src/common-opencl.c index f1b6cfa..f26c189 100644 --- a/src/common-opencl.c +++ b/src/common-opencl.c @@ -8,6 +8,8 @@ static char opencl_log[LOG_SIZE]; static char *kernel_source; static int kernel_loaded; +static int device_info; +static int cores_per_MP; void advance_cursor() { static int pos=0; @@ -85,30 +87,20 @@ static void dev_init(unsigned int dev_id, unsigned int platform_id) HANDLE_CLERROR(ret_code, "Error creating command queue"); } -cl_device_type get_device_type(int dev_id) -{ - cl_device_type type; - HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_TYPE, - sizeof(cl_device_type), &type, NULL), - "Error querying CL_DEVICE_TYPE"); - - return type; -} - static char * include_source(char *pathname, int dev_id) { static char include[PATH_BUFFER_SIZE]; - sprintf(include, "-I %s %s %s", path_expand(pathname), + sprintf(include, "-I %s %s %s%d %s", path_expand(pathname), get_device_type(dev_id) == CL_DEVICE_TYPE_CPU ? "-DDEVICE_IS_CPU" : "", + "-DDEVICE_INFO=", device_info, "-cl-strict-aliasing -cl-mad-enable"); //fprintf(stderr, "Options used: %s\n", include); return include; } - static void build_kernel(int dev_id) { assert(kernel_loaded); @@ -164,13 +156,54 @@ static void build_kernel(int dev_id) #endif } +void opencl_get_dev_info(unsigned int dev_id) +{ + cl_device_type device; + + device = get_device_type(dev_id); + + if (device == CL_DEVICE_TYPE_CPU) + device_info = CPU; + else if (device == CL_DEVICE_TYPE_GPU) + device_info = GPU; + else if (device == CL_DEVICE_TYPE_ACCELERATOR) + device_info = ACCELERATOR; + + device_info += get_vendor_id(dev_id); + device_info += get_processor_family(dev_id); +} + +void opencl_init_dev(unsigned int dev_id, unsigned int platform_id) +{ + dev_init(dev_id, platform_id); + opencl_get_dev_info(dev_id); +} + +void opencl_build_kernel(char *kernel_filename, unsigned int dev_id) +{ + read_kernel_source(kernel_filename); + build_kernel(dev_id); +} + void opencl_init(char *kernel_filename, unsigned int dev_id, unsigned int platform_id) { - //if (!kernel_loaded) - read_kernel_source(kernel_filename); - dev_init(dev_id, platform_id); - build_kernel(dev_id); + opencl_init_dev(dev_id, platform_id); + opencl_build_kernel(kernel_filename, dev_id); +} + +int get_device_info(){ + return device_info; +} + +cl_device_type get_device_type(int dev_id) +{ + cl_device_type type; + HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_TYPE, + sizeof(cl_device_type), &type, NULL), + "Error querying CL_DEVICE_TYPE"); + + return type; } cl_ulong get_local_memory_size(int dev_id) @@ -194,6 +227,17 @@ size_t get_max_work_group_size(int dev_id) return max_group_size; } +size_t get_current_work_group_size(int dev_id, cl_kernel crypt_kernel) { + size_t max_group_size; + + HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[dev_id], + CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_group_size), + &max_group_size, NULL), + "Error querying clGetKernelWorkGroupInfo"); + + return max_group_size; +} + cl_uint get_max_compute_units(int dev_id) { cl_uint size; @@ -204,6 +248,95 @@ cl_uint get_max_compute_units(int dev_id) return size; } +cl_uint get_processors_count(int dev_id) +{ + int major = 0, minor = 0; + cl_uint core_count = get_max_compute_units(dev_id); + + if (gpu_nvidia(device_info)) { + //oclGetDevCap(devices[dev_id], &major, &minor); + + if (major == 1) + core_count *= (cores_per_MP = 8); + else if (major == 2 && minor == 0) + core_count *= (cores_per_MP = 32); //2.0 + else if (major == 2 && minor >= 1) + core_count *= (cores_per_MP = 48); //2.1 and up + else if (major == 3) + core_count *= (cores_per_MP = 192); //3.0 and up + else + core_count *= (cores_per_MP = 192); //Future use + + if (major == 9999 && minor == 9999) + core_count = 0; + } + else if (gpu_amd(device_info)) { + core_count *= 16 * //16 thread processors * 5 SP + ((amd_gcn(device_info) || amd_vliw4(device_info)) ? 4 : 5); + } + else if (gpu(device_info)) //Any other GPU + core_count *=8; + + return core_count; +} + +cl_uint get_processor_family(int dev_id) +{ + char dname[MAX_OCLINFO_STRING_LEN]; + + HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_NAME, + sizeof(dname), dname, NULL), + "Error querying CL_DEVICE_NAME"); + + if gpu(device_info) { + + if (gpu_amd(device_info) && ( + strstr(dname, "Cedar") || + strstr(dname, "Redwood") || + strstr(dname, "Juniper") || + strstr(dname, "Cypress") || + strstr(dname, "Hemlock") || + strstr(dname, "Caicos") || + strstr(dname, "Turks") || + strstr(dname, "Barts") || + strstr(dname, "Cayman") || + strstr(dname, "Antilles") || + strstr(dname, "Wrestler") || + strstr(dname, "Zacate") || + strstr(dname, "WinterPark") || + strstr(dname, "BeaverCreek"))) { + + if (strstr(dname, "Cayman") || + strstr(dname, "Antilles")) + return AMD_VLIW4; + else + return AMD_VLIW5; + + } else + return AMD_GCN + AMD_VLIW5; + } + return UNKNOWN; +} + +int get_vendor_id(int dev_id) +{ + char dname[MAX_OCLINFO_STRING_LEN]; + + HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_VENDOR, + sizeof(dname), dname, NULL), + "Error querying CL_DEVICE_VENDOR"); + + if (strstr (dname, "NVIDIA") != NULL) + return NVIDIA; + + if (strstr (dname, "Advanced Micro") !=NULL || + strstr (dname, "AMD") !=NULL || + strstr (dname, "ATI") != NULL) + return AMD; + + return UNKNOWN; +} + char *get_error_name(cl_int cl_error) { static char *err_1[] = @@ -264,7 +397,6 @@ char *megastring(unsigned long long value) return outbuf; } -#define MAX_OCLINFO_STRING_LEN 64 void listOpenCLdevices(void) { char dname[MAX_OCLINFO_STRING_LEN]; cl_uint num_platforms, num_devices, entries; @@ -334,7 +466,10 @@ void listOpenCLdevices(void) { clGetDeviceInfo(devices[d], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &p_size, NULL); printf("\tMax Work Group Size:\t%d\n", (int)p_size); clGetDeviceInfo(devices[d], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &entries, NULL); - printf("\tParallel compute cores:\t%d\n\n", entries); + printf("\tParallel compute cores:\t%d\n", entries); + + opencl_get_dev_info(d); + printf("\tStream processors:\t%d\n\n", get_processors_count(d)); } } return; diff --git a/src/common-opencl.h b/src/common-opencl.h index 81b79b1..3ca7ec8 100644 --- a/src/common-opencl.h +++ b/src/common-opencl.h @@ -17,6 +17,7 @@ #define MAXGPUS 8 #define MAX_PLATFORMS 8 #define SUBSECTION_OPENCL ":OpenCL" +#define MAX_OCLINFO_STRING_LEN 64 /* Comment if you do not want to see OpenCL warnings during kernel compilation */ #define REPORT_OPENCL_WARNINGS @@ -33,13 +34,44 @@ cl_kernel crypt_kernel; size_t local_work_size; size_t max_group_size; +cl_int oclGetDevCap(cl_device_id device, cl_int *iComputeCapMajor, cl_int *iComputeCapMinor); + +void opencl_init_dev(unsigned int dev_id, unsigned int platform_id); void opencl_init(char *kernel_filename, unsigned int dev_id, unsigned int platform_id); +void opencl_build_kernel(char *kernel_filename, unsigned int dev_id); +int get_device_info(); +cl_device_type get_device_type(int dev_id); cl_ulong get_local_memory_size(int dev_id); size_t get_max_work_group_size(int dev_id); +size_t get_current_work_group_size(int dev_id, cl_kernel crypt_kernel); cl_uint get_max_compute_units(int dev_id); -cl_device_type get_device_type(int dev_id); +cl_uint get_processors_count(int dev_id); +cl_uint get_processor_family(int dev_id); +int get_vendor_id(int dev_id); + +#define UNKNOWN 0 +#define CPU 1 +#define GPU 2 +#define ACCELERATOR 4 +#define AMD 64 +#define NVIDIA 128 +#define INTEL 256 +#define AMD_GCN 1024 +#define AMD_VLIW4 2048 +#define AMD_VLIW5 4096 + +#define cpu(n) ((n & CPU) == (CPU)) +#define gpu(n) ((n & GPU) == (GPU)) +#define gpu_amd(n) ((n & AMD) && gpu(n)) +#define gpu_amd_64(n) (0) +#define gpu_nvidia(n) ((n & NVIDIA) && gpu(n)) +#define gpu_intel(n) ((n & INTEL) && gpu(n)) +#define cpu_amd(n) ((n & AMD) && cpu(n)) +#define amd_gcn(n) ((n & AMD_GCN) && gpu_amd(n)) +#define amd_vliw4(n) ((n & AMD_VLIW4) && gpu_amd(n)) +#define amd_vliw5(n) ((n & AMD_VLIW5) && gpu_amd(n)) char *get_error_name(cl_int cl_error); diff --git a/src/opencl/cryptsha512_kernel.cl b/src/opencl/cryptsha512_kernel.cl index be0ddcd..012b43e 100644 --- a/src/opencl/cryptsha512_kernel.cl +++ b/src/opencl/cryptsha512_kernel.cl @@ -2,6 +2,8 @@ * Developed by Claudio André in 2012 * Based on source code provided by Lukas Odzioba * + * More information at http://openwall.info/wiki/john/OpenCL-SHA-512 + * * This software is: * Copyright (c) 2011 Lukas Odzioba * Copyright (c) 2012 Claudio André @@ -125,16 +127,10 @@ void sha512_block(__local sha512_ctx * ctx) { uint64_t t1, t2; uint64_t w[16]; -#ifdef DEVICE_IS_CPU - #pragma unroll 16 - for (int i = 0; i < 16; i++) - w[i] = SWAP64(ctx->buffer->mem_64[i]); -#else ulong16 w_vector; w_vector = vload16(0, ctx->buffer->mem_64); w_vector = SWAP64(w_vector); vstore16(w_vector, 0, w); -#endif #pragma unroll 80 for (int i = 0; i < 80; i++) { @@ -165,13 +161,24 @@ void sha512_block(__local sha512_ctx * ctx) { } void ctx_append_1(__local sha512_ctx * ctx) { - int i = 127 - ctx->buflen; - __local uint8_t * d = ctx->buffer->mem_08 + ctx->buflen; + + int length = ctx->buflen; + int i = 127 - length; + __local uint8_t * d = ctx->buffer->mem_08 + length; + __local uint32_t * l; *d++ = 0x80; - while (i--) { - d[i] = 0; + while((++length % 4) != 0) + { + *d++ = 0; + i--; + } + l = (__local uint32_t*) d; + + while (i > 0) { + i-= 4; + *l++ = 0; } } @@ -204,10 +211,10 @@ void ctx_update(__local sha512_ctx * ctx, void clear_ctx_buffer(__local sha512_ctx * ctx) { - __local uint32_t *w = ctx->buffer->mem_32; + __local uint64_t *w = ctx->buffer->mem_64; - #pragma unroll 32 - for (int i = 0; i < 32; i++) + //#pragma unroll 16 + for (int i = 0; i < 16; i++) w[i] = 0; ctx->buflen = 0; @@ -302,7 +309,7 @@ void sha512crypt(__local working_memory * fast_tmp_memory, ctx_update(&ctx, p_sequence->mem_08, passlen); ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : p_sequence->mem_08), - ((i & 1) != 0 ? 64 : passlen)); + ((i & 1) != 0 ? 64 : passlen)); sha512_digest(&ctx, alt_result->mem_64); } //Send results to the host. @@ -315,11 +322,14 @@ void sha512crypt(__local working_memory * fast_tmp_memory, #undef rounds #undef pass -__kernel void kernel_crypt(__constant crypt_sha512_salt * informed_salt, - __global crypt_sha512_password * pass_data, - __global crypt_sha512_hash * out_buffer, - __local crypt_sha512_salt * salt_data, - __local working_memory * fast_tmp_memory) { +__kernel +// __attribute__((vec_type_hint(ulong2))) Not recognized. +// __attribute__((reqd_work_group_size(32, 1, 1))) No gain. +void kernel_crypt(__constant crypt_sha512_salt * informed_salt, + __global crypt_sha512_password * pass_data, + __global crypt_sha512_hash * out_buffer, + __local crypt_sha512_salt * salt_data, + __local working_memory * fast_tmp_memory) { //Get the task to be done size_t gid = get_global_id(0); @@ -365,6 +375,7 @@ __kernel void kernel_crypt(__constant crypt_sha512_salt * informed_salt, * 5% Remove some unecessary code. * ### Move almost everything to global and local memory. BAD. * 1% Use vector types in SHA_Block in some variables. +* 5% Use bitselect in SHA_Block. * * Conclusions * - Compare on GPU: CPU is more efficient for now. @@ -376,4 +387,4 @@ __kernel void kernel_crypt(__constant crypt_sha512_salt * informed_salt, * so should not expect big gains anyway. * If i have a lot of memory, i might solve more than one hash at once * (and use more vectors). But it is not possible (at least for a while). -***/ \ No newline at end of file +***/ diff --git a/src/opencl_cryptsha512.h b/src/opencl_cryptsha512.h index 7799c0e..2409bbf 100644 --- a/src/opencl_cryptsha512.h +++ b/src/opencl_cryptsha512.h @@ -2,6 +2,8 @@ * Developed by Claudio André in 2012 * Based on source code provided by Lukas Odzioba * + * More information at http://openwall.info/wiki/john/OpenCL-SHA-512 + * * This software is: * Copyright (c) 2011 Lukas Odzioba * Copyright (c) 2012 Claudio André @@ -14,6 +16,29 @@ #ifndef _CRYPTSHA512_H #define _CRYPTSHA512_H +//Copied from common-opencl.h +#define UNKNOWN 0 +#define CPU 1 +#define GPU 2 +#define ACCELERATOR 4 +#define AMD 64 +#define NVIDIA 128 +#define INTEL 256 +#define AMD_GCN 1024 +#define AMD_VLIW4 2048 +#define AMD_VLIW5 4096 + +#define cpu(n) ((n & CPU) == (CPU)) +#define gpu(n) ((n & GPU) == (GPU)) +#define gpu_amd(n) ((n & AMD) && gpu(n)) +#define gpu_amd_64(n) (0) +#define gpu_nvidia(n) ((n & NVIDIA) && gpu(n)) +#define gpu_intel(n) ((n & INTEL) && gpu(n)) +#define cpu_amd(n) ((n & AMD) && cpu(n)) +#define amd_gcn(n) ((n & AMD_GCN) && gpu_amd(n)) +#define amd_vliw4(n) ((n & AMD_VLIW4) && gpu_amd(n)) +#define amd_vliw5(n) ((n & AMD_VLIW5) && gpu_amd(n)) + //Type names definition. #define uint8_t unsigned char #define uint16_t unsigned short @@ -24,6 +49,7 @@ #define MAX(x,y) ((x) > (y) ? (x) : (y)) #define MIN(x,y) ((x) < (y) ? (x) : (y)) +//Constants. #define ROUNDS_DEFAULT 5000 #define ROUNDS_MIN 1000 #define ROUNDS_MAX 999999999 @@ -31,16 +57,31 @@ #define SALT_SIZE 16 #define PLAINTEXT_LENGTH 16 #define BINARY_SIZE (3+16+86) ///TODO: Magic number? +#define STEP 512 -#define KEYS_PER_CORE_CPU 512 -#define KEYS_PER_CORE_GPU 1024 +#define KEYS_PER_CORE_CPU 128 +#define KEYS_PER_CORE_GPU 512 #define MIN_KEYS_PER_CRYPT 128 -#define MAX_KEYS_PER_CRYPT 2048*2048*128 +#define MAX_KEYS_PER_CRYPT 2048*1024 + +//Macros. +#if gpu_amd_64(DEVICE_INFO) + #pragma OPENCL EXTENSION cl_amd_media_ops : enable + #define ror(x, n) amd_bitalign(x, x, (uint64_t) n) + #define Ch(x, y, z) amd_bytealign(x, y, z) + #define Maj(x, y, z) amd_bytealign(z ^ x, y, x ) +#elif gpu_amd(DEVICE_INFO) + #define Ch(x,y,z) bitselect(z, y, x) + #define Maj(x,y,z) bitselect(x, y, z ^ x) + #define ror(x, n) rotate(x, (uint64_t) 64-n) +#elif gpu_nvidia(DEVICE_INFO) + #pragma OPENCL EXTENSION cl_nv_pragma_unroll : enable +#else + #define Ch(x,y,z) ((x & y) ^ ( (~x) & z)) + #define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z)) + #define ror(x, n) ((x >> n) | (x << (64-n))) +#endif -#define rol(x,n) rotate(x, n) -#define ror(x,n) rotate(x, (uint64_t) 64-n) -#define Ch(x,y,z) ((x & y) ^ ( (~x) & z)) -#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z)) #define Sigma0(x) ((ror(x,28)) ^ (ror(x,34)) ^ (ror(x,39))) #define Sigma1(x) ((ror(x,14)) ^ (ror(x,18)) ^ (ror(x,41))) #define sigma0(x) ((ror(x,1)) ^ (ror(x,8)) ^ (x>>7)) @@ -93,4 +134,4 @@ typedef struct { buffer_64 temp_result[8]; buffer_64 p_sequence[8]; } working_memory; -#endif \ No newline at end of file +#endif diff --git a/src/opencl_cryptsha512_fmt.c b/src/opencl_cryptsha512_fmt.c index e08e08d..f10e431 100644 --- a/src/opencl_cryptsha512_fmt.c +++ b/src/opencl_cryptsha512_fmt.c @@ -2,6 +2,8 @@ * Developed by Claudio André in 2012 * Based on source code provided by Samuele Giovanni Tonon * + * More information at http://openwall.info/wiki/john/OpenCL-SHA-512 + * * Copyright (c) 2011 Samuele Giovanni Tonon * Copyright (c) 2012 Claudio André * This program comes with ABSOLUTELY NO WARRANTY; express or implied . @@ -11,7 +13,7 @@ */ #include -#include "common-opencl.h" +#include "common-opencl.h" #include "config.h" #include "opencl_cryptsha512.h" @@ -50,26 +52,14 @@ static struct fmt_tests tests[] = { {NULL} }; -size_t get_current_work_group_size(int dev_id) -{ - size_t max_group_size; - - HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[dev_id], - CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_group_size), - &max_group_size, NULL), - "Error querying clGetKernelWorkGroupInfo"); - - return max_group_size; -} - /* ------- Helper functions ------- */ unsigned int get_task_max_work_group_size(){ unsigned int max_available; max_available = get_local_memory_size(gpu_id) / (sizeof(working_memory) + sizeof(crypt_sha512_salt)); - if (max_available > get_current_work_group_size(gpu_id)) - return get_current_work_group_size(gpu_id); + if (max_available > get_current_work_group_size(gpu_id, crypt_kernel)) + return get_current_work_group_size(gpu_id, crypt_kernel); return max_available; } @@ -77,8 +67,8 @@ unsigned int get_task_max_work_group_size(){ unsigned int get_task_max_size(){ unsigned int max_available; max_available = get_max_compute_units(gpu_id); - - if (get_device_type(gpu_id) == CL_DEVICE_TYPE_CPU) + + if (cpu(get_device_info())) return max_available * KEYS_PER_CORE_CPU; return max_available * KEYS_PER_CORE_GPU; @@ -165,6 +155,50 @@ static void release_clobj(void) { HANDLE_CLERROR(ret_code, "Error Releasing pinned_partial_hashes"); } +/* ------- Salt functions ------- */ +static void *get_salt(char *ciphertext) { + int end = 0, i, len = strlen(ciphertext); + for (i = len - 1; i >= 0; i--) + if (ciphertext[i] == '$') { + end = i; + break; + } + + static unsigned char ret[50]; + for (i = 0; i < end; i++) + ret[i] = ciphertext[i]; + ret[end] = 0; + return (void *) ret; +} + +static void set_salt(void *salt_info) { + int len = strlen(salt_info); + unsigned char offset = 0; + static char currentsalt[64]; + + memcpy(currentsalt, (char *) salt_info, len + 1); + salt.rounds = ROUNDS_DEFAULT; + + if (strncmp((char *) "$6$", (char *) currentsalt, 3) == 0) + offset += 3; + + if (strncmp((char *) currentsalt + offset, (char *) "rounds=", 7) == 0) { + const char *num = currentsalt + offset + 7; + char *endp; + unsigned long int srounds = strtoul(num, &endp, 10); + + if (*endp == '$') { + endp += 1; + salt.rounds = + MAX(ROUNDS_MIN, MIN(srounds, ROUNDS_MAX)); + } + offset = endp - currentsalt; + } + memcpy(salt.salt, currentsalt + offset, SALT_SIZE); + salt.length = strlen((char *) salt.salt); + salt.length = (salt.length > SALT_SIZE ? SALT_SIZE : salt.length); +} + /* ------- Key functions ------- */ static void set_key(char *key, int index) { int len = strlen(key); @@ -192,7 +226,7 @@ static char *get_key(int index) { -- */ static void find_best_workgroup(void) { cl_event myEvent; - cl_ulong startTime, endTime, kernelExecTimeNs = CL_ULONG_MAX; + cl_ulong startTime, endTime, min_time = CL_ULONG_MAX; size_t my_work_group = 1; cl_int ret_code; int i; @@ -204,41 +238,54 @@ static void find_best_workgroup(void) { HANDLE_CLERROR(ret_code, "Failed in clCreateCommandQueue"); printf("Max Group Work Size %d ", (int) max_group_size); local_work_size = 1; - + max_group_size = get_task_max_work_group_size(); + + // Set salt. + set_salt("$6$saltstring$"); + // Set keys - for (i = 0; i < get_task_max_size(); i++) { + for (i = 0; i < max_keys_per_crypt; i++) { set_key("aaabaabaaa", i); } - HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_TRUE, 0, + HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, salt_buffer, CL_TRUE, 0, sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL), "Failed in clEnqueueWriteBuffer I"); HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_TRUE, 0, - sizeof (crypt_sha512_password) * get_task_max_size(), + sizeof (crypt_sha512_password) * max_keys_per_crypt, plaintext, 0, NULL, NULL), "Failed in clEnqueueWriteBuffer II"); - + + if (cpu(get_device_info())) + my_work_group = 1; + + else + my_work_group = 16; + // Find minimum time - for (my_work_group = 1; (int) my_work_group <= (int) get_task_max_work_group_size(); + for (; (int) my_work_group <= (int) max_group_size; my_work_group *= 2) { + advance_cursor(); ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &max_keys_per_crypt, &my_work_group, 0, NULL, &myEvent); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); if (ret_code != CL_SUCCESS) { - printf("Error %d\n", ret_code); ///Better commented by default. - break; + + if (ret_code != CL_INVALID_WORK_GROUP_SIZE) + printf("Error %d\n", ret_code); + continue; } //Get profile information - HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, + HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); - - if ((endTime - startTime) < kernelExecTimeNs) { - kernelExecTimeNs = endTime - startTime; + + if ((endTime - startTime) * 1.01 < min_time) { + min_time = endTime - startTime; local_work_size = my_work_group; } } @@ -257,19 +304,28 @@ static void find_best_workgroup(void) { static void find_best_kpc(void) { size_t num; cl_event myEvent; - cl_ulong startTime, endTime, tmpTime; - cl_ulong kernelExecTimeNs = CL_ULONG_MAX; + cl_ulong startTime, endTime, run_time, min_time = CL_ULONG_MAX; cl_int ret_code; - int optimal_kpc = MIN_KEYS_PER_CRYPT; - int i; cl_uint *tmpbuffer; + int optimal_kpc = MIN_KEYS_PER_CRYPT, i, step = STEP; + int do_benchmark = 0; + unsigned int SHAspeed, bestSHAspeed = 0; + char *tmp_value; printf("Calculating best keys per crypt, this will take a while "); + + if ((tmp_value = getenv("STEP"))){ + step = atoi(tmp_value); + do_benchmark = 1; + } - for (num = get_task_max_size(); (int) num > MIN_KEYS_PER_CRYPT; num -= 4096) { + for (num = step; num < MAX_KEYS_PER_CRYPT; num += step) { release_clobj(); create_clobj(num); - advance_cursor(); + + if (! do_benchmark) + advance_cursor(); + tmpbuffer = malloc(sizeof (crypt_sha512_hash) * num); if (tmpbuffer == NULL) { @@ -280,12 +336,15 @@ static void find_best_kpc(void) { queue_prof = clCreateCommandQueue(context[gpu_id], devices[gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code); HANDLE_CLERROR(ret_code, "Failed in clCreateCommandQueue"); - + + // Set salt. + set_salt("$6$saltstring$"); + // Set keys for (i = 0; i < num; i++) { set_key("aaabaabaaa", i); } - HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_FALSE, 0, + HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, salt_buffer, CL_FALSE, 0, sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL), "Failed in clEnqueueWriteBuffer I"); HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_FALSE, 0, @@ -309,19 +368,41 @@ static void find_best_kpc(void) { sizeof (cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); + free(tmpbuffer); + HANDLE_CLERROR(clReleaseCommandQueue(queue_prof), + "Failed in clReleaseCommandQueue"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); - tmpTime = endTime - startTime; - if (((int) (((float) (tmpTime) / num) * 10)) <= kernelExecTimeNs) { - kernelExecTimeNs = ((int) (((float) (tmpTime) / num) * 10)); + run_time = endTime - startTime; + SHAspeed = 5000 * num / (run_time / 1000000000.); + + if (run_time < min_time) + min_time = run_time; + + if (do_benchmark) { + fprintf(stderr, "kpc: %6zu\t%4lu c/s%14u rounds/s%8.3f sec per crypt_all()", + num, (long) (num / (run_time / 1000000000.)), SHAspeed, + (float) run_time / 1000000000.); + + if (run_time > 10000000000) { + fprintf(stderr, " - too slow\n"); + break; + } + } else { + if (run_time > min_time * 10) + break; + } + if (SHAspeed > (1.01 * bestSHAspeed)) { + if (do_benchmark) + fprintf(stderr, "+"); + bestSHAspeed = SHAspeed; optimal_kpc = num; } - free(tmpbuffer); - HANDLE_CLERROR(clReleaseCommandQueue(queue_prof), - "Failed in clReleaseCommandQueue"); + if (do_benchmark) + fprintf(stderr, "\n"); } printf("Optimal keys per crypt %d\n", optimal_kpc); - printf("to avoid this test on next run, put \"" + printf("(to avoid this test on next run, put \"" KPC_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS SUBSECTION_OPENCL "])\n", optimal_kpc); max_keys_per_crypt = optimal_kpc; @@ -332,10 +413,17 @@ static void find_best_kpc(void) { /* ------- Initialization ------- */ static void init(struct fmt_main *pFmt) { char *tmp_value; - opencl_init("$JOHN/cryptsha512_kernel.cl", gpu_id, platform_id); + opencl_init_dev(gpu_id, platform_id); + + if (cpu(get_device_info())) + opencl_build_kernel("$JOHN/cryptsha512_CPU_kernel.cl", gpu_id); + + else + opencl_build_kernel("$JOHN/cryptsha512_kernel.cl", gpu_id); + max_keys_per_crypt = get_task_max_size(); - local_work_size = 0; - + local_work_size = 32; //Default safe value. + // create kernel to execute crypt_kernel = clCreateKernel(program[gpu_id], "kernel_crypt", &ret_code); HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?"); @@ -379,76 +467,32 @@ static void init(struct fmt_main *pFmt) { } printf("Local work size (LWS) %d, Keys per crypt (KPC) %Zd\n", (int) local_work_size, max_keys_per_crypt); - pFmt->params.max_keys_per_crypt = max_keys_per_crypt; + pFmt->params.max_keys_per_crypt = max_keys_per_crypt; } /* ------- Check if the ciphertext if a valid SHA-512 crypt ------- */ static int valid(char *ciphertext, struct fmt_main *pFmt) { - uint32_t i, j; - int len = strlen(ciphertext); - - if (strncmp(ciphertext, "$6$", 3) != 0) - return 0; - char *p = strrchr(ciphertext, '$'); - if (p == NULL) - return 0; - for (i = p - ciphertext + 1; i < len; i++) { - int found = 0; - for (j = 0; j < 64; j++) - if (itoa64[j] == ARCH_INDEX(ciphertext[i])) - found = 1; - if (found == 0) { - puts("not found"); - return 0; - } - } - if (len - (p - ciphertext + 1) != 86) - return 0; - return 1; -} - -/* ------- Salt functions ------- */ -static void *get_salt(char *ciphertext) { - int end = 0, i, len = strlen(ciphertext); - for (i = len - 1; i >= 0; i--) - if (ciphertext[i] == '$') { - end = i; - break; - } - - static unsigned char ret[50]; - for (i = 0; i < end; i++) - ret[i] = ciphertext[i]; - ret[end] = 0; - return (void *) ret; -} + uint32_t i, j; + int len = strlen(ciphertext); -static void set_salt(void *salt_info) { - int len = strlen(salt_info); - unsigned char offset = 0; - static char currentsalt[64]; - - memcpy(currentsalt, (char *) salt_info, len + 1); - salt.rounds = ROUNDS_DEFAULT; - - if (strncmp((char *) "$6$", (char *) currentsalt, 3) == 0) - offset += 3; - - if (strncmp((char *) currentsalt + offset, (char *) "rounds=", 7) == 0) { - const char *num = currentsalt + offset + 7; - char *endp; - unsigned long int srounds = strtoul(num, &endp, 10); - - if (*endp == '$') { - endp += 1; - salt.rounds = - MAX(ROUNDS_MIN, MIN(srounds, ROUNDS_MAX)); - } - offset = endp - currentsalt; + if (strncmp(ciphertext, "$6$", 3) != 0) + return 0; + char *p = strrchr(ciphertext, '$'); + if (p == NULL) + return 0; + for (i = p - ciphertext + 1; i < len; i++) { + int found = 0; + for (j = 0; j < 64; j++) + if (itoa64[j] == ARCH_INDEX(ciphertext[i])) + found = 1; + if (found == 0) { + puts("not found"); + return 0; + } } - memcpy(salt.salt, currentsalt + offset, SALT_SIZE); - salt.length = strlen((char *) salt.salt); - salt.length = (salt.length > SALT_SIZE ? SALT_SIZE : salt.length); + if (len - (p - ciphertext + 1) != 86) + return 0; + return 1; } /* ------- To binary functions ------- */ @@ -459,19 +503,19 @@ static int findb64(char c) { static void magic(char *crypt, unsigned char *alt) { #define _24bit_from_b64(I,B2,B1,B0) \ - {\ - unsigned char c1=findb64(crypt[I+0]);\ - unsigned char c2=findb64(crypt[I+1]);\ - unsigned char c3=findb64(crypt[I+2]);\ - unsigned char c4=findb64(crypt[I+3]);\ - unsigned int w=c4<<18|c3<<12|c2<<6|c1;\ - unsigned char b2=w&0xff;w>>=8;\ - unsigned char b1=w&0xff;w>>=8;\ - unsigned char b0=w&0xff;w>>=8;\ - alt[B2]=b0;\ - alt[B1]=b1;\ - alt[B0]=b2;\ - } + {\ + unsigned char c1=findb64(crypt[I+0]);\ + unsigned char c2=findb64(crypt[I+1]);\ + unsigned char c3=findb64(crypt[I+2]);\ + unsigned char c4=findb64(crypt[I+3]);\ + unsigned int w=c4<<18|c3<<12|c2<<6|c1;\ + unsigned char b2=w&0xff;w>>=8;\ + unsigned char b1=w&0xff;w>>=8;\ + unsigned char b0=w&0xff;w>>=8;\ + alt[B2]=b0;\ + alt[B1]=b1;\ + alt[B0]=b2;\ + } _24bit_from_b64(0, 0, 21, 42); _24bit_from_b64(4, 22, 43, 1); _24bit_from_b64(8, 44, 2, 23); @@ -557,7 +601,7 @@ static void crypt_all(int count) { //Do the work HANDLE_CLERROR(clFinish(queue[gpu_id]), "failed in clFinish"); - new_keys = 0; + new_keys = 0; } /* ------- Binary Hash functions group ------- */