diff --git a/src/common-opencl.c b/src/common-opencl.c index 8da586f..7f0440c 100644 --- a/src/common-opencl.c +++ b/src/common-opencl.c @@ -1069,6 +1069,7 @@ void opencl_build_from_binary(int sequential_id) fprintf(stderr, "Binary Build log: %s\n", opencl_log); } +#if 0 /* * NOTE: Requirements for using this function: * @@ -1278,6 +1279,7 @@ void opencl_find_best_workgroup_limit(struct fmt_main *self, profilingEvent = firstEvent = lastEvent = NULL; dyna_salt_remove(salt); } +#endif // Do the proper test using different global work sizes. static void clear_profiling_events() @@ -1480,12 +1482,9 @@ void opencl_find_best_lws(size_t group_size_limit, int sequential_id, benchEvent[i] = NULL; if (options.verbosity > 3) - fprintf(stderr, "Max local worksize "Zu", ", group_size_limit); + fprintf(stderr, "Calculating best local worksize (LWS)\n"); - /* Formats supporting vectorizing should have a default max keys per - crypt that is a multiple of 2 and of 3 */ - gws = global_work_size ? global_work_size : - self->params.max_keys_per_crypt / opencl_v_width; + gws = global_work_size; if (get_device_version(sequential_id) < 110) { if (get_device_type(sequential_id) == CL_DEVICE_TYPE_GPU) @@ -1584,8 +1583,13 @@ void opencl_find_best_lws(size_t group_size_limit, int sequential_id, (int)my_work_group <= (int)max_group_size; my_work_group += wg_multiple) { + global_work_size = gws; if (gws % my_work_group != 0) - continue; + global_work_size = GET_EXACT_MULTIPLE(gws, my_work_group); + + if (options.verbosity > 3) + fprintf(stderr, "Testing GWS=" Zu " LWS=" Zu " ...", + global_work_size, my_work_group); sumStartTime = 0; sumEndTime = 0; @@ -1603,7 +1607,7 @@ void opencl_find_best_lws(size_t group_size_limit, int sequential_id, startTime = endTime = 0; if (options.verbosity > 3) - fprintf(stderr, " Error occurred\n"); + fprintf(stderr, " crypt_all() error\n"); break; } @@ -1626,9 +1630,25 @@ void opencl_find_best_lws(size_t group_size_limit, int sequential_id, } if (!endTime) break; - if ((sumEndTime - sumStartTime) < kernelExecTimeNs) { + if (options.verbosity > 3) + fprintf(stderr, " " Zu "ns\n", sumEndTime - sumStartTime); + if ((double)(sumEndTime - sumStartTime) / kernelExecTimeNs < 0.997) { kernelExecTimeNs = sumEndTime - sumStartTime; optimal_work_group = my_work_group; + } else { + if (my_work_group >= 256 || + (my_work_group >= 8 && wg_multiple < 8)) { + /* Jump to next power of 2 */ + size_t x, y; + x = my_work_group; + while ((y = x & (x - 1))) + x = y; + x *= 2; + my_work_group = + GET_MULTIPLE_OR_BIGGER(x, wg_multiple); + /* The loop logic will re-add wg_multiple */ + my_work_group -= wg_multiple; + } } } // Release profiling queue and create new with profiling disabled @@ -1639,17 +1659,28 @@ void opencl_find_best_lws(size_t group_size_limit, int sequential_id, devices[sequential_id], 0, &ret_code); HANDLE_CLERROR(ret_code, "Error creating command queue"); local_work_size = optimal_work_group; + global_work_size = GET_EXACT_MULTIPLE(gws, local_work_size); dyna_salt_remove(salt); } void opencl_find_best_gws(int step, unsigned long long int max_run_time, - int sequential_id, unsigned int rounds) + int sequential_id, unsigned int rounds, int have_lws) { size_t num = 0; - size_t optimal_gws = local_work_size; + size_t optimal_gws = local_work_size, soft_limit = 0; unsigned long long speed, best_speed = 0, raw_speed; cl_ulong run_time, min_time = CL_ULONG_MAX; + unsigned long long int save_duration_time = duration_time; + cl_uint core_count = get_max_compute_units(sequential_id); + + if (have_lws) { + if (core_count > 2) + optimal_gws *= core_count; + default_value = optimal_gws; + } else { + soft_limit = local_work_size * core_count * 128; + } /* * max_run_time is either: @@ -1692,8 +1723,12 @@ void opencl_find_best_gws(int step, unsigned long long int max_run_time, // Check if hardware can handle the size we are going // to try now. - if ((gws_limit && (num > gws_limit)) || ((gws_limit == 0) && - (buffer_size * kpc * 1.1 > get_max_mem_alloc_size(gpu_id)))) { + if ((soft_limit && (num > soft_limit)) || + (gws_limit && (num > gws_limit)) || ((gws_limit == 0) && + (buffer_size * kpc * 1.1 > get_max_mem_alloc_size(gpu_id)))) { + if (!optimal_gws) + optimal_gws = num; + if (options.verbosity > 4) fprintf(stderr, "Hardware resources exhausted\n"); break; @@ -1743,6 +1778,8 @@ void opencl_find_best_gws(int step, unsigned long long int max_run_time, devices[sequential_id], 0, &ret_code); HANDLE_CLERROR(ret_code, "Error creating command queue"); global_work_size = optimal_gws; + + duration_time = save_duration_time; } static void opencl_get_dev_info(int sequential_id) diff --git a/src/common-opencl.h b/src/common-opencl.h index 66969d5..1e76b1d 100644 --- a/src/common-opencl.h +++ b/src/common-opencl.h @@ -270,7 +270,7 @@ void opencl_find_best_lws(size_t group_size_limit, int sequential_id, * For raw formats it should be 1. For sha512crypt it is 5000. */ void opencl_find_best_gws(int step, unsigned long long int max_run_time, - int sequential_id, unsigned int rounds); + int sequential_id, unsigned int rounds, int have_lws); /* * Shared function to initialize variables necessary by shared find(lws/gws) functions. diff --git a/src/opencl-autotune.h b/src/opencl-autotune.h index 6cfa22e..2f3ee95 100644 --- a/src/opencl-autotune.h +++ b/src/opencl-autotune.h @@ -49,7 +49,7 @@ size_t autotune_get_task_max_work_group_size(int use_local_memory, of keys per crypt for the given format -- */ void autotune_find_best_gws(int sequential_id, unsigned int rounds, int step, - unsigned long long int max_run_time); + unsigned long long int max_run_time, int have_lws); /* -- This function could be used to calculated the best local @@ -78,11 +78,11 @@ static void find_best_lws(struct fmt_main * self, int sequential_id) of keys per crypt for the given format -- */ static void find_best_gws(struct fmt_main * self, int sequential_id, unsigned int rounds, - unsigned long long int max_run_time) + unsigned long long int max_run_time, int have_lws) { //Call the common function. autotune_find_best_gws( - sequential_id, rounds, STEP, max_run_time + sequential_id, rounds, STEP, max_run_time, have_lws ); create_clobj(global_work_size, self); @@ -108,13 +108,16 @@ static void find_best_gws(struct fmt_main * self, int sequential_id, unsigned in static void autotune_run_extra(struct fmt_main * self, unsigned int rounds, size_t gws_limit, unsigned long long int max_run_time, cl_uint lws_is_power_of_two) { + int need_best_lws, need_best_gws; + /* Read LWS/GWS prefs from config or environment */ opencl_get_user_preferences(FORMAT_LABEL); if (!global_work_size && !getenv("GWS")) global_work_size = get_task_max_size(); - if (!local_work_size && !getenv("LWS")) + need_best_lws = !local_work_size && !getenv("LWS"); + if (need_best_lws) local_work_size = get_default_workgroup(); if (gws_limit && (global_work_size > gws_limit)) @@ -134,14 +137,27 @@ static void autotune_run_extra(struct fmt_main * self, unsigned int rounds, local_work_size = get_task_max_work_group_size(); /* Enumerate GWS using *LWS=NULL (unless it was set explicitly) */ - if (!global_work_size) - find_best_gws(self, gpu_id, rounds, max_run_time); - else + need_best_gws = !global_work_size; + if (need_best_gws) { + unsigned long long int max_run_time1; + int have_lws = !(!local_work_size || need_best_lws); + if (have_lws) { + max_run_time1 = max_run_time; + need_best_gws = 0; + } else { + max_run_time1 = (max_run_time + 1) / 2; + } + find_best_gws(self, gpu_id, rounds, max_run_time1, have_lws); + } else { create_clobj(global_work_size, self); + } - if (!local_work_size) + if (!local_work_size || need_best_lws) find_best_lws(self, gpu_id); + if (need_best_gws) + find_best_gws(self, gpu_id, rounds, max_run_time, 1); + /* Adjust to the final configuration */ release_clobj(); global_work_size = GET_EXACT_MULTIPLE(global_work_size, local_work_size); diff --git a/src/opencl_autotune.c b/src/opencl_autotune.c index f42303b..c03c21a 100644 --- a/src/opencl_autotune.c +++ b/src/opencl_autotune.c @@ -83,7 +83,7 @@ void autotune_find_best_lws(size_t group_size_limit, of keys per crypt for the given format -- */ void autotune_find_best_gws(int sequential_id, unsigned int rounds, int step, - unsigned long long int max_run_time) + unsigned long long int max_run_time, int have_lws) { char *tmp_value; @@ -93,7 +93,7 @@ void autotune_find_best_gws(int sequential_id, unsigned int rounds, int step, step = GET_MULTIPLE_OR_ZERO(step, local_work_size); //Call the default function. - opencl_find_best_gws(step, max_run_time, sequential_id, rounds); + opencl_find_best_gws(step, max_run_time, sequential_id, rounds, have_lws); } #endif