Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date: Wed, 01 Feb 2012 15:43:09 +0100
From: Samuele Giovanni Tonon <samu@...uxasylum.net>
To: john-dev@...ts.openwall.com
Subject: strange behaviour of c/s on raw-md5 and raw-md5-opencl

hello,
i report a strange behaviour on raw-md5 CPU/GPU version.
i've been working on making raw-md5 a bit faster and i succeded
a bit.

../run/john -test -fo=raw-md5-opencl
OpenCL Platforms: 1
OpenCL Platform: <<<NVIDIA CUDA>>> 1 device(s), using device: <<<GeForce
GT 525M>>>
Max Group Work Size 1024 Optimal Group work Size = 512
LWS 512, KpC 2097152
Benchmarking: Raw MD5 [raw-md5-opencl]... DONE
Raw:    49344K c/s real, 24916K c/s virtual

../run/john -test -fo=raw-md5
Benchmarking: Raw MD5 [SSE2i 10x4x3]... DONE
Raw:    24302K c/s real, 24302K c/s virtual

however i'm noticing a strange behaviour on the C/S report:

../run/john -i:digits -fo=raw-md5-opencl /tmp/pppp
OpenCL Platforms: 1
OpenCL Platform: <<<NVIDIA CUDA>>> 1 device(s), using device: <<<GeForce
GT 525M>>>
Max Group Work Size 1024 Optimal Group work Size = 512
LWS 512, KpC 2097152
Loaded 10000 password hashes with no different salts (Raw MD5
[raw-md5-opencl])
guesses: 0  time: 0:00:00:05 DONE (Wed Feb  1 15:07:55 2012)  c/s:
40115M  trying: 82517659 - 83536784

../run/john -i:digits -fo=raw-md5 /tmp/pppp
Loaded 10000 password hashes with no different salts (Raw MD5 [SSE2i
10x4x3])
guesses: 0  time: 0:00:00:06 DONE (Wed Feb  1 15:08:07 2012)  c/s:
185185M  trying: 83536056 - 83536784

this can be seen also using an incremental alpha7 defined as follow :
[Incremental:Alpha7]
File = $JOHN/alpha.chr
MinLen = 1
MaxLen = 7
CharCount = 26



../run/john -i:alpha7 -fo=raw-md5 /tmp/pppp
Loaded 10000 password hashes with no different salts (Raw MD5 [SSE2i
10x4x3])
guesses: 0  time: 0:00:07:13 DONE (Wed Feb  1 15:15:58 2012)  c/s:
192911M  trying: qqjxwsq - qqjxjxq


../run/john -i:alpha7 -fo=raw-md5-opencl /tmp/pppp
OpenCL Platforms: 1
OpenCL Platform: <<<NVIDIA CUDA>>> 1 device(s), using device: <<<GeForce
GT 525M>>>
Max Group Work Size 1024 Optimal Group work Size = 512
LWS 512, KpC 2097152
Loaded 10000 password hashes with no different salts (Raw MD5
[raw-md5-opencl])
guesses: 0  time: 0:00:05:12 DONE (Wed Feb  1 15:21:51 2012)  c/s:
48408M  trying: qqhjueq - qqjxjxq

same password file, same incremental mode on CPU it last for 7:13 at
at 192911M c/s , on GPU it last for 5:12 at 48408 M c/s ? should'nt
the c/s be higher ?

this has been tested on last github and on john-1.7.9-jumbo5
i'm including source code so anyone can test and confirm it or not.

Cheers
Samuele


/*
 * This file is part of John the Ripper password cracker,
 * Copyright (c) 2010 by Solar Designer
 *
 * MD5 OpenCL code is based on Alain Espinosa's OpenCL patches.
 * 
 */

#include <string.h>

#include "arch.h"
#include "params.h"
#include "path.h"
#include "common.h"
#include "formats.h"
#include "common-opencl.h"

#define MD5
#include "opencl-tweaks.h"

#define FORMAT_LABEL        "raw-md5-opencl"
#define FORMAT_NAME         "Raw MD5"
#define ALGORITHM_NAME      "raw-md5-opencl"
#define BENCHMARK_COMMENT   ""
#define BENCHMARK_LENGTH    -1
#define CIPHERTEXT_LENGTH   32
#define BINARY_SIZE         16
#define SALT_SIZE           0

cl_command_queue queue_prof;
cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys;
static cl_uint *partial_hashes;
static cl_uint *res_hashes;
static char *saved_plain;
static char get_key_saved[PLAINTEXT_LENGTH + 1];

#define MIN_KEYS_PER_CRYPT      MD5_NUM_KEYS
#define MAX_KEYS_PER_CRYPT      MD5_NUM_KEYS
static size_t global_work_size = MD5_NUM_KEYS;
static unsigned int datai[2];
static int have_full_hashes;

static int max_keys_per_crypt = MD5_NUM_KEYS;

static struct fmt_tests tests[] = {
	{"098f6bcd4621d373cade4e832627b4f6", "test"},
	{"d41d8cd98f00b204e9800998ecf8427e", ""},
	{NULL}
};

static void find_best_workgroup(void){
	cl_event myEvent;
	cl_ulong startTime, endTime, kernelExecTimeNs = CL_ULONG_MAX;
	size_t my_work_group = 1;
	cl_int ret_code;
	int i = 0;
	size_t max_group_size;

	clGetDeviceInfo(devices[gpu_id], CL_DEVICE_MAX_WORK_GROUP_SIZE,
	    sizeof(max_group_size), &max_group_size, NULL);
	queue_prof =
	    clCreateCommandQueue(context[gpu_id], devices[gpu_id],
	    CL_QUEUE_PROFILING_ENABLE, &ret_code);
	printf("Max Group Work Size %d ", (int) max_group_size);
	local_work_size = 1;

	// Set keys
	for (; i < MD5_NUM_KEYS; i++) {
		memcpy(&(saved_plain[i * (PLAINTEXT_LENGTH + 1)]), "aaaaaaaa",
		    PLAINTEXT_LENGTH + 1);
		saved_plain[i * (PLAINTEXT_LENGTH + 1) + 8] = 0x80;
	}
	clEnqueueWriteBuffer(queue_prof, buffer_keys, CL_TRUE, 0,
	    (PLAINTEXT_LENGTH + 1) * MD5_NUM_KEYS, saved_plain, 0, NULL, NULL);

	// Find minimum time
	for (my_work_group = 1; (int) my_work_group <= (int) max_group_size; my_work_group *= 2) {
		ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL,
		    &global_work_size, &my_work_group, 0, NULL, &myEvent);
		if (ret_code != CL_SUCCESS) {
			printf("Errore %d\n", ret_code);
			continue;
		}
		clFinish(queue_prof);

		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
		    sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
		    sizeof(cl_ulong), &endTime, NULL);

		if ((endTime - startTime) < kernelExecTimeNs) {
			kernelExecTimeNs = endTime - startTime;
			local_work_size = my_work_group;
		}
	}
	printf("Optimal Group work Size = %d\n", (int) local_work_size);
	clReleaseCommandQueue(queue_prof);
}

static void create_clobj(int kpc){

	pinned_saved_keys = clCreateBuffer(context[gpu_id], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
	    (PLAINTEXT_LENGTH + 1) * kpc, NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating page-locked memory pinned_saved_keys");

	saved_plain = (char *) clEnqueueMapBuffer(queue[gpu_id], pinned_saved_keys,
	    CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0,
	    (PLAINTEXT_LENGTH + 1) * kpc, 0, NULL, NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Error mapping page-locked memory saved_plain");

    	res_hashes = malloc(sizeof(cl_uint) * 3 * kpc);
	
	pinned_partial_hashes = clCreateBuffer(context[gpu_id],
	    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 4 * kpc, NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating page-locked memory pinned_partial_hashes");

	partial_hashes = (cl_uint *) clEnqueueMapBuffer(queue[gpu_id],
	    pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, 4 * kpc, 0, NULL, NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Error mapping page-locked memory partial_hashes");

	// create and set arguments
	buffer_keys = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY,
	    (PLAINTEXT_LENGTH + 1) * kpc, NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_keys");

	buffer_out = clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY,
	    BINARY_SIZE * kpc, NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_out");

	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 0, sizeof(buffer_keys),
		(void *) &buffer_keys), "Error setting argument 1");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof(buffer_out),
		(void *) &buffer_out), "Error setting argument 2");

	datai[0] = PLAINTEXT_LENGTH;
	datai[1] = kpc;
	global_work_size = kpc;
}

static void release_clobj(void){
	cl_int ret_code;

	ret_code = clEnqueueUnmapMemObject(queue[gpu_id], pinned_partial_hashes, partial_hashes, 0,NULL,NULL);
	HANDLE_CLERROR(ret_code, "Error Ummapping partial_hashes");
	ret_code = clEnqueueUnmapMemObject(queue[gpu_id], pinned_saved_keys, saved_plain, 0, NULL, NULL);
	HANDLE_CLERROR(ret_code, "Error Ummapping saved_plain");
	ret_code = clReleaseMemObject(buffer_keys);
	HANDLE_CLERROR(ret_code, "Error Releasing buffer_keys");
	ret_code = clReleaseMemObject(buffer_out);
	HANDLE_CLERROR(ret_code, "Error Releasing buffer_out");
	//ret_code = clReleaseMemObject(data_info);
	//HANDLE_CLERROR(ret_code, "Error Releasing data_info");
	ret_code = clReleaseMemObject(pinned_saved_keys);
	HANDLE_CLERROR(ret_code, "Error Releasing pinned_saved_keys");
	ret_code = clReleaseMemObject(pinned_partial_hashes);
	HANDLE_CLERROR(ret_code, "Error Releasing pinned_partial_hashes");
	free(res_hashes);
	//free(saved_key);
}

/* 
   laming spinning cursor 
   added because it's cute
*/
static void advance_cursor() {
	static int pos=0;
	char cursor[4]={'/','-','\\','|'};
	printf("%c\b", cursor[pos]);
	fflush(stdout);
	pos = (pos+1) % 4;
}

static void find_best_kpc(void){
	int num;
	cl_event myEvent;
	cl_ulong startTime, endTime, tmpTime;
	int kernelExecTimeNs = 6969;
	cl_int ret_code;
	int optimal_kpc=2048;
	int i = 0;
	cl_uint *tmpbuffer;

	printf("Calculating best keys per crypt, this will take a while ");
	for( num=MD5_NUM_KEYS; num > 4096 ; num -= 2048){
		release_clobj();
		printf("clobj %d\n",num);
		create_clobj(num);
		advance_cursor();
		queue_prof = clCreateCommandQueue( context[gpu_id], devices[gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
		for (i=0; i < num; i++){
			memcpy(&(saved_plain[i * (PLAINTEXT_LENGTH + 1)]), "abcaaeaf", PLAINTEXT_LENGTH + 1);
			saved_plain[i * (PLAINTEXT_LENGTH + 1) + 8] = 0x80;
		}
        //clEnqueueWriteBuffer(queue_prof, data_info, CL_TRUE, 0, sizeof(unsigned int)*2, datai, 0, NULL, NULL);
		clEnqueueWriteBuffer(queue_prof, buffer_keys, CL_TRUE, 0, (PLAINTEXT_LENGTH + 1) * num, saved_plain, 0, NULL, NULL);
    	ret_code = clEnqueueNDRangeKernel( queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &myEvent);
		if(ret_code != CL_SUCCESS){
			printf("Error %d\n",ret_code);
			continue;
		}
		clFinish(queue_prof);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END  , sizeof(cl_ulong), &endTime  , NULL);
		tmpTime = endTime-startTime;
		tmpbuffer = malloc(sizeof(cl_uint) * 5 * num);
		clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * 4 * num, tmpbuffer, 0, NULL, &myEvent);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END  , sizeof(cl_ulong), &endTime  , NULL);
		tmpTime = tmpTime + (endTime-startTime);
		if( ((int)( ((float) (tmpTime) / num) * 10 )) <= kernelExecTimeNs) {
			kernelExecTimeNs = ((int) (((float) (tmpTime) / num) * 10) ) ;
			optimal_kpc = num;
		}
		free(tmpbuffer);
		clReleaseCommandQueue(queue_prof);
	}
	printf("Optimal keys per crypt %d\n(to avoid this test on next run do export KPC=%d)\n",optimal_kpc,optimal_kpc);
	max_keys_per_crypt = optimal_kpc;
	release_clobj();
	create_clobj(optimal_kpc);
}

static void fmt_MD5_init(struct fmt_main *pFmt) {
	char *kpc;

	opencl_init("$JOHN/md5_opencl_kernel.cl", gpu_id);
	crypt_kernel = clCreateKernel(program[gpu_id], "md5", &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
	if( (kpc = getenv("LWS")) == NULL){
		create_clobj(MD5_NUM_KEYS);
		find_best_workgroup();
		release_clobj();
	}else {
		local_work_size = atoi(kpc);
	}
	if( (kpc = getenv("KPC")) == NULL){
		create_clobj(MD5_NUM_KEYS);
		max_keys_per_crypt = MD5_NUM_KEYS;
		//don't use it yet, we go with MD5_NUM_KEYS default
		//find_best_kpc();
	} else {
		create_clobj(max_keys_per_crypt);
		max_keys_per_crypt = atoi(kpc);
	}
	printf("LWS %d, KpC %d\n",(int)local_work_size,max_keys_per_crypt);
	pFmt->params.max_keys_per_crypt = max_keys_per_crypt;
}

static int valid(char *ciphertext, struct fmt_main *pFmt) {
	char *p, *q;
	p = ciphertext;
	if (!strncmp(p, "$MD5$", 5))
		p += 5;
	q = p;
	while (atoi16[ARCH_INDEX(*q)] != 0x7F)
		q++;
	return !*q && q - p == CIPHERTEXT_LENGTH;
}

static char *split(char *ciphertext, int index) {
	static char out[5 + CIPHERTEXT_LENGTH + 1];

	if (!strncmp(ciphertext, "$MD5$", 5))
		return ciphertext;

	memcpy(out, "$MD5$", 5);
	memcpy(out + 5, ciphertext, CIPHERTEXT_LENGTH + 1);
	return out;
}

static void *get_binary(char *ciphertext) {
	static unsigned char out[BINARY_SIZE];
	char *p;
	int i;
	p = ciphertext + 5;
	for (i = 0; i < sizeof(out); i++) {
		out[i] = (atoi16[ARCH_INDEX(*p)] << 4) | atoi16[ARCH_INDEX(p[1])];
		p += 2;
	}
	return out;
}
static int binary_hash_0(void *binary) { return *(ARCH_WORD_32 *) binary & 0xF; }
static int binary_hash_1(void *binary) { return *(ARCH_WORD_32 *) binary & 0xFF; }
static int binary_hash_2(void *binary) { return *(ARCH_WORD_32 *) binary & 0xFFF; }
static int binary_hash_3(void *binary) { return *(ARCH_WORD_32 *) binary & 0xFFFF; }
static int binary_hash_4(void *binary) { return *(ARCH_WORD_32 *) binary & 0xFFFFF; }
static int binary_hash_5(void *binary) { return *(ARCH_WORD_32 *) binary & 0xFFFFFF; }
static int binary_hash_6(void *binary) { return *(ARCH_WORD_32 *) binary & 0x7FFFFFF; }

static int get_hash_0(int index) { return partial_hashes[index] & 0x0F; }
static int get_hash_1(int index) { return partial_hashes[index] & 0xFF; }
static int get_hash_2(int index) { return partial_hashes[index] & 0xFFF; }
static int get_hash_3(int index) { return partial_hashes[index] & 0xFFFF; }
static int get_hash_4(int index) { return partial_hashes[index] & 0xFFFFF; }
static int get_hash_5(int index) { return partial_hashes[index] & 0xFFFFFF; }
static int get_hash_6(int index) { return partial_hashes[index] & 0x7FFFFFF; }

static void set_salt(void *salt) { }

static void set_key(char *key, int index) {
	int length = -1;
	int base = index * (PLAINTEXT_LENGTH + 1);
	do {
		length++;
		saved_plain[base + length] = key[length];
	}
	while (key[length]);
	memset(&saved_plain[base + length + 1], 0, 7);	// ugly hack which "should" work!
}

static char *get_key(int index) {
	int length = -1;
	int base = index * (PLAINTEXT_LENGTH + 1);
	do {
		length++;
		get_key_saved[length] = saved_plain[base + length];
	}
	while (get_key_saved[length]);
	get_key_saved[length] = 0;
	return get_key_saved;
}

static void crypt_all(int count)
{
#ifdef DEBUGVERBOSE
	int i, j;
	unsigned char *p = (unsigned char *) saved_plain;
	count--;
	for (i = 0; i < count + 1; i++) {
		printf("\npassword : ");
		for (j = 0; j < 64; j++) {
			printf("%02x ", p[i * 64 + j]);
		}
	}
	printf("\n");
#endif
	// copy keys to the device
	HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_keys, CL_TRUE, 0,
	    (PLAINTEXT_LENGTH + 1) * max_keys_per_crypt, saved_plain, 0, NULL, NULL),
	    "failed in clEnqueueWriteBuffer saved_buffer");

	HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL,
	    &global_work_size, &local_work_size, 0, NULL, NULL),
	    "failed in clEnqueueNDRangeKernel");
	HANDLE_CLERROR(clFinish(queue[gpu_id]),"failed in clFinish");
	// read back partial hashes
	HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_out, CL_TRUE, 0,
	    sizeof(cl_uint) * max_keys_per_crypt, partial_hashes, 0, NULL, NULL),
	    "failed in reading data back");
	have_full_hashes = 0;

#ifdef DEBUGVERBOSE
	p = (unsigned char *) partial_hashes;
	for (i = 0; i < 2; i++) {
		printf("\n\npartial_hashes : ");
		for (j = 0; j < 16; j++)
			printf("%02x ", p[i * 16 + j]);
	}
	printf("\n");;
#endif
}

static int cmp_one(void *binary, int index){
	unsigned int *t = (unsigned int *) binary;

	if (t[0] == partial_hashes[index])
		return 1;
	return 0;
}

static int cmp_all(void *binary, int count) {
	unsigned int i = 0;
	unsigned int b = ((unsigned int *) binary)[0];
	for (; i < count; i++)
		if (b == partial_hashes[i])
			return 1;
	return 0;
}

static int cmp_exact(char *source, int count){
	unsigned int *t = (unsigned int *) get_binary(source);

	if (!have_full_hashes){
	clEnqueueReadBuffer(queue[gpu_id], buffer_out, CL_TRUE, 
		sizeof(cl_uint) * (max_keys_per_crypt), 
		sizeof(cl_uint) * 3 * max_keys_per_crypt, res_hashes, 0,
		NULL, NULL); 
		have_full_hashes = 1;
	}
        
	if (t[1]!=res_hashes[count])
		return 0;
	if (t[2]!=res_hashes[1*max_keys_per_crypt+count])
		return 0;
	if (t[3]!=res_hashes[2*max_keys_per_crypt+count])
		return 0;
	return 1;
}

struct fmt_main fmt_opencl_rawMD5 = {
	{
		FORMAT_LABEL,
		FORMAT_NAME,
		ALGORITHM_NAME,
		BENCHMARK_COMMENT,
		BENCHMARK_LENGTH,
		PLAINTEXT_LENGTH,
		BINARY_SIZE,
		SALT_SIZE,
		MIN_KEYS_PER_CRYPT,
		MAX_KEYS_PER_CRYPT,
		FMT_CASE | FMT_8_BIT,
	tests}, {
		fmt_MD5_init,
		fmt_default_prepare,
		valid,
		split,
		get_binary,
		fmt_default_salt,
		{
		binary_hash_0,
		binary_hash_1,
		binary_hash_2,
		binary_hash_3,
		binary_hash_4,
		binary_hash_5,
		binary_hash_6},
		fmt_default_salt_hash,
		set_salt,
		set_key,
		get_key,
		fmt_default_clear_keys,
		crypt_all,
		{
		get_hash_0,
		get_hash_1,
		get_hash_2,
		get_hash_3,
		get_hash_4,
		get_hash_5,
		get_hash_6},
		cmp_all,
		cmp_one,
	cmp_exact}
};

Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ