Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Sun, 12 Aug 2012 03:36:29 +0200
From: magnum <john.magnum@...hmail.com>
To: john-dev@...ts.openwall.com
Subject: Re: pwsafe-gpu

On 2012-08-11 22:53, Lukas Odzioba wrote:
> 2012/8/11 magnum <john.magnum@...hmail.com>:
>> -#define KEYS_PER_CRYPT         1024
>> +#define KEYS_PER_CRYPT         512*112
>>
>> magnum@...l:src [1.7.9-jumbo-6-fixes]$ ../run/john -t -fo:pwsafe-opencl
>> OpenCL platform 0: NVIDIA CUDA, 1 device(s).
>> Using device 0: GeForce GTX 570
>> Benchmarking: Password Safe SHA-256 [OpenCL]... DONE
>> Raw:    128862 c/s real, 128862 c/s virtual
>>
>> I just picked the number used in CUDA - I suppose it can be even better.
>>
>> magnum
> 
> I'll try to make it faster later, now we have more important formats
> that needs tweaking.

Sure.

> Faster OpenCL code is nothing new for me (cl compiler does better job
> here and dummy code is near always faster on OpenCL), after proper
> optimizations they should have similar speed.
> 40% - you meant memset or w[14]=0 ?

The 40% (actually it was >41%) was after applying Solar's patch to
pwsafe-cuda, and then making the opencl code (both kernel and fmt) very
close to the cuda one (running "meld opencl_pwsafe_fmt.c
cuda_pwsafe_fmt.c" and "meld opencl/pwsafe_kernel.cl cuda/pwsafe.cu").
So it was more than just Solar's changes.

magnum

diff --git a/src/opencl/pwsafe_kernel.cl b/src/opencl/pwsafe_kernel.cl
index 403f591..47714ff 100644
--- a/src/opencl/pwsafe_kernel.cl
+++ b/src/opencl/pwsafe_kernel.cl
@@ -78,16 +78,16 @@ __kernel void pwsafe(__global pwsafe_pass * in,
 
 
         uint32_t w[64];
-        for (i = 0; i < 14; i++)
+        for (i = 0; i <= 14; i++)
                 w[i] = 0;
         for (j = 0; j < pl; j++) {
-                uint32_t tmp = 0;
-                tmp |= (((uint32_t) in[idx].v[j]) << ((3 - (j & 0x3)) << 3));
+                uint32_t tmp;
+                tmp = (((uint32_t) in[idx].v[j]) << ((3 - (j & 0x3)) << 3));
                 w[j / 4] |= tmp;
         }
         for (; j < 32 + pl; j++) {
-                uint32_t tmp = 0;
-                tmp |=
+                uint32_t tmp;
+                tmp =
                     (((uint32_t) salt->salt[j - pl]) << ((3 -
                             (j & 0x3)) << 3));
                 w[j / 4] |= tmp;
@@ -122,7 +122,9 @@ __kernel void pwsafe(__global pwsafe_pass * in,
                 b = a;
                 a = t1 + t2;
         }
-
+        w[9] = w[10] = w[11] = w[12] = w[13] = w[14] = 0;
+        w[8] = 0x80000000;
+        w[15] = 0x00000100;
         for (i = 0; i <= salt->iterations; i++) {
                 w[0] = a + H[0];
                 w[1] = b + H[1];
@@ -132,14 +134,6 @@ __kernel void pwsafe(__global pwsafe_pass * in,
                 w[5] = f + H[5];
                 w[6] = g + H[6];
                 w[7] = h + H[7];
-                w[9] = w[10] = w[11] = w[12] = w[13] = w[14] = 0;
-                w[8] = 0x80000000;
-                w[15] = 0x00000100;
-                for (j = 16; j < 64; j++)
-                        w[j] =
-                            sigma1(w[j - 2]) + w[j - 7] + sigma0(w[j - 15]) +
-                            w[j - 16];
-
                 a = H[0];
                 b = H[1];
                 c = H[2];
@@ -148,6 +142,12 @@ __kernel void pwsafe(__global pwsafe_pass * in,
                 f = H[5];
                 g = H[6];
                 h = H[7];
+#pragma unroll 48
+                for (j = 16; j < 64; j++)
+                        w[j] =
+                            sigma1(w[j - 2]) + w[j - 7] + sigma0(w[j - 15]) +
+                            w[j - 16];
+
 #pragma unroll 64
                 for (uint32_t j = 0; j < 64; j++) {
                         uint32_t t1 =
@@ -163,18 +163,20 @@ __kernel void pwsafe(__global pwsafe_pass * in,
                         a = t1 + t2;
                 }
         }
-        uint32_t cmp = 1;
-    
-        __global uint32_t *v =  salt->hash;
-        cmp &= (*v++ == a + H[0]);
-        cmp &= (*v++ == b + H[1]);
-        cmp &= (*v++ == c + H[2]);
-        cmp &= (*v++ == d + H[3]);
-        cmp &= (*v++ == e + H[4]);
-        cmp &= (*v++ == f + H[5]);
-        cmp &= (*v++ == g + H[6]);
-        cmp &= (*v++ == h + H[7]);
 
+        uint32_t cmp = 0;
+        __global uint32_t *v = (__global uint32_t *) salt->hash;
+	if (*v++ == a + H[0]) {
+		uint32_t diff;
+		diff = *v++ ^ (b + H[1]);
+		diff |= *v++ ^ (c + H[2]);
+		diff |= *v++ ^ (d + H[3]);
+		diff |= *v++ ^ (e + H[4]);
+		diff |= *v++ ^ (f + H[5]);
+		diff |= *v++ ^ (g + H[6]);
+		diff |= *v++ ^ (h + H[7]);
+		cmp = !diff;
+	}
         out[idx].cracked = cmp;
 }
 
diff --git a/src/opencl_pwsafe_fmt.c b/src/opencl_pwsafe_fmt.c
index f3ecef4..4c022f7 100644
--- a/src/opencl_pwsafe_fmt.c
+++ b/src/opencl_pwsafe_fmt.c
@@ -33,7 +33,7 @@
 #define PLAINTEXT_LENGTH        15
 #define BINARY_SIZE             32
 #define KERNEL_NAME             "pwsafe"
-#define KEYS_PER_CRYPT		1024
+#define KEYS_PER_CRYPT		512*112
 #define MIN_KEYS_PER_CRYPT      KEYS_PER_CRYPT
 #define MAX_KEYS_PER_CRYPT      KEYS_PER_CRYPT
 # define SWAP32(n) \
@@ -69,7 +69,6 @@ static size_t insize = sizeof(pwsafe_pass) * KEYS_PER_CRYPT;
 static size_t outsize = sizeof(pwsafe_hash) * KEYS_PER_CRYPT;
 static size_t saltsize = sizeof(pwsafe_salt);
 
-static int any_cracked;
 static pwsafe_pass *host_pass;				/** binary ciphertexts **/
 static pwsafe_salt *host_salt;				/** salt **/
 static pwsafe_hash *host_hash;				/** calculated hashes **/
@@ -96,7 +95,6 @@ static void init(struct fmt_main *self)
 	host_pass = calloc(KEYS_PER_CRYPT, sizeof(pwsafe_pass));
 	host_hash = calloc(KEYS_PER_CRYPT, sizeof(pwsafe_hash));
 	host_salt = calloc(1, sizeof(pwsafe_salt));
-	any_cracked = 1;
 
 	opencl_init("$JOHN/pwsafe_kernel.cl", gpu_id, platform_id);
 
@@ -164,17 +162,14 @@ static void *get_salt(char *ciphertext)
 static void set_salt(void *salt)
 {
 	memcpy(host_salt, salt, SALT_SIZE);
-	any_cracked = 0;
 }
 
 
 
 static void crypt_all(int count)
 {
-	int i;
 	size_t worksize = KEYS_PER_CRYPT;
 	size_t localworksize = local_work_size;
-	any_cracked = 0;
 
 //fprintf(stderr, "rounds = %d\n",host_salt->iterations);
 ///Copy data to GPU memory
@@ -194,18 +189,16 @@ static void crypt_all(int count)
 
 	///Await completion of all the above
 	HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
-
-
-	//gpu_pwpass(host_pass, host_salt, host_hash);
-	for (i = 0; i < count; i++) {
-		if (host_hash[i].cracked == 1)
-			any_cracked = 1;
-	}
 }
 
 static int cmp_all(void *binary, int count)
 {
-	return any_cracked;
+	int i;
+
+	for (i = 0; i < count; i++)
+		if (host_hash[i].cracked == 1)
+			return 1;
+	return 0;
 }
 
 static int cmp_one(void *binary, int index)

Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ