Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Thu, 3 Sep 2015 23:36:28 +0200
From: Lukas Odzioba <lukas.odzioba@...il.com>
To: john-dev@...ts.openwall.com
Subject: Re: md5crypt-opencl

2015-09-02 19:32 GMT+02:00 Lukas Odzioba <lukas.odzioba@...il.com>:
> 2015-09-02 19:04 GMT+02:00 Solar Designer <solar@...nwall.com>:
>> Oh, is it possibly because the array is two-dimensional?  Like some
>> heuristic: "put all arrays with greater than one dimension in global
>> memory".  It is probably worth trying to turn the array into
>> single-dimensional and see.
>
> Who knows, I'll be happy to give it a try.

Performace is the same with 1 dimensional array, so I suppose that's
not the way to go, but the code is not slightly simpler.
>From what I recall there was no way to fit all ctx's with decent LWS value.
Since some ctx's are more often used than the others my idea was to
move those hot to the local memory and keep the rest in global.
Another loose idea was to try to "preload" next ctx to the local
memory and do writeback after that, but I have no idea whether it
makes sense at all with not so long computations as we have in md5.

Here is my patch, but I suppose it will be easier to modify current
code and we should keep it.
I am affraid that gmail web interface will break white characters...

>From 5e33e933d1a3226d8edf52c50390a2905ae64639 Mon Sep 17 00:00:00 2001
From: ukasz <lukas.odzioba@...il.com>
Date: Thu, 3 Sep 2015 23:27:44 +0200
Subject: [PATCH] Changed md5_ctx[8] to 1-dim table

---
 src/opencl/cryptmd5_kernel.cl | 124 ++++++++++++++++++++----------------------
 1 file changed, 60 insertions(+), 64 deletions(-)

diff --git a/src/opencl/cryptmd5_kernel.cl b/src/opencl/cryptmd5_kernel.cl
index fc2b014..f387ede 100644
--- a/src/opencl/cryptmd5_kernel.cl
+++ b/src/opencl/cryptmd5_kernel.cl
@@ -106,10 +106,6 @@ typedef struct {
  uint v[4]; /** 128 bits **/
 } crypt_md5_hash;

-typedef struct {
- uint buffer[16];
-} md5_ctx;
-
 __constant uchar cl_md5_salt_prefix[] = "$1$";
 __constant uchar cl_apr1_salt_prefix[] = "$apr1$";
 __constant uchar g[] =
@@ -205,52 +201,50 @@ inline void buf_update(uint * buf, uint a, uint
b, uint c, uint d, uint offset)
 }
 #endif

-inline void ctx_update(md5_ctx * ctx, uchar * string, uint len,
+inline void ctx_update(uint *ctx_buffer, uchar * string, uint len,
     uint * ctx_buflen)
 {
  uint i;

  for (i = 0; i < len; i++)
- PUTCHAR(ctx->buffer, *ctx_buflen + i, string[i]);
+ PUTCHAR(ctx_buffer, *ctx_buflen + i, string[i]);

  *ctx_buflen += len;
 }

-inline void ctx_update_prefix(md5_ctx * ctx, uchar prefix, uint * ctx_buflen)
+inline void ctx_update_prefix(uint *ctx_buffer, uchar prefix, uint *
ctx_buflen)
 {
  uint i;

  if (prefix == '1') {
  for (i = 0; i < 3; i++)
- PUTCHAR(ctx->buffer, *ctx_buflen + i,
+ PUTCHAR(ctx_buffer, *ctx_buflen + i,
     cl_md5_salt_prefix[i]);
  *ctx_buflen += 3;
  } else if (prefix == 'a') {
  for (i = 0; i < 6; i++)
- PUTCHAR(ctx->buffer, *ctx_buflen + i,
+ PUTCHAR(ctx_buffer, *ctx_buflen + i,
     cl_apr1_salt_prefix[i]);
  *ctx_buflen += 6;
  }
  // else if (prefix == '\0') do nothing. for {smd5}
 }

-inline void init_ctx(md5_ctx * ctx, uint * ctx_buflen)
+inline void init_ctx(uint *ctx_buffer, uint * ctx_buflen)
 {
  uint i;
- uint *buf = (uint *) ctx->buffer;

 #ifdef NVIDIA
 #pragma unroll 4
 #endif
- for (i = 0; i < sizeof(ctx->buffer) / 4; i++)
- *buf++ = 0;
+ for (i = 0; i < 16; i++)
+ *ctx_buffer++ = 0;
  *ctx_buflen = 0;
 }

-inline void md5_digest(md5_ctx * ctx, uint * result, uint len,
+inline void md5_digest(uint *x, uint * result, uint len,
     uint res_offset)
 {
- uint *x = ctx->buffer;
  uint a;
  uint b = 0xefcdab89;
  uint c = 0x98badcfe;
@@ -351,7 +345,7 @@ __kernel void cryptmd5(__global const
crypt_md5_password * inbuffer,
  uint pass_len = inbuffer[idx].length;
  uint salt_len = hsalt->saltlen;
  uint alt_result[4];
- md5_ctx ctx[8];
+ uint ctx_buffers[8*16];//8 buffers 16 uints each
  uint ctx_buflen[8];
  union {
  uint w[4];
@@ -371,28 +365,30 @@ __kernel void cryptmd5(__global const
crypt_md5_password * inbuffer,

  salt.w[0] = ((__global uint *) & hsalt->salt)[0];
  salt.w[1] = ((__global uint *) & hsalt->salt)[1];
+#define CTX(i) &ctx_buffers[i*16]
+ init_ctx(CTX(1), &ctx_buflen[1]);
+ ctx_update(CTX(1), pass.c, pass_len, &ctx_buflen[1]);
+ ctx_update(CTX(1), salt.c, salt_len, &ctx_buflen[1]);
+ ctx_update(CTX(1), pass.c, pass_len, &ctx_buflen[1]);
+
+ PUTCHAR(CTX(1), ctx_buflen[1], 0x80);
+
+ md5_digest(CTX(1), alt_result, ctx_buflen[1] << 3, 0);

- init_ctx(&ctx[1], &ctx_buflen[1]);
- ctx_update(&ctx[1], pass.c, pass_len, &ctx_buflen[1]);
- ctx_update(&ctx[1], salt.c, salt_len, &ctx_buflen[1]);
- ctx_update(&ctx[1], pass.c, pass_len, &ctx_buflen[1]);
- PUTCHAR(ctx[1].buffer, ctx_buflen[1], 0x80);
- md5_digest(&ctx[1], alt_result, ctx_buflen[1] << 3, 0);
-
- init_ctx(&ctx[1], &ctx_buflen[1]);
- ctx_update(&ctx[1], pass.c, pass_len, &ctx_buflen[1]);
- ctx_update_prefix(&ctx[1], hsalt->prefix, &ctx_buflen[1]);
- ctx_update(&ctx[1], salt.c, salt_len, &ctx_buflen[1]);
+ init_ctx(CTX(1), &ctx_buflen[1]);
+ ctx_update(CTX(1), pass.c, pass_len, &ctx_buflen[1]);
+ ctx_update_prefix(CTX(1), hsalt->prefix, &ctx_buflen[1]);
+ ctx_update(CTX(1), salt.c, salt_len, &ctx_buflen[1]);
 #if PLAINTEXT_LENGTH >= 16
  for (i = pass_len; i > 16; i -= 16)
- ctx_update(&ctx[1], (uchar *) alt_result, 16, &ctx_buflen[1]);
- ctx_update(&ctx[1], (uchar *) alt_result, i, &ctx_buflen[1]);
+ ctx_update(CTX(1), (uchar *) alt_result, 16, &ctx_buflen[1]);
+ ctx_update(CTX(1), (uchar *) alt_result, i, &ctx_buflen[1]);
 #else
- ctx_update(&ctx[1], (uchar *) alt_result, pass_len, &ctx_buflen[1]);
+ ctx_update(CTX(1), (uchar *) alt_result, pass_len, &ctx_buflen[1]);
 #endif
  for (i = pass_len; i > 0; i >>= 1) {
  uchar c = (i & 1) ? 0 : pass.c[0];
- PUTCHAR(ctx[1].buffer, ctx_buflen[1], c);
+ PUTCHAR(CTX(1), ctx_buflen[1], c);
  ctx_buflen[1]++;
  }

@@ -412,53 +408,53 @@ __kernel void cryptmd5(__global const
crypt_md5_password * inbuffer,
  altpos[3] = altpos[1] + salt_len;

  //prepare pattern buffers
- init_ctx(&ctx[0], &ctx_buflen[0]);
- PUTCHAR(ctx[1].buffer, ctx_buflen[1], 0x80);
+ init_ctx(CTX(0), &ctx_buflen[0]);
+ PUTCHAR(CTX(1), ctx_buflen[1], 0x80);
  //alt pass
- md5_digest(&ctx[1], ctx[0].buffer, ctx_buflen[1] << 3, 0); //add
results from init
+ md5_digest(CTX(1), CTX(0), ctx_buflen[1] << 3, 0); //add results from init
  ctx_buflen[0] = 16;
  for (i = 1; i < 8; i++) //1 not 0
- init_ctx(&ctx[i], &ctx_buflen[i]);
+ init_ctx(CTX(i), &ctx_buflen[i]);

- ctx_update(&ctx[0], pass.c, pass_len, &ctx_buflen[0]);
- PUTCHAR(ctx[0].buffer, ctx_buflen[0], 0x80);
+ ctx_update(CTX(0), pass.c, pass_len, &ctx_buflen[0]);
+ PUTCHAR(CTX(0), ctx_buflen[0], 0x80);

  //alt pass pass
  ctx_buflen[1] = 16;
- ctx_update(&ctx[1], pass.c, pass_len, &ctx_buflen[1]);
- ctx_update(&ctx[1], pass.c, pass_len, &ctx_buflen[1]);
- PUTCHAR(ctx[1].buffer, ctx_buflen[1], 0x80);
+ ctx_update(CTX(1), pass.c, pass_len, &ctx_buflen[1]);
+ ctx_update(CTX(1), pass.c, pass_len, &ctx_buflen[1]);
+ PUTCHAR(CTX(1), ctx_buflen[1], 0x80);
  //alt salt pass
  ctx_buflen[2] = 16;
- ctx_update(&ctx[2], salt.c, salt_len, &ctx_buflen[2]);
- ctx_update(&ctx[2], pass.c, pass_len, &ctx_buflen[2]);
- PUTCHAR(ctx[2].buffer, ctx_buflen[2], 0x80);
+ ctx_update(CTX(2), salt.c, salt_len, &ctx_buflen[2]);
+ ctx_update(CTX(2), pass.c, pass_len, &ctx_buflen[2]);
+ PUTCHAR(CTX(2), ctx_buflen[2], 0x80);
  //alt salt pass pass
  ctx_buflen[3] = 16;
- ctx_update(&ctx[3], salt.c, salt_len, &ctx_buflen[3]);
- ctx_update(&ctx[3], pass.c, pass_len, &ctx_buflen[3]);
- ctx_update(&ctx[3], pass.c, pass_len, &ctx_buflen[3]);
- PUTCHAR(ctx[3].buffer, ctx_buflen[3], 0x80);
+ ctx_update(CTX(3), salt.c, salt_len, &ctx_buflen[3]);
+ ctx_update(CTX(3), pass.c, pass_len, &ctx_buflen[3]);
+ ctx_update(CTX(3), pass.c, pass_len, &ctx_buflen[3]);
+ PUTCHAR(CTX(3), ctx_buflen[3], 0x80);
  //pass alt
- ctx_update(&ctx[4], pass.c, pass_len, &ctx_buflen[4]);
+ ctx_update(CTX(4), pass.c, pass_len, &ctx_buflen[4]);
  ctx_buflen[4] += 16;
- PUTCHAR(ctx[4].buffer, ctx_buflen[4], 0x80);
+ PUTCHAR(CTX(4), ctx_buflen[4], 0x80);
  //pass pass alt
- ctx_update(&ctx[5], pass.c, pass_len, &ctx_buflen[5]);
- ctx_update(&ctx[5], pass.c, pass_len, &ctx_buflen[5]);
+ ctx_update(CTX(5), pass.c, pass_len, &ctx_buflen[5]);
+ ctx_update(CTX(5), pass.c, pass_len, &ctx_buflen[5]);
  ctx_buflen[5] += 16;
- PUTCHAR(ctx[5].buffer, ctx_buflen[5], 0x80);
+ PUTCHAR(CTX(5), ctx_buflen[5], 0x80);
  //pass salt alt
- ctx_update(&ctx[6], pass.c, pass_len, &ctx_buflen[6]);
- ctx_update(&ctx[6], salt.c, salt_len, &ctx_buflen[6]);
+ ctx_update(CTX(6), pass.c, pass_len, &ctx_buflen[6]);
+ ctx_update(CTX(6), salt.c, salt_len, &ctx_buflen[6]);
  ctx_buflen[6] += 16;
- PUTCHAR(ctx[6].buffer, ctx_buflen[6], 0x80);
+ PUTCHAR(CTX(6), ctx_buflen[6], 0x80);
  //pass salt pass alt
- ctx_update(&ctx[7], pass.c, pass_len, &ctx_buflen[7]);
- ctx_update(&ctx[7], salt.c, salt_len, &ctx_buflen[7]);
- ctx_update(&ctx[7], pass.c, pass_len, &ctx_buflen[7]);
+ ctx_update(CTX(7), pass.c, pass_len, &ctx_buflen[7]);
+ ctx_update(CTX(7), salt.c, salt_len, &ctx_buflen[7]);
+ ctx_update(CTX(7), pass.c, pass_len, &ctx_buflen[7]);
  ctx_buflen[7] += 16;
- PUTCHAR(ctx[7].buffer, ctx_buflen[7], 0x80);
+ PUTCHAR(CTX(7), ctx_buflen[7], 0x80);

 #ifdef NVIDIA
 #pragma unroll 8
@@ -475,22 +471,22 @@ __kernel void cryptmd5(__global const
crypt_md5_password * inbuffer,
  for (i = 0; i < 500; i++) {
 #endif
  id2 = g[j];
- md5_digest(&ctx[id1], ctx[id2].buffer, ctx_buflen[id1],
+ md5_digest(CTX(id1), CTX(id2), ctx_buflen[id1],
     altpos[id2 - 4]);
  if (j == 41)
  j = (uint)-1;
  id1 = g[j + 1];
- md5_digest(&ctx[id2], ctx[id1].buffer, ctx_buflen[id2], 0);
+ md5_digest(CTX(id2), CTX(id1), ctx_buflen[id2], 0);

 #ifdef NVIDIA
  id2 = g[j + 2];
- md5_digest(&ctx[id1], ctx[id2].buffer, ctx_buflen[id1],
+ md5_digest(CTX(id1), CTX(id2), ctx_buflen[id1],
     altpos[id2 - 4]);
  if (j == 39)
  j = (uint)-3;
  id1 = g[j + 3];
  j += 4;
- md5_digest(&ctx[id2], ctx[id1].buffer, ctx_buflen[id2], 0);
+ md5_digest(CTX(id2), CTX(id1), ctx_buflen[id2], 0);
 #else
  j += 2;
 #endif
@@ -500,5 +496,5 @@ __kernel void cryptmd5(__global const
crypt_md5_password * inbuffer,
 #pragma unroll 4
 #endif
  for (i = 0; i < 4; i++)
- outbuffer[idx].v[i] = ctx[3].buffer[i];
+ outbuffer[idx].v[i] = ctx_buffers[3*16+i];
 }
-- 
1.9.1


Thanks,
Lukas

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.