diff --git a/src/opencl/cryptmd5_kernel.cl b/src/opencl/cryptmd5_kernel.cl index 34d7667..2380d45 100644 --- a/src/opencl/cryptmd5_kernel.cl +++ b/src/opencl/cryptmd5_kernel.cl @@ -10,6 +10,49 @@ #include "opencl_device_info.h" #include "opencl_misc.h" +#ifdef cl_nv_pragma_unroll +#define NVIDIA +#endif + +#if nvidia_sm_3x(DEVICE_INFO) +#define USE_BITSELECT 1 +#endif + +#if gpu_amd(DEVICE_INFO) +#define BITALIGN(hi, lo, s) amd_bitalign((hi), (lo), (s)) +#else +#if 0 /* Need to check for sm_32 or better to enable this */ +static inline uint funnel_shift_right(uint hi, uint lo, uint s) { + uint r; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" + : "=r" (r) + : "r" (lo), "r" (hi), "r" (s)); + return r; +} +static inline uint funnel_shift_right_imm(uint hi, uint lo, uint s) { + uint r; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" + : "=r" (r) + : "r" (lo), "r" (hi), "i" (s)); + return r; +} +#define BITALIGN(hi, lo, s) funnel_shift_right(hi, lo, s) +#define BITALIGN_IMM(hi, lo, s) funnel_shift_right_imm(hi, lo, s) +#else +#define BITALIGN(hi, lo, s) (((hi) << (32 - (s))) | ((lo) >> (s))) +#endif +#endif + +#ifndef BITALIGN_IMM +#define BITALIGN_IMM(hi, lo, s) BITALIGN(hi, lo, s) +#endif + +#ifdef NVIDIA +#define BITALIGN_AGGRESSIVE +#else +#define BUF_UPDATE_SWITCH +#endif + #define ROTATE_LEFT(x, s) rotate(x, (uint)s) #ifdef USE_BITSELECT @@ -94,47 +137,100 @@ typedef struct { __constant uchar cl_md5_salt_prefix[] = "$1$"; __constant uchar cl_apr1_salt_prefix[] = "$apr1$"; -__constant uint g[] = +__constant uchar g[] = { 0, 7, 3, 5, 3, 7, 1, 6, 3, 5, 3, 7, 1, 7, 2, 5, 3, 7, 1, 7, 3, 4, 3, 7, 1, 7, 3, 5, 2, 7, 1, 7, 3, 5, 3, 6, 1, 7, 3, 5, 3, 7 }; -inline void buf_update(uint * buf, uint a, uint b, uint c, uint d, uint offset) +#ifdef BUF_UPDATE_SWITCH +static inline void buf_update(uint * buf, uint a, uint b, uint c, uint d, uint offset) { - switch (offset % 4) { + uint i = offset >> 2; + switch (offset & 3) { case 0: - buf[offset / 4] = a; - buf[(offset / 4) + 1] = b; - buf[(offset / 4) + 2] = c; - buf[(offset / 4) + 3] = d; + buf[i] = a; + buf[i + 1] = b; + buf[i + 2] = c; + buf[i + 3] = d; break; case 1: - buf[offset / 4] = (buf[offset / 4] & 0x000000ff) | (a << 8); - buf[(offset / 4) + 1] = (a >> 24) | (b << 8); - buf[(offset / 4) + 2] = (b >> 24) | (c << 8); - buf[(offset / 4) + 3] = (c >> 24) | (d << 8); - buf[(offset / 4) + 4] = - (buf[(offset / 4) + 4] & 0xffffff00) | (d >> 24); +#ifdef BITALIGN_AGGRESSIVE + buf[i] = BITALIGN_IMM(a, buf[i] << 24, 24); +#else + buf[i] = (buf[i] & 0x000000ff) | (a << 8); +#endif + buf[i + 1] = BITALIGN_IMM(b, a, 24); + buf[i + 2] = BITALIGN_IMM(c, b, 24); + buf[i + 3] = BITALIGN_IMM(d, c, 24); +#ifdef BITALIGN_AGGRESSIVE + buf[i + 4] = BITALIGN_IMM(buf[i + 4] >> 8, d, 24); +#else + buf[i + 4] = (buf[i + 4] & 0xffffff00) | (d >> 24); +#endif break; case 2: - buf[offset / 4] = (buf[offset / 4] & 0x0000ffff) | (a << 16); - buf[(offset / 4) + 1] = (a >> 16) | (b << 16); - buf[(offset / 4) + 2] = (b >> 16) | (c << 16); - buf[(offset / 4) + 3] = (c >> 16) | (d << 16); - buf[(offset / 4) + 4] = - (buf[(offset / 4) + 4] & 0xffff0000) | (d >> 16); +#ifdef BITALIGN_AGGRESSIVE + buf[i] = BITALIGN_IMM(a, buf[i] << 16, 16); +#else + buf[i] = (buf[i] & 0x0000ffff) | (a << 16); +#endif + buf[i + 1] = BITALIGN_IMM(b, a, 16); + buf[i + 2] = BITALIGN_IMM(c, b, 16); + buf[i + 3] = BITALIGN_IMM(d, c, 16); +#ifdef BITALIGN_AGGRESSIVE + buf[i + 4] = BITALIGN_IMM(buf[i + 4] >> 16, d, 16); +#else + buf[i + 4] = (buf[i + 4] & 0xffff0000) | (d >> 16); +#endif break; case 3: - buf[offset / 4] = (buf[offset / 4] & 0x00ffffff) | (a << 24); - buf[(offset / 4) + 1] = (a >> 8) | (b << 24); - buf[(offset / 4) + 2] = (b >> 8) | (c << 24); - buf[(offset / 4) + 3] = (c >> 8) | (d << 24); - buf[(offset / 4) + 4] = - (buf[(offset / 4) + 4] & 0xff000000) | (d >> 8); +#ifdef BITALIGN_AGGRESSIVE + buf[i] = BITALIGN_IMM(a, buf[i] << 8, 8); +#else + buf[i] = (buf[i] & 0x00ffffff) | (a << 24); +#endif + buf[i + 1] = BITALIGN_IMM(b, a, 8); + buf[i + 2] = BITALIGN_IMM(c, b, 8); + buf[i + 3] = BITALIGN_IMM(d, c, 8); +#ifdef BITALIGN_AGGRESSIVE + buf[i + 4] = BITALIGN_IMM(buf[i + 4] >> 24, d, 8); +#else + buf[i + 4] = (buf[i + 4] & 0xff000000) | (d >> 8); +#endif break; } } +#else +static inline void buf_update(uint * buf, uint a, uint b, uint c, uint d, uint offset) +{ + uint i = offset >> 2; + uint j = offset & 3; + if (!j) { + buf[i] = a; + buf[i + 1] = b; + buf[i + 2] = c; + buf[i + 3] = d; + return; + } -inline void ctx_update(md5_ctx * ctx, uchar * string, uint len, + j <<= 3; + uint k = 32 - j; +#ifdef BITALIGN_AGGRESSIVE + buf[i] = BITALIGN(a, buf[i] << k, k); +#else + buf[i] = (buf[i] & (0xffffffffU >> k)) | (a << j); +#endif + buf[i + 1] = BITALIGN(b, a, k); + buf[i + 2] = BITALIGN(c, b, k); + buf[i + 3] = BITALIGN(d, c, k); +#ifdef BITALIGN_AGGRESSIVE + buf[i + 4] = BITALIGN(buf[i + 4] >> j, d, k); +#else + buf[i + 4] = (buf[i + 4] & (0xffffffffU << j)) | (d >> k); +#endif +} +#endif + +static inline void ctx_update(md5_ctx * ctx, uchar * string, uint len, uint * ctx_buflen) { uint i; @@ -145,7 +241,7 @@ inline void ctx_update(md5_ctx * ctx, uchar * string, uint len, *ctx_buflen += len; } -inline void ctx_update_prefix(md5_ctx * ctx, uchar prefix, uint * ctx_buflen) +static inline void ctx_update_prefix(md5_ctx * ctx, uchar prefix, uint * ctx_buflen) { uint i; @@ -163,28 +259,28 @@ inline void ctx_update_prefix(md5_ctx * ctx, uchar prefix, uint * ctx_buflen) // else if (prefix == '\0') do nothing. for {smd5} } -inline void init_ctx(md5_ctx * ctx, uint * ctx_buflen) +static inline void init_ctx(md5_ctx * ctx, uint * ctx_buflen) { uint i; uint *buf = (uint *) ctx->buffer; +#ifdef NVIDIA +#pragma unroll 4 +#endif for (i = 0; i < sizeof(ctx->buffer) / 4; i++) *buf++ = 0; *ctx_buflen = 0; } -inline void md5_digest(md5_ctx * ctx, uint * result, uint * ctx_buflen, +static void md5_digest(md5_ctx * ctx, uint * result, uint len, uint res_offset) { - uint len = *ctx_buflen; uint *x = ctx->buffer; uint a; uint b = 0xefcdab89; uint c = 0x98badcfe; uint d = 0x10325476; - len <<= 3; - a = ROTATE_LEFT(AC1 + x[0], S11); a += b; /* 1 */ d = ROTATE_LEFT((c ^ (a & MASK1)) + x[1] + AC2pCd, S12); @@ -262,6 +358,14 @@ inline void md5_digest(md5_ctx * ctx, uint * result, uint * ctx_buflen, c += 0x98badcfe; d += 0x10325476; + if (!res_offset) { + result[0] = a; + result[1] = b; + result[2] = c; + result[3] = d; + return; + } + buf_update(result, a, b, c, d, res_offset); } @@ -284,18 +388,21 @@ __kernel void cryptmd5(__global const crypt_md5_password * inbuffer, } salt; uint i; +#ifdef NVIDIA +#pragma unroll 4 +#endif for (i = 0; i < 4; i++) pass.w[i] = ((__global uint *) & inbuffer[idx].v)[i]; - for (i = 0; i < 2; i++) - salt.w[i] = ((__global uint *) & hsalt->salt)[i]; + salt.w[0] = ((__global uint *) & hsalt->salt)[0]; + salt.w[1] = ((__global uint *) & hsalt->salt)[1]; init_ctx(&ctx[1], &ctx_buflen[1]); ctx_update(&ctx[1], pass.c, pass_len, &ctx_buflen[1]); ctx_update(&ctx[1], salt.c, salt_len, &ctx_buflen[1]); ctx_update(&ctx[1], pass.c, pass_len, &ctx_buflen[1]); PUTCHAR(ctx[1].buffer, ctx_buflen[1], 0x80); - md5_digest(&ctx[1], alt_result, &ctx_buflen[1], 0); + md5_digest(&ctx[1], alt_result, ctx_buflen[1] << 3, 0); init_ctx(&ctx[1], &ctx_buflen[1]); ctx_update(&ctx[1], pass.c, pass_len, &ctx_buflen[1]); @@ -308,13 +415,11 @@ __kernel void cryptmd5(__global const crypt_md5_password * inbuffer, #else ctx_update(&ctx[1], (uchar *) alt_result, pass_len, &ctx_buflen[1]); #endif - *alt_result = 0; - for (i = pass_len; i > 0; i >>= 1) - if (i & 1) - ctx_update(&ctx[1], (uchar *) alt_result, 1, - &ctx_buflen[1]); - else - ctx_update(&ctx[1], pass.c, 1, &ctx_buflen[1]); + for (i = pass_len; i > 0; i >>= 1) { + uchar c = (i & 1) ? 0 : pass.c[0]; + PUTCHAR(ctx[1].buffer, ctx_buflen[1], c); + ctx_buflen[1]++; + } //pattern[0]=alt pass //pattern[1]=alt pass pass @@ -325,21 +430,17 @@ __kernel void cryptmd5(__global const crypt_md5_password * inbuffer, //pattern[6]=pass salt alt //pattern[7]=pass salt pass alt - uint altpos[8]; - altpos[0] = 0; - altpos[1] = 0; - altpos[2] = 0; - altpos[3] = 0; - altpos[4] = pass_len; - altpos[5] = pass_len * 2; - altpos[6] = pass_len + salt_len; - altpos[7] = pass_len * 2 + salt_len; + uchar altpos[4]; + altpos[0] = pass_len; + altpos[1] = pass_len * 2; + altpos[2] = pass_len + salt_len; + altpos[3] = altpos[1] + salt_len; //prepare pattern buffers init_ctx(&ctx[0], &ctx_buflen[0]); PUTCHAR(ctx[1].buffer, ctx_buflen[1], 0x80); //alt pass - md5_digest(&ctx[1], ctx[0].buffer, &ctx_buflen[1], 0); //add results from init + md5_digest(&ctx[1], ctx[0].buffer, ctx_buflen[1] << 3, 0); //add results from init ctx_buflen[0] = 16; for (i = 1; i < 8; i++) //1 not 0 init_ctx(&ctx[i], &ctx_buflen[i]); @@ -384,14 +485,45 @@ __kernel void cryptmd5(__global const crypt_md5_password * inbuffer, ctx_buflen[7] += 16; PUTCHAR(ctx[7].buffer, ctx_buflen[7], 0x80); - uint nid, cid = g[0]; //current ctx id +#ifdef NVIDIA +#pragma unroll 8 +#endif + for (i = 0; i < 8; i++) + ctx_buflen[i] <<= 3; + + uint id1 = g[0], id2; - for (i = 0; i < 1000; i++) { - nid = g[(i + 1) % 42]; //next ctx id to process - md5_digest(&ctx[cid], ctx[nid].buffer, &ctx_buflen[cid], - altpos[nid]); - cid = nid; + uint j = 1; +#ifdef NVIDIA + for (i = 0; i < 250; i++) { +#else + for (i = 0; i < 500; i++) { +#endif + id2 = g[j]; + md5_digest(&ctx[id1], ctx[id2].buffer, ctx_buflen[id1], + altpos[id2 - 4]); + if (j == 41) + j = (uint)-1; + id1 = g[j + 1]; + md5_digest(&ctx[id2], ctx[id1].buffer, ctx_buflen[id2], 0); + +#ifdef NVIDIA + id2 = g[j + 2]; + md5_digest(&ctx[id1], ctx[id2].buffer, ctx_buflen[id1], + altpos[id2 - 4]); + if (j == 39) + j = (uint)-3; + id1 = g[j + 3]; + j += 4; + md5_digest(&ctx[id2], ctx[id1].buffer, ctx_buflen[id2], 0); +#else + j += 2; +#endif } + +#ifdef NVIDIA +#pragma unroll 4 +#endif for (i = 0; i < 4; i++) outbuffer[idx].v[i] = ctx[3].buffer[i]; }