diff --git a/src/common-opencl.c b/src/common-opencl.c index b897913..ebba9fb 100644 --- a/src/common-opencl.c +++ b/src/common-opencl.c @@ -1867,7 +1867,7 @@ void opencl_build_kernel_opt(char *kernel_filename, int sequential_id, char *opts) { opencl_read_source(kernel_filename); - opencl_build(sequential_id, opts, 0, NULL); + opencl_build(sequential_id, opts, 1, "kernel.out"); } void opencl_build_kernel(char *kernel_filename, int sequential_id, char *opts, diff --git a/src/opencl/argon2d_kernel.cl b/src/opencl/argon2d_kernel.cl index 9e0a515..b5e2ae6 100755 --- a/src/opencl/argon2d_kernel.cl +++ b/src/opencl/argon2d_kernel.cl @@ -62,6 +62,7 @@ static int blake2b_long(uchar *out, const void *in, const uint outlen, const ulo memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); out += BLAKE2B_OUTBYTES / 2; toproduce = outlen - BLAKE2B_OUTBYTES / 2; +#pragma unroll 1 while (toproduce > BLAKE2B_OUTBYTES) { memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); @@ -82,17 +83,19 @@ static int blake2b_long(uchar *out, const void *in, const uint outlen, const ulo static void ComputeBlock_pgg(ulong2 *state, __global ulong2 *ref_block_ptr, __global ulong2 *next_block_ptr) { ulong2 ref_block[64]; - uchar i; + uint i; ulong2 t0,t1; uchar16 r16 = (uchar16) (2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); uchar16 r24 = (uchar16) (3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); +#pragma unroll 1 for (i = 0; i < 64; i++) { ref_block[i] = ref_block_ptr[MAP(i)]; } +#pragma unroll 1 for (i = 0; i < 64; i++) { ref_block[i] = state[i] = state[i] ^ ref_block[i]; //XORing the reference block to the state and storing the copy of the result @@ -101,62 +104,31 @@ static void ComputeBlock_pgg(ulong2 *state, __global ulong2 *ref_block_ptr, __gl // BLAKE2 - begin - BLAKE2_ROUND_NO_MSG_V(state[0], state[1], state[2], state[3], - state[4], state[5], state[6], state[7]); - - BLAKE2_ROUND_NO_MSG_V(state[8], state[9], state[10], state[11], - state[12], state[13], state[14], state[15]); - - BLAKE2_ROUND_NO_MSG_V(state[16], state[17], state[18], state[19], - state[20], state[21], state[22], state[23]); - - BLAKE2_ROUND_NO_MSG_V(state[24], state[25], state[26], state[27], - state[28], state[29], state[30], state[31]); - - BLAKE2_ROUND_NO_MSG_V(state[32], state[33], state[34], state[35], - state[36], state[37], state[38], state[39]); - - BLAKE2_ROUND_NO_MSG_V(state[40], state[41], state[42], state[43], - state[44], state[45], state[46], state[47]); - - BLAKE2_ROUND_NO_MSG_V(state[48], state[49], state[50], state[51], - state[52], state[53], state[54], state[55]); - - BLAKE2_ROUND_NO_MSG_V(state[56], state[57], state[58], state[59], - state[60], state[61], state[62], state[63]); - - - BLAKE2_ROUND_NO_MSG_V(state[0], state[8], state[16], state[24], - state[32], state[40], state[48], state[56]); - - BLAKE2_ROUND_NO_MSG_V(state[1], state[9], state[17], state[25], - state[33], state[41], state[49], state[57]); - - BLAKE2_ROUND_NO_MSG_V(state[2], state[10], state[18], state[26], - state[34], state[42], state[50], state[58]); - - BLAKE2_ROUND_NO_MSG_V(state[3], state[11], state[19], state[27], - state[35], state[43], state[51], state[59]); - - BLAKE2_ROUND_NO_MSG_V(state[4], state[12], state[20], state[28], - state[36], state[44], state[52], state[60]); - - BLAKE2_ROUND_NO_MSG_V(state[5], state[13], state[21], state[29], - state[37], state[45], state[53], state[61]); - - BLAKE2_ROUND_NO_MSG_V(state[6], state[14], state[22], state[30], - state[38], state[46], state[54], state[62]); +#pragma unroll 1 + for (i = 0; i < 64; i += 8) { + BLAKE2_ROUND_NO_MSG_V(state[i], state[i+1], + state[i+2], state[i+3], + state[i+4], state[i+5], + state[i+6], state[i+7]); + } - BLAKE2_ROUND_NO_MSG_V(state[7], state[15], state[23], state[31], - state[39], state[47], state[55], state[63]); +#pragma unroll 1 + for (i = 0; i < 8; i++) { + BLAKE2_ROUND_NO_MSG_V(state[i], state[i+8], + state[i+16], state[i+24], + state[i+32], state[i+40], + state[i+48], state[i+56]); + } // BLAKE2 - end +#pragma unroll 1 for (i = 0; i< 64; i++) { state[i] = state[i] ^ ref_block[i]; //Feedback } +#pragma unroll 1 for (i = 0; i< 64; i++) { next_block_ptr[MAP(i)]=state[i]; @@ -173,15 +145,18 @@ static void Initialize(scheme_info_t* info,uchar* input_hash) uint segment_length = (info->mem_size / (SYNC_POINTS*(info->lanes))); memcpy(block_input, input_hash, BLAKE_INPUT_HASH_SIZE); memset(block_input + BLAKE_INPUT_HASH_SIZE, 0, 8); +#pragma unroll 1 for (l = 0; l < info->lanes; ++l) { block_input[BLAKE_INPUT_HASH_SIZE + 4] = l; block_input[BLAKE_INPUT_HASH_SIZE] = 0; blake2b_long((uchar*)out_tmp, block_input, BLOCK_SIZE, BLAKE_INPUT_HASH_SIZE + 8); +#pragma unroll 1 for(i=0;i -- temporary variable for loading previous block +#pragma unroll 1 for (i = 0; i < 64; i++) { prev_block[i] = memory[MAP(bi+i)]; @@ -259,6 +238,7 @@ static void FillSegment(scheme_info_t *info, position_info_t pos) prev_block_recalc = (pos.slice > 0) ? ((pos.slice - 1)*lanes*segment_length) : (SYNC_POINTS - 2)*lanes*segment_length; bi = ((prev_slice * lanes + pos.lane + 1) * segment_length - 1) * BLOCK_SIZE / 16;// -- temporary variable for loading previous block +#pragma unroll 1 for (i = 0; i < 64; i++) { prev_block[i] = memory[MAP(bi+i)]; @@ -268,6 +248,7 @@ static void FillSegment(scheme_info_t *info, position_info_t pos) } next_block_offset = ((pos.slice*lanes + pos.lane)*segment_length + start)*BLOCK_SIZE; +#pragma unroll 1 for(i = start; i < segment_length; i++) { // Compute block2 index @@ -322,12 +303,15 @@ static void FillMemory(scheme_info_t* info)//Main loop: filling memory uint p,s,t; position_info_t position; position_info_t_init(&position,0,0,0,0); +#pragma unroll 1 for (p = 0; p < info->passes; p++) { position.pass = p; +#pragma unroll 1 for (s = 0; s < SYNC_POINTS; s++) { position.slice = s; +#pragma unroll 1 for (t = 0; t < info->lanes; t++) { position.lane = t; @@ -454,10 +438,12 @@ __kernel void argon2d_crypt_kernel( memory+=gid*(((ulong)m_cost)<<10)/sizeof(ulong2); //copying password +#pragma unroll 1 for(i=0;isalt[i]; diff --git a/src/opencl_blake2.h b/src/opencl_blake2.h index 2c23238..904b97f 100644 --- a/src/opencl_blake2.h +++ b/src/opencl_blake2.h @@ -104,7 +104,8 @@ typedef struct __blake2bp_state { //from blake-impl.h static inline ulong rotr64(const ulong w, const uint c) { - return (w >> c) | (w << (64 - c)); +// return (w >> c) | (w << (64 - c)); + return rotate(w, (ulong)(64 - c)); } __constant ulong blake2b_IV[8] = {