Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Mon, 31 Aug 2015 11:08:18 +0300
From: Solar Designer <solar@...nwall.com>
To: john-dev@...ts.openwall.com
Subject: Re: PHC: Argon2 on GPU

On Sun, Aug 30, 2015 at 01:44:32AM +0200, Agnieszka Bielec wrote:
> 2015-08-29 8:48 GMT+02:00 Solar Designer <solar@...nwall.com>:
> > As to loop unrolling, there's "#pragma unroll N", and when you specify
> > N=1 so "#pragma unroll 1" I think it prevents unrolling.  As an
> > experiment, I tried adding "#pragma unroll 1" before all loops in
> > argon2d_kernel.cl, and the PTX instruction count reduced - but not a
> > lot.
> 
> Can I get this code?

Attached, although this is just an experiment.  I think at least the
loops in ComputeBlock_pgg() actually need to stay unrolled, and not
patched like I do here.

I used this experiment to see how much we can reduce the instruction
count.  The conclusion is that we primarily need to look elsewhere,
since the reduction from ~100k to ~80k is just not good enough anyway.

The change to rotr64() that just happened to get into this patch should
get in, though.

> > We need to figure out why it doesn't get lower.  ~80k is still a lot.
> > Are there many inlined functions and unrolled loops in the .h files?
> 
> there are also blake2 files

Yes.

You need to find out how we can reduce the kernel size more
substantially.  If undesirable function inlining can't be prevented,
this may be a reason to replace some multiple references to a function
with a loop containing a single reference to the function.  e.g.:

func(1);
func(2);

may be replaced with:

#pragma unroll 1
for (i = 1; i <= 2; i++)
	func(i);

Of course, in real code things are usually trickier than that, but a
similar approach may often be applied.

All of this is for relatively non performance critical code, so that
we'd have more cache available for the critical code.

Alexander

diff --git a/src/common-opencl.c b/src/common-opencl.c
index b897913..ebba9fb 100644
--- a/src/common-opencl.c
+++ b/src/common-opencl.c
@@ -1867,7 +1867,7 @@ void opencl_build_kernel_opt(char *kernel_filename, int sequential_id,
                              char *opts)
 {
 	opencl_read_source(kernel_filename);
-	opencl_build(sequential_id, opts, 0, NULL);
+	opencl_build(sequential_id, opts, 1, "kernel.out");
 }
 
 void opencl_build_kernel(char *kernel_filename, int sequential_id, char *opts,
diff --git a/src/opencl/argon2d_kernel.cl b/src/opencl/argon2d_kernel.cl
index 9e0a515..b5e2ae6 100755
--- a/src/opencl/argon2d_kernel.cl
+++ b/src/opencl/argon2d_kernel.cl
@@ -62,6 +62,7 @@ static int blake2b_long(uchar *out, const void *in, const uint outlen, const ulo
 		memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
 		out += BLAKE2B_OUTBYTES / 2;
 		toproduce = outlen - BLAKE2B_OUTBYTES / 2;
+#pragma unroll 1
 		while (toproduce > BLAKE2B_OUTBYTES)
 		{
 			memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
@@ -82,17 +83,19 @@ static int blake2b_long(uchar *out, const void *in, const uint outlen, const ulo
 static void ComputeBlock_pgg(ulong2 *state, __global ulong2 *ref_block_ptr, __global ulong2 *next_block_ptr)
 {
 	ulong2 ref_block[64];
-	uchar i;
+	uint i;
 
 	ulong2 t0,t1;
 	uchar16 r16 = (uchar16) (2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
 	uchar16 r24 = (uchar16) (3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
 
+#pragma unroll 1
 	for (i = 0; i < 64; i++)
 	{
 		ref_block[i] = ref_block_ptr[MAP(i)];
 	}
 
+#pragma unroll 1
 	for (i = 0; i < 64; i++)
 	{
 		ref_block[i] = state[i] = state[i] ^ ref_block[i]; //XORing the reference block to the state and storing the copy of the result
@@ -101,62 +104,31 @@ static void ComputeBlock_pgg(ulong2 *state, __global ulong2 *ref_block_ptr, __gl
 
 	// BLAKE2 - begin
 
-	BLAKE2_ROUND_NO_MSG_V(state[0], state[1], state[2], state[3],
-		state[4], state[5], state[6], state[7]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[8], state[9], state[10], state[11],
-		state[12], state[13], state[14], state[15]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[16], state[17], state[18], state[19],
-		state[20], state[21], state[22], state[23]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[24], state[25], state[26], state[27],
-		state[28], state[29], state[30], state[31]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[32], state[33], state[34], state[35],
-		state[36], state[37], state[38], state[39]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[40], state[41], state[42], state[43],
-		state[44], state[45], state[46], state[47]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[48], state[49], state[50], state[51],
-		state[52], state[53], state[54], state[55]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[56], state[57], state[58], state[59],
-		state[60], state[61], state[62], state[63]);
-
-
-	BLAKE2_ROUND_NO_MSG_V(state[0], state[8], state[16], state[24],
-		state[32], state[40], state[48], state[56]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[1], state[9], state[17], state[25],
-		state[33], state[41], state[49], state[57]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[2], state[10], state[18], state[26],
-		state[34], state[42], state[50], state[58]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[3], state[11], state[19], state[27],
-		state[35], state[43], state[51], state[59]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[4], state[12], state[20], state[28],
-		state[36], state[44], state[52], state[60]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[5], state[13], state[21], state[29],
-		state[37], state[45], state[53], state[61]);
-
-	BLAKE2_ROUND_NO_MSG_V(state[6], state[14], state[22], state[30],
-		state[38], state[46], state[54], state[62]);
+#pragma unroll 1
+	for (i = 0; i < 64; i += 8) {
+		BLAKE2_ROUND_NO_MSG_V(state[i], state[i+1],
+		    state[i+2], state[i+3],
+		    state[i+4], state[i+5],
+		    state[i+6], state[i+7]);
+	}
 
-	BLAKE2_ROUND_NO_MSG_V(state[7], state[15], state[23], state[31],
-		state[39], state[47], state[55], state[63]);
+#pragma unroll 1
+	for (i = 0; i < 8; i++) {
+		BLAKE2_ROUND_NO_MSG_V(state[i], state[i+8],
+		    state[i+16], state[i+24],
+		    state[i+32], state[i+40],
+		    state[i+48], state[i+56]);
+	}
 
 	// BLAKE2 - end
 
+#pragma unroll 1
 	for (i = 0; i< 64; i++)
 	{
 		state[i] = state[i] ^ ref_block[i]; //Feedback
 	}
 
+#pragma unroll 1
 	for (i = 0; i< 64; i++)
 	{
 		next_block_ptr[MAP(i)]=state[i];
@@ -173,15 +145,18 @@ static void Initialize(scheme_info_t* info,uchar* input_hash)
 	uint segment_length = (info->mem_size / (SYNC_POINTS*(info->lanes)));
 	memcpy(block_input, input_hash, BLAKE_INPUT_HASH_SIZE);
 	memset(block_input + BLAKE_INPUT_HASH_SIZE, 0, 8);
+#pragma unroll 1
 	for (l = 0; l < info->lanes; ++l)
 	{
 		block_input[BLAKE_INPUT_HASH_SIZE + 4] = l;
 		block_input[BLAKE_INPUT_HASH_SIZE] = 0;
 		blake2b_long((uchar*)out_tmp, block_input, BLOCK_SIZE, BLAKE_INPUT_HASH_SIZE + 8);
+#pragma unroll 1
 		for(i=0;i<BLOCK_SIZE/16;i++)
 			memory[MAP(l * segment_length*BLOCK_SIZE/16+i)]=out_tmp[i];
 		block_input[BLAKE_INPUT_HASH_SIZE] = 1;
 		blake2b_long((uchar*)out_tmp, block_input, BLOCK_SIZE, BLAKE_INPUT_HASH_SIZE + 8);
+#pragma unroll 1
 		for(i=0;i<BLOCK_SIZE/16;i++)
 			memory[MAP((l * segment_length + 1)*BLOCK_SIZE/16+i)]=out_tmp[i];
 	}
@@ -193,15 +168,18 @@ static void Finalize_g(__global ulong2 *state, uchar* out, uint outlen, uchar la
 	uchar l;
 	uint j;
 	ulong2 blockhash[BLOCK_SIZE/sizeof(ulong2)];
+#pragma unroll 1
 	for(j=0;j<BLOCK_SIZE/sizeof(ulong2);j++)
 	{
 		blockhash[j]=0;
 	}
+#pragma unroll 1
 	for (l = 0; l < lanes; ++l)//XORing all last blocks of the lanes
 	{
 		uint segment_length = m_cost / (SYNC_POINTS*lanes);
 		__global ulong2* block_ptr = state + MAP((((SYNC_POINTS - 1)*lanes+l+1)*segment_length-1)*BLOCK_SIZE/16); //points to the last block of the first lane
 
+#pragma unroll 1
 		for (j = 0; j < BLOCK_SIZE / sizeof(ulong2); ++j)
 		{
 			blockhash[j] = blockhash[j]^block_ptr[MAP(j)];
@@ -238,6 +216,7 @@ static void FillSegment(scheme_info_t *info, position_info_t pos)
 			return;
 
 		bi = (pos.lane * segment_length + 1) * BLOCK_SIZE / 16;//<bi> -- temporary variable for loading previous block
+#pragma unroll 1
 		for (i = 0; i < 64; i++)
 		{
 			prev_block[i] = memory[MAP(bi+i)];
@@ -259,6 +238,7 @@ static void FillSegment(scheme_info_t *info, position_info_t pos)
 
 		prev_block_recalc = (pos.slice > 0) ? ((pos.slice - 1)*lanes*segment_length) : (SYNC_POINTS - 2)*lanes*segment_length;
 		bi = ((prev_slice * lanes + pos.lane + 1) * segment_length - 1) * BLOCK_SIZE / 16;//<bi> -- temporary variable for loading previous block
+#pragma unroll 1
 		for (i = 0; i < 64; i++)
 		{
 			prev_block[i] = memory[MAP(bi+i)];
@@ -268,6 +248,7 @@ static void FillSegment(scheme_info_t *info, position_info_t pos)
 	}
 
 	next_block_offset = ((pos.slice*lanes + pos.lane)*segment_length + start)*BLOCK_SIZE;
+#pragma unroll 1
 	for(i = start; i < segment_length; i++)
 	{
 		// Compute block2 index
@@ -322,12 +303,15 @@ static void FillMemory(scheme_info_t* info)//Main loop: filling memory <t_cost>
 	uint p,s,t;
 	position_info_t position;
 	position_info_t_init(&position,0,0,0,0);
+#pragma unroll 1
 	for (p = 0; p < info->passes; p++)
 	{
 		position.pass = p;
+#pragma unroll 1
 		for (s = 0; s < SYNC_POINTS; s++)
 		{
 			position.slice = s;
+#pragma unroll 1
 			for (t = 0; t < info->lanes; t++)
 			{
 				position.lane = t;
@@ -454,10 +438,12 @@ __kernel void argon2d_crypt_kernel(
 	memory+=gid*(((ulong)m_cost)<<10)/sizeof(ulong2);
 
 	//copying password
+#pragma unroll 1
 	for(i=0;i<inlen;i++)
 		passwd[i]=in[i];
 
 	//copying salt
+#pragma unroll 1
 	for(i=0;i<noncelen;i++)
 		nonce[i]=salt->salt[i];
 
diff --git a/src/opencl_blake2.h b/src/opencl_blake2.h
index 2c23238..904b97f 100644
--- a/src/opencl_blake2.h
+++ b/src/opencl_blake2.h
@@ -104,7 +104,8 @@ typedef struct __blake2bp_state {
 //from blake-impl.h
 static inline ulong rotr64(const ulong w, const uint c)
 {
-	return (w >> c) | (w << (64 - c));
+//	return (w >> c) | (w << (64 - c));
+	return rotate(w, (ulong)(64 - c));
 }
 
 __constant ulong blake2b_IV[8] = {

Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ