Date: Thu, 20 Aug 2015 06:30:10 +0300
From: Solar Designer <solar@...nwall.com>
To: john-dev@...ts.openwall.com
Subject: Re: PHC: Argon2 on GPU

On Thu, Aug 20, 2015 at 04:53:55AM +0300, Solar Designer wrote:
> On Wed, Aug 19, 2015 at 07:41:02PM +0200, Agnieszka Bielec wrote:
> > ptxas info    : Function properties for FillSegment
> > ptxas         .     0 bytes stack frame, 17400 bytes spill stores, 19352 bytes spill loads
> > ptxas info    : Function properties for GenerateAddresses
> > ptxas         .     0 bytes stack frame, 7780 bytes spill stores, 11648 bytes spill loads
> 
> The spills in FillSegment and GenerateAddresses are pretty bad.  Where
> do they come from, and why so much?  In FillSegment you use 1 KB per
> work-item for addresses[], in GenerateAddresses you use 2 KB for two
> blocks.  GenerateAddresses is called from FillSegment, so adds its
> private memory needs on top of FillSegment's.

There's also 1 KB ref_block[] in ComputeBlock and in ComputeBlock_pgg.
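
For a rough tally (assuming BLOCK_SIZE is 1024 as in Argon2, so each of
these buffers is 1 KB), the per-work-item private arrays on the
FillSegment call path add up to about 4 KB even before the BLAKE2
working state:

	uint addresses[256];		// FillSegment
	uchar zero_block[BLOCK_SIZE];	// GenerateAddresses
	uint input_block[BLOCK_SIZE/4];	// GenerateAddresses
	ulong2 ref_block[64];		// ComputeBlock / ComputeBlock_pgg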

On super's -dev=5, I was getting:

ptxas info    : Function properties for FillSegment
ptxas         .     8216 bytes stack frame, 9708 bytes spill stores, 7776 bytes spill loads
ptxas info    : Function properties for GenerateAddresses
ptxas         .     6104 bytes stack frame, 4056 bytes spill stores, 4124 bytes spill loads

I've optimized this to:

ptxas info    : Function properties for FillSegment
ptxas         .     4408 bytes stack frame, 5984 bytes spill stores, 4020 bytes spill loads
ptxas info    : Function properties for GenerateAddresses
ptxas         .     1304 bytes stack frame, 388 bytes spill stores, 400 bytes spill loads

with the attached patch.  As it is, it provides no speedup for me (in
fact, there's a very slight slowdown), but it should illustrate to you
what to optimize.  I expect that once you convert those uint operations
to work on ulong2 all the time, you'll see a slight speedup.  (The
performance impact of these code changes is relatively minor because
GenerateAddresses accounts for only a small part of the total running
time.  There is a significant reduction in global memory usage, though,
as seen via nvidia-smi.)
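
Concretely, the loops that my patch leaves operating on 256 uints could
each handle a full ulong2 per iteration once the address block is kept
as ulong2 (a sketch; "block" here stands for a ulong2-typed view of
addresses[], and prev_block would become ulong2 prev_block[64]):

	for (i = 0; i < 64; i++)
		prev_block[i] = block[i];	// save pre-BLAKE2 copy, 16 bytes at a time
	ComputeBlock(block);
	for (i = 0; i < 64; i++)
		block[i] ^= prev_block[i];	// feedback XOR on full vectors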

In fact, those typecasts between ulong2 and uint pointers are probably
disallowed, as they violate strict aliasing rules.  Also, your code
heavily depends on the architecture being little-endian (just like
Argon2's original code did, which is a known bug).  You should try to
avoid that as you proceed to optimize your OpenCL kernels.  You'll find
that avoiding endianness dependencies goes along with avoiding strict
aliasing violations and achieving better speed as well (since the kernel
would use its full allocated SIMD width all the time, rather than only
part of the time).
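
One way to address both at once is to build the 64-bit words explicitly
instead of storing uints through a casted pointer.  For instance, the
input block setup in GenerateAddresses could look like this (a sketch,
assuming the block is declared as ulong2 from the start):

	ulong2 block[64];

	block[0] = (ulong2)((ulong)position->pass | ((ulong)position->lane << 32),
	    (ulong)position->slice | ((ulong)position->index << 32));
	block[1] = (ulong2)((ulong)0xFFFFFFFF, (ulong)0);
	for (i = 2; i < 64; i++)
		block[i] = (ulong2)(0, 0);
	ComputeBlock(block);

The shifts pin down Argon2's little-endian word layout regardless of
device endianness, and ComputeBlock then sees properly typed ulong2
data with no pointer type-punning.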

BTW, out_tmp[] in Initialize() appears to be twice as large as it needs
to be:

	ulong2 out_tmp[BLOCK_SIZE/8];

ulong2 is 16 bytes, but you divide by 8.  Or is this on purpose?  Why?
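
If the intent was to hold exactly one BLOCK_SIZE-byte block, I'd have
expected:

	ulong2 out_tmp[BLOCK_SIZE / sizeof(ulong2)];	// == BLOCK_SIZE/16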

Alexander

diff --git a/src/opencl/argon2i_kernel.cl b/src/opencl/argon2i_kernel.cl
index caf9ea0..98ada94 100755
--- a/src/opencl/argon2i_kernel.cl
+++ b/src/opencl/argon2i_kernel.cl
@@ -78,26 +78,12 @@ static int blake2b_long(uchar *out, const void *in, const uint outlen, const ulo
 	return 0;
 }
 
-static void ComputeBlock(ulong2 *state, ulong2 *ref_block_ptr, ulong2 *next_block_ptr)
+static void ComputeBlock(ulong2 *state)
 {
-	ulong2 ref_block[64];
-	uchar i;
-
 	ulong2 t0,t1;
 	uchar16 r16 = (uchar16) (2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
 	uchar16 r24 = (uchar16) (3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
 
-	for (i = 0; i < 64; i++)
-	{
-		ref_block[i] = ref_block_ptr[i];
-	}
-
-	for (i = 0; i < 64; i++)
-	{
-		ref_block[i] = state[i] = state[i] ^ ref_block[i]; //XORing the reference block to the state and storing the copy of the result
-	}
-
-
 	// BLAKE2 - begin
 
 	BLAKE2_ROUND_NO_MSG_V(state[0], state[1], state[2], state[3],
@@ -150,15 +136,8 @@ static void ComputeBlock(ulong2 *state, ulong2 *ref_block_ptr, ulong2 *next_bloc
 		state[39], state[47], state[55], state[63]);
 
 	// BLAKE2 - end
-
-	for (i = 0; i< 64; i++)
-	{
-		state[i] = state[i] ^ ref_block[i]; //Feedback
-		next_block_ptr[i]=state[i];
-	}
 }
 
-
 static void ComputeBlock_pgg(ulong2 *state, __global ulong2 *ref_block_ptr, __global ulong2 *next_block_ptr)
 {
 	ulong2 ref_block[64];
@@ -289,22 +268,30 @@ static void Finalize_g(__global ulong2 *state, uchar* out, uint outlen, uchar la
 static void GenerateAddresses(const scheme_info_t* info, position_info_t* position, uint* addresses)//generate 256 addresses
 {
 	uint i;
-	uchar zero_block[BLOCK_SIZE];
-	uint input_block[BLOCK_SIZE/4];
+	uint prev_block[256];
 	uint segment_length;
 	uint barrier1; //Number of blocks generated in previous slices
 	uint barrier2; //Number of blocks that we can reference in total (including the last blocks of each lane
 	uint start = 0;
-	memset(zero_block, 0,BLOCK_SIZE);
-	memset(input_block, 0, 256 * sizeof(uint));
-	input_block[0] = position->pass;
-	input_block[1] = position->lane;
-	input_block[2] = position->slice;
-	input_block[3] = position->index;
-	input_block[4] = 0xFFFFFFFF;
-	ComputeBlock((ulong2*)input_block, (ulong2*) zero_block, (ulong2*) addresses);
-	ComputeBlock((ulong2*)zero_block, (ulong2*) addresses, (ulong2*) addresses);
 
+	addresses[0] = position->pass;
+	addresses[1] = position->lane;
+	addresses[2] = position->slice;
+	addresses[3] = position->index;
+	addresses[4] = 0xFFFFFFFF;
+	for (i = 5; i < 256; i++)
+		addresses[i] = 0;
+	ComputeBlock((ulong2 *)addresses);
+	addresses[0] ^= position->pass;
+	addresses[1] ^= position->lane;
+	addresses[2] ^= position->slice;
+	addresses[3] ^= position->index;
+	addresses[4] ^= 0xFFFFFFFF;
+	for (i = 0; i < 256; i++)
+		prev_block[i] = addresses[i];
+	ComputeBlock((ulong2 *)addresses);
+	for (i = 0; i < 256; i++)
+		addresses[i] ^= prev_block[i];
 
 	/*Making block offsets*/
 	segment_length = info->mem_size / ((info->lanes)*SYNC_POINTS);
