Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Wed, 2 Sep 2015 18:52:25 +0300
From: Solar Designer <solar@...nwall.com>
To: john-dev@...ts.openwall.com
Subject: Re: SHA-1 H()

magnum, Lei -

On Wed, Sep 02, 2015 at 06:20:25PM +0300, Solar Designer wrote:
> SHA-1's H() aka F3() is the same as SHA-2's Maj()

And it turns out that while we appear to be optimally using bitselect()
or vcmov() for Maj(), the fallback expressions that we use vary across
source files and are not always optimal:

[solar@...er src]$ grep -r 'define .*Maj' .
./unused/sha512_kernel.cl:#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z))
./rawSHA512_ng_fmt_plug.c:#define Maj(x,y,z) vcmov(x, y, vxor(z, y))
./cuda_cryptsha512.h:#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z))
./opencl/pwsafe_kernel.cl:#define Maj(x,y,z) (bitselect(y, x,(z^y)))
./opencl/pwsafe_kernel.cl:#define Maj(x, y, z) ((y & z) | (x & (y | z)))
./opencl_sha512.h:      #define Maj(x,y,z)      bitselect(x, y, z ^ x)
./opencl_sha512.h:        #define Maj(x,y,z)      ((x & y) ^ (x & z) ^ (y & z))
./cuda_cryptsha256.h:#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z))
./escrypt/sha256.c:#define Maj(x, y, z) ((x & (y | z)) | (y & z))
./cuda_xsha512.h:#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z))
./opencl_sha2.h:#define Maj(x, y, z)    bitselect(x, y, z ^ x)
./opencl_sha2.h:#define Maj(x, y, z)    ((x & y) | (z & (x | y)))
./rawSHA256_ng_fmt_plug.c:#define Maj(x,y,z) vcmov(x, y, vxor(z, y))
./opencl_sha256.h:      #define Maj(x, y, z)    bitselect(x, y, z ^ x)
./opencl_sha256.h:      #define Maj(x, y, z)    ((x & y) ^ (x & z) ^ (y & z))
./simd-intrinsics.c:#define Maj(x,y,z) vcmov(x, y, vxor(z, y))
./simd-intrinsics.c:#define Maj(x,y,z) vcmov(x, y, vxor(z, y))
./cuda_pwsafe.h:#define Maj(x, y, z) ((y & z) | (x & (y | z)))
./cuda_rawsha256.h:#define Maj(x,y,z) ( (x & y) | (z & (x | y)) )
./cuda_rawsha512.h:#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z))

As you can see, some of these use 5 operations instead of 4, and some
use the parallelism-lacking approach with possibly emulated vcmov().

I think we should standardize on the parallelism-enabled 4 operation
expression for when there's no native bitselect() or vcmov() - for both
SHA-1 and SHA-2 in the same way.

This means we should probably check for AMD or at least NVIDIA Maxwell,
falling back to the 4 operation expression on older NVIDIA GPUs.  (Need
to check which expression gets compiled to LOP3.LUT on Maxwell, though.)

A curious aspect is that Maj() is invariant with respect to the ordering
of its arguments.  We can see it in the grep output above: some of the
expressions are the same except that they have x, y, z re-ordered in
different ways.  We could test all 6 possible orderings in different
contexts (SHA-1 vs. SHA-256 vs. SHA-512, and different OpenCL kernels,
etc.) and see which is faster where (this might in fact differ).

Attached to this message is a program I used to search for possible
optimized expressions like this.  No new findings from it, but it did
remind me of the issues I described in these two messages.  I was hoping
it might find a 2 operation expression for MD5's I(), but no luck.
It doesn't yet test two bitselect()'s per expression, though - this is
worth adding and trying again (many possibilities to test there).

Oh, and of course on Maxwell and AVX-512 MD5's I() (and all others) is
just one operation, if the compiler manages.  We should check the
generated code for our kernels on Maxwell.

Lei - I think you haven't gotten around to introducing AVX-512 ternary
logic intrinsics yet, have you?  Unfortunately, we don't yet have
hardware to test them on, but you could test on Intel SDE or/and by
emulating them with macros.

Alexander

#include <stdio.h>

#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))

//#define f(x, y, z) ((y) ^ ((x) | ~(z)))
#define f(x, y, z) ((x & y) | (z & (x | y)))
//#define f(x, y, z) Maj(x, y, z)

#define sel(x, y, z) ((y) ^ ((z) & ((x) ^ (y))))

int main(void)
{
	unsigned int x, y, z;
	unsigned int i, j, k;

	for (k = 0; k <= 25; k++)
	for (j = 0; j < 6; j++)
	for (i = 0; i < 4; i++) {
		unsigned int ok = 0;

		for (x = 0; x < 2; x++)
		for (y = 0; y < 2; y++)
		for (z = 0; z < 2; z++) {
			unsigned int f1 = f(x, y, z);
			unsigned int a[4] = {0, x, y, z};
			switch (j) {
				case 1:
					a[1] = x;
					a[2] = z;
					a[3] = y;
					break;
				case 2:
					a[1] = y;
					a[2] = x;
					a[3] = z;
					break;
				case 3:
					a[1] = y;
					a[2] = z;
					a[3] = x;
					break;
				case 4:
					a[1] = z;
					a[2] = x;
					a[3] = y;
					break;
				case 5:
					a[1] = z;
					a[2] = y;
					a[3] = x;
					break;
			}
			unsigned int f2 = sel(a[1], a[2], a[3]);
			a[0] = f2;
			if (k == 25) {
				if (f2 == f1) ok++;
				continue;
			}
			unsigned int *which = &a[i];
			switch (k) {
				case 0:
					*which = ~*which;
					break;
				case 1:
				case 2:
				case 3:
					if (which == &a[k]) continue;
					*which ^= a[k];
					break;
				case 4:
				case 5:
				case 6:
					if (which == &a[k - 3]) continue;
					*which |= a[k - 3];
					break;
				case 7:
				case 8:
				case 9:
					if (which == &a[k - 6]) continue;
					*which &= a[k - 6];
					break;
				case 10:
				case 11:
				case 12:
					if (which == &a[k - 9]) continue;
					*which ^= ~a[k - 9];
					break;
				case 13:
				case 14:
				case 15:
					if (which == &a[k - 12]) continue;
					*which |= ~a[k - 12];
					break;
				case 16:
				case 17:
				case 18:
					if (which == &a[k - 15]) continue;
					*which = ~*which | a[k - 15];
					break;
				case 19:
				case 20:
				case 21:
					if (which == &a[k - 18]) continue;
					*which &= ~a[k - 18];
					break;
				case 22:
				case 23:
				case 24:
					if (which == &a[k - 21]) continue;
					*which = ~*which & a[k - 21];
					break;
			}
			if (which != &a[0])
				f2 = sel(a[1], a[2], a[3]);
			if (f2 == f1) ok++;
		}

		if (ok == 8)
			printf("k = %u j = %u i = %u\n", k, j, i);
	}

	return 0;
}

Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ