
Date: Wed, 2 Sep 2015 18:52:25 +0300 From: Solar Designer <solar@...nwall.com> To: johndev@...ts.openwall.com Subject: Re: SHA1 H() magnum, Lei  On Wed, Sep 02, 2015 at 06:20:25PM +0300, Solar Designer wrote: > SHA1's H() aka F3() is the same as SHA2's Maj() And it turns out that while we appear to be optimally using bitselect() or vcmov() for Maj(), the fallback expressions that we use vary across source files and are not always optimal: [solar@...er src]$ grep r 'define .*Maj' . ./unused/sha512_kernel.cl:#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z)) ./rawSHA512_ng_fmt_plug.c:#define Maj(x,y,z) vcmov(x, y, vxor(z, y)) ./cuda_cryptsha512.h:#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z)) ./opencl/pwsafe_kernel.cl:#define Maj(x,y,z) (bitselect(y, x,(z^y))) ./opencl/pwsafe_kernel.cl:#define Maj(x, y, z) ((y & z)  (x & (y  z))) ./opencl_sha512.h: #define Maj(x,y,z) bitselect(x, y, z ^ x) ./opencl_sha512.h: #define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z)) ./cuda_cryptsha256.h:#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z)) ./escrypt/sha256.c:#define Maj(x, y, z) ((x & (y  z))  (y & z)) ./cuda_xsha512.h:#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z)) ./opencl_sha2.h:#define Maj(x, y, z) bitselect(x, y, z ^ x) ./opencl_sha2.h:#define Maj(x, y, z) ((x & y)  (z & (x  y))) ./rawSHA256_ng_fmt_plug.c:#define Maj(x,y,z) vcmov(x, y, vxor(z, y)) ./opencl_sha256.h: #define Maj(x, y, z) bitselect(x, y, z ^ x) ./opencl_sha256.h: #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z)) ./simdintrinsics.c:#define Maj(x,y,z) vcmov(x, y, vxor(z, y)) ./simdintrinsics.c:#define Maj(x,y,z) vcmov(x, y, vxor(z, y)) ./cuda_pwsafe.h:#define Maj(x, y, z) ((y & z)  (x & (y  z))) ./cuda_rawsha256.h:#define Maj(x,y,z) ( (x & y)  (z & (x  y)) ) ./cuda_rawsha512.h:#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z)) As you can see, some of these use 5 operations instead of 4, and some use the parallelismlacking approach with possibly emulated vcmov(). I think we should standardize on the parallelismenabled 4 operation expression for when there's no native bitselect() or vcmov()  for both SHA1 and SHA2 in the same way. This means we should probably check for AMD or at least NVIDIA Maxwell, falling back to the 4 operation expression on older NVIDIA GPUs. (Need to check which expression gets compiled to LOP3.LUT on Maxwell, though.) A curious aspect is that Maj() is invariant with respect to the ordering of its arguments. We can see it in the grep output above: some of the expressions are the same except that they have x, y, z reordered in different ways. We could test all 6 possible orderings in different contexts (SHA1 vs. SHA256 vs. SHA512, and different OpenCL kernels, etc.) and see which is faster where (this might in fact differ). Attached to this message is a program I used to search for possible optimized expressions like this. No new findings from it, but it did remind me of the issues I described in these two messages. I was hoping it might find a 2 operation expression for MD5's I(), but no luck. It doesn't yet test two bitselect()'s per expression, though  this is worth adding and trying again (many possibilities to test there). Oh, and of course on Maxwell and AVX512 MD5's I() (and all others) is just one operation, if the compiler manages. We should check the generated code for our kernels on Maxwell. Lei  I think you haven't gotten around to introducing AVX512 ternary logic intrinsics yet, have you? Unfortunately, we don't yet have hardware to test them on, but you could test on Intel SDE or/and by emulating them with macros. Alexander #include <stdio.h> #define Ch(x, y, z) ((x & y) ^ (~x & z)) #define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z)) //#define f(x, y, z) ((y) ^ ((x)  ~(z))) #define f(x, y, z) ((x & y)  (z & (x  y))) //#define f(x, y, z) Maj(x, y, z) #define sel(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) int main(void) { unsigned int x, y, z; unsigned int i, j, k; for (k = 0; k <= 25; k++) for (j = 0; j < 6; j++) for (i = 0; i < 4; i++) { unsigned int ok = 0; for (x = 0; x < 2; x++) for (y = 0; y < 2; y++) for (z = 0; z < 2; z++) { unsigned int f1 = f(x, y, z); unsigned int a[4] = {0, x, y, z}; switch (j) { case 1: a[1] = x; a[2] = z; a[3] = y; break; case 2: a[1] = y; a[2] = x; a[3] = z; break; case 3: a[1] = y; a[2] = z; a[3] = x; break; case 4: a[1] = z; a[2] = x; a[3] = y; break; case 5: a[1] = z; a[2] = y; a[3] = x; break; } unsigned int f2 = sel(a[1], a[2], a[3]); a[0] = f2; if (k == 25) { if (f2 == f1) ok++; continue; } unsigned int *which = &a[i]; switch (k) { case 0: *which = ~*which; break; case 1: case 2: case 3: if (which == &a[k]) continue; *which ^= a[k]; break; case 4: case 5: case 6: if (which == &a[k  3]) continue; *which = a[k  3]; break; case 7: case 8: case 9: if (which == &a[k  6]) continue; *which &= a[k  6]; break; case 10: case 11: case 12: if (which == &a[k  9]) continue; *which ^= ~a[k  9]; break; case 13: case 14: case 15: if (which == &a[k  12]) continue; *which = ~a[k  12]; break; case 16: case 17: case 18: if (which == &a[k  15]) continue; *which = ~*which  a[k  15]; break; case 19: case 20: case 21: if (which == &a[k  18]) continue; *which &= ~a[k  18]; break; case 22: case 23: case 24: if (which == &a[k  21]) continue; *which = ~*which & a[k  21]; break; } if (which != &a[0]) f2 = sel(a[1], a[2], a[3]); if (f2 == f1) ok++; } if (ok == 8) printf("k = %u j = %u i = %u\n", k, j, i); } return 0; }
Powered by blists  more mailing lists