// Proof-of-concept bitslice implementation of the SHA256 compression function. // // Written by Alain Espinosa in 2015, and placed in the // public domain. There's absolutely no warranty. Although this is not required // formally, due credit will be appreciated if you re-use this code or concepts. // // Clarity was the first goal, some easy optimizations remains. // Provided code for AVX2, SSE2 and 64-bits versions. // // Some ideas taken from Solar Designer "md5slice.c" implementation. // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Benchmark configuration: Windows 8.1, Visual Studio 2013, Core i5-4670 3.4GHz, only one thread // AVX2 : 12.5 millions keys per second // SSE2 : 6.92 millions keys per second // 64-bits: 3.47 millions keys per second // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Analysis of implementation differences between bitslice and normal SHA256. // // The main difference between the two are related to how we handle SUMS and ROTATE/SHIFTS. // - Normal SHA256 spend 3 instructions to rotate, 1 to shift and 1 to sum. // - Bitslice SHA256 had free rotate/shifts and 5 instructions to sum. // // Lets get some performance approximation: calculate the instructions needed for one SHA256 step //---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- // W[0] += R1(W[14]) + W[9] + R0(W[1]); H += R_E(E) + (G ^ (E & (F ^ G))) + 0xD807AA98 + W[0]; D += H; H += R_A(A) + ((A & B) | (C & (A | B))); //---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- // - Normal : 1 3+3+1+2 1 1 3+3+1+2 1 3*3+2 1 1 1 1 1 1 1 1 3*3+2 1 1 1 1 1 = 57 instructions // - BS : 5 2 5 5 2 5 2 5 1 1 1 5 5 5 5 2 5 1 1 1 1 = 65 instructions // - Loads : 1 1 1 1 (We can maintain A, B, C, D, E, F, G, H in registers all the time) = 4 loads + 1 store (W[0]) // - BS Loads: 1 1 1 1 1 1 1 1 1 1 1 1 = 12 loads + 2 store (W[0] and H) // // So we can expect that bitslice SHA256 will be (79-62)/62 = 27% slower than normal SHA256 #include #include #include #include #define PRIVATE static #ifdef _WIN32 #define rotate32(x,shift) _rotl(x,shift) #else #define rotate32(x,shift) (((x)>>(32-(n)))|((x)<<(n))) #endif ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Baseline SHA256 implementation ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #define INIT_A 0x6a09e667 #define INIT_B 0xbb67ae85 #define INIT_C 0x3c6ef372 #define INIT_D 0xa54ff53a #define INIT_E 0x510e527f #define INIT_F 0x9b05688c #define INIT_G 0x1f83d9ab #define INIT_H 0x5be0cd19 #define R_E(x) (rotate32(x,26) ^ rotate32(x,21) ^ rotate32(x,7 )) #define R_A(x) (rotate32(x,30) ^ rotate32(x,19) ^ rotate32(x,10)) #define R0(x) (rotate32(x,25) ^ rotate32(x,14) ^ (x>>3)) #define R1(x) (rotate32(x,15) ^ rotate32(x,13) ^ (x>>10)) PRIVATE void sha256_process_block(uint32_t state[8], uint32_t W[16]) { uint32_t A = state[0]; uint32_t B = state[1]; uint32_t C = state[2]; uint32_t D = state[3]; uint32_t E = state[4]; uint32_t F = state[5]; uint32_t G = state[6]; uint32_t H = state[7]; /* Round 1 */ H += R_E(E) + (G ^ (E & (F ^ G))) + 0x428A2F98 + W[ 0]; D += H; H += R_A(A) + ((A & B) | (C & (A | B))); G += R_E(D) + (F ^ (D & (E ^ F))) + 0x71374491 + W[ 1]; C += G; G += R_A(H) + ((H & A) | (B & (H | A))); F += R_E(C) + (E ^ (C & (D ^ E))) + 0xB5C0FBCF + W[ 2]; B += F; F += R_A(G) + ((G & H) | (A & (G | H))); E += R_E(B) + (D ^ (B & (C ^ D))) + 0xE9B5DBA5 + W[ 3]; A += E; E += R_A(F) + ((F & G) | (H & (F | G))); D += R_E(A) + (C ^ (A & (B ^ C))) + 0x3956C25B + W[ 4]; H += D; D += R_A(E) + ((E & F) | (G & (E | F))); C += R_E(H) + (B ^ (H & (A ^ B))) + 0x59F111F1 + W[ 5]; G += C; C += R_A(D) + ((D & E) | (F & (D | E))); B += R_E(G) + (A ^ (G & (H ^ A))) + 0x923F82A4 + W[ 6]; F += B; B += R_A(C) + ((C & D) | (E & (C | D))); A += R_E(F) + (H ^ (F & (G ^ H))) + 0xAB1C5ED5 + W[ 7]; E += A; A += R_A(B) + ((B & C) | (D & (B | C))); H += R_E(E) + (G ^ (E & (F ^ G))) + 0xD807AA98 + W[ 8]; D += H; H += R_A(A) + ((A & B) | (C & (A | B))); G += R_E(D) + (F ^ (D & (E ^ F))) + 0x12835B01 + W[ 9]; C += G; G += R_A(H) + ((H & A) | (B & (H | A))); F += R_E(C) + (E ^ (C & (D ^ E))) + 0x243185BE + W[10]; B += F; F += R_A(G) + ((G & H) | (A & (G | H))); E += R_E(B) + (D ^ (B & (C ^ D))) + 0x550C7DC3 + W[11]; A += E; E += R_A(F) + ((F & G) | (H & (F | G))); D += R_E(A) + (C ^ (A & (B ^ C))) + 0x72BE5D74 + W[12]; H += D; D += R_A(E) + ((E & F) | (G & (E | F))); C += R_E(H) + (B ^ (H & (A ^ B))) + 0x80DEB1FE + W[13]; G += C; C += R_A(D) + ((D & E) | (F & (D | E))); B += R_E(G) + (A ^ (G & (H ^ A))) + 0x9BDC06A7 + W[14]; F += B; B += R_A(C) + ((C & D) | (E & (C | D))); A += R_E(F) + (H ^ (F & (G ^ H))) + 0xC19BF174 + W[15]; E += A; A += R_A(B) + ((B & C) | (D & (B | C))); /* Round 2 */ W[ 0] += R1(W[14]) + W[9 ] + R0(W[1 ]); H += R_E(E) + (G ^ (E & (F ^ G))) + 0xE49B69C1 + W[ 0]; D += H; H += R_A(A) + ((A & B) | (C & (A | B))); W[ 1] += R1(W[15]) + W[10] + R0(W[2 ]); G += R_E(D) + (F ^ (D & (E ^ F))) + 0xEFBE4786 + W[ 1]; C += G; G += R_A(H) + ((H & A) | (B & (H | A))); W[ 2] += R1(W[0 ]) + W[11] + R0(W[3 ]); F += R_E(C) + (E ^ (C & (D ^ E))) + 0x0FC19DC6 + W[ 2]; B += F; F += R_A(G) + ((G & H) | (A & (G | H))); W[ 3] += R1(W[1 ]) + W[12] + R0(W[4 ]); E += R_E(B) + (D ^ (B & (C ^ D))) + 0x240CA1CC + W[ 3]; A += E; E += R_A(F) + ((F & G) | (H & (F | G))); W[ 4] += R1(W[2 ]) + W[13] + R0(W[5 ]); D += R_E(A) + (C ^ (A & (B ^ C))) + 0x2DE92C6F + W[ 4]; H += D; D += R_A(E) + ((E & F) | (G & (E | F))); W[ 5] += R1(W[3 ]) + W[14] + R0(W[6 ]); C += R_E(H) + (B ^ (H & (A ^ B))) + 0x4A7484AA + W[ 5]; G += C; C += R_A(D) + ((D & E) | (F & (D | E))); W[ 6] += R1(W[4 ]) + W[15] + R0(W[7 ]); B += R_E(G) + (A ^ (G & (H ^ A))) + 0x5CB0A9DC + W[ 6]; F += B; B += R_A(C) + ((C & D) | (E & (C | D))); W[ 7] += R1(W[5 ]) + W[0 ] + R0(W[8 ]); A += R_E(F) + (H ^ (F & (G ^ H))) + 0x76F988DA + W[ 7]; E += A; A += R_A(B) + ((B & C) | (D & (B | C))); W[ 8] += R1(W[6 ]) + W[1 ] + R0(W[9 ]); H += R_E(E) + (G ^ (E & (F ^ G))) + 0x983E5152 + W[ 8]; D += H; H += R_A(A) + ((A & B) | (C & (A | B))); W[ 9] += R1(W[7 ]) + W[2 ] + R0(W[10]); G += R_E(D) + (F ^ (D & (E ^ F))) + 0xA831C66D + W[ 9]; C += G; G += R_A(H) + ((H & A) | (B & (H | A))); W[10] += R1(W[8 ]) + W[3 ] + R0(W[11]); F += R_E(C) + (E ^ (C & (D ^ E))) + 0xB00327C8 + W[10]; B += F; F += R_A(G) + ((G & H) | (A & (G | H))); W[11] += R1(W[9 ]) + W[4 ] + R0(W[12]); E += R_E(B) + (D ^ (B & (C ^ D))) + 0xBF597FC7 + W[11]; A += E; E += R_A(F) + ((F & G) | (H & (F | G))); W[12] += R1(W[10]) + W[5 ] + R0(W[13]); D += R_E(A) + (C ^ (A & (B ^ C))) + 0xC6E00BF3 + W[12]; H += D; D += R_A(E) + ((E & F) | (G & (E | F))); W[13] += R1(W[11]) + W[6 ] + R0(W[14]); C += R_E(H) + (B ^ (H & (A ^ B))) + 0xD5A79147 + W[13]; G += C; C += R_A(D) + ((D & E) | (F & (D | E))); W[14] += R1(W[12]) + W[7 ] + R0(W[15]); B += R_E(G) + (A ^ (G & (H ^ A))) + 0x06CA6351 + W[14]; F += B; B += R_A(C) + ((C & D) | (E & (C | D))); W[15] += R1(W[13]) + W[8 ] + R0(W[0 ]); A += R_E(F) + (H ^ (F & (G ^ H))) + 0x14292967 + W[15]; E += A; A += R_A(B) + ((B & C) | (D & (B | C))); /* Round 3 */ W[ 0] += R1(W[14]) + W[9 ] + R0(W[1 ]); H += R_E(E) + (G ^ (E & (F ^ G))) + 0x27B70A85 + W[ 0]; D += H; H += R_A(A) + ((A & B) | (C & (A | B))); W[ 1] += R1(W[15]) + W[10] + R0(W[2 ]); G += R_E(D) + (F ^ (D & (E ^ F))) + 0x2E1B2138 + W[ 1]; C += G; G += R_A(H) + ((H & A) | (B & (H | A))); W[ 2] += R1(W[0 ]) + W[11] + R0(W[3 ]); F += R_E(C) + (E ^ (C & (D ^ E))) + 0x4D2C6DFC + W[ 2]; B += F; F += R_A(G) + ((G & H) | (A & (G | H))); W[ 3] += R1(W[1 ]) + W[12] + R0(W[4 ]); E += R_E(B) + (D ^ (B & (C ^ D))) + 0x53380D13 + W[ 3]; A += E; E += R_A(F) + ((F & G) | (H & (F | G))); W[ 4] += R1(W[2 ]) + W[13] + R0(W[5 ]); D += R_E(A) + (C ^ (A & (B ^ C))) + 0x650A7354 + W[ 4]; H += D; D += R_A(E) + ((E & F) | (G & (E | F))); W[ 5] += R1(W[3 ]) + W[14] + R0(W[6 ]); C += R_E(H) + (B ^ (H & (A ^ B))) + 0x766A0ABB + W[ 5]; G += C; C += R_A(D) + ((D & E) | (F & (D | E))); W[ 6] += R1(W[4 ]) + W[15] + R0(W[7 ]); B += R_E(G) + (A ^ (G & (H ^ A))) + 0x81C2C92E + W[ 6]; F += B; B += R_A(C) + ((C & D) | (E & (C | D))); W[ 7] += R1(W[5 ]) + W[0 ] + R0(W[8 ]); A += R_E(F) + (H ^ (F & (G ^ H))) + 0x92722C85 + W[ 7]; E += A; A += R_A(B) + ((B & C) | (D & (B | C))); W[ 8] += R1(W[6 ]) + W[1 ] + R0(W[9 ]); H += R_E(E) + (G ^ (E & (F ^ G))) + 0xA2BFE8A1 + W[ 8]; D += H; H += R_A(A) + ((A & B) | (C & (A | B))); W[ 9] += R1(W[7 ]) + W[2 ] + R0(W[10]); G += R_E(D) + (F ^ (D & (E ^ F))) + 0xA81A664B + W[ 9]; C += G; G += R_A(H) + ((H & A) | (B & (H | A))); W[10] += R1(W[8 ]) + W[3 ] + R0(W[11]); F += R_E(C) + (E ^ (C & (D ^ E))) + 0xC24B8B70 + W[10]; B += F; F += R_A(G) + ((G & H) | (A & (G | H))); W[11] += R1(W[9 ]) + W[4 ] + R0(W[12]); E += R_E(B) + (D ^ (B & (C ^ D))) + 0xC76C51A3 + W[11]; A += E; E += R_A(F) + ((F & G) | (H & (F | G))); W[12] += R1(W[10]) + W[5 ] + R0(W[13]); D += R_E(A) + (C ^ (A & (B ^ C))) + 0xD192E819 + W[12]; H += D; D += R_A(E) + ((E & F) | (G & (E | F))); W[13] += R1(W[11]) + W[6 ] + R0(W[14]); C += R_E(H) + (B ^ (H & (A ^ B))) + 0xD6990624 + W[13]; G += C; C += R_A(D) + ((D & E) | (F & (D | E))); W[14] += R1(W[12]) + W[7 ] + R0(W[15]); B += R_E(G) + (A ^ (G & (H ^ A))) + 0xF40E3585 + W[14]; F += B; B += R_A(C) + ((C & D) | (E & (C | D))); W[15] += R1(W[13]) + W[8 ] + R0(W[0 ]); A += R_E(F) + (H ^ (F & (G ^ H))) + 0x106AA070 + W[15]; E += A; A += R_A(B) + ((B & C) | (D & (B | C))); /* Round 4 */ W[ 0] += R1(W[14]) + W[9 ] + R0(W[1 ]); H += R_E(E) + (G ^ (E & (F ^ G))) + 0x19A4C116 + W[ 0]; D += H; H += R_A(A) + ((A & B) | (C & (A | B))); W[ 1] += R1(W[15]) + W[10] + R0(W[2 ]); G += R_E(D) + (F ^ (D & (E ^ F))) + 0x1E376C08 + W[ 1]; C += G; G += R_A(H) + ((H & A) | (B & (H | A))); W[ 2] += R1(W[0 ]) + W[11] + R0(W[3 ]); F += R_E(C) + (E ^ (C & (D ^ E))) + 0x2748774C + W[ 2]; B += F; F += R_A(G) + ((G & H) | (A & (G | H))); W[ 3] += R1(W[1 ]) + W[12] + R0(W[4 ]); E += R_E(B) + (D ^ (B & (C ^ D))) + 0x34B0BCB5 + W[ 3]; A += E; E += R_A(F) + ((F & G) | (H & (F | G))); W[ 4] += R1(W[2 ]) + W[13] + R0(W[5 ]); D += R_E(A) + (C ^ (A & (B ^ C))) + 0x391C0CB3 + W[ 4]; H += D; D += R_A(E) + ((E & F) | (G & (E | F))); W[ 5] += R1(W[3 ]) + W[14] + R0(W[6 ]); C += R_E(H) + (B ^ (H & (A ^ B))) + 0x4ED8AA4A + W[ 5]; G += C; C += R_A(D) + ((D & E) | (F & (D | E))); W[ 6] += R1(W[4 ]) + W[15] + R0(W[7 ]); B += R_E(G) + (A ^ (G & (H ^ A))) + 0x5B9CCA4F + W[ 6]; F += B; B += R_A(C) + ((C & D) | (E & (C | D))); W[ 7] += R1(W[5 ]) + W[0 ] + R0(W[8 ]); A += R_E(F) + (H ^ (F & (G ^ H))) + 0x682E6FF3 + W[ 7]; E += A; A += R_A(B) + ((B & C) | (D & (B | C))); W[ 8] += R1(W[6 ]) + W[1 ] + R0(W[9 ]); H += R_E(E) + (G ^ (E & (F ^ G))) + 0x748F82EE + W[ 8]; D += H; H += R_A(A) + ((A & B) | (C & (A | B))); W[ 9] += R1(W[7 ]) + W[2 ] + R0(W[10]); G += R_E(D) + (F ^ (D & (E ^ F))) + 0x78A5636F + W[ 9]; C += G; G += R_A(H) + ((H & A) | (B & (H | A))); W[10] += R1(W[8 ]) + W[3 ] + R0(W[11]); F += R_E(C) + (E ^ (C & (D ^ E))) + 0x84C87814 + W[10]; B += F; F += R_A(G) + ((G & H) | (A & (G | H))); W[11] += R1(W[9 ]) + W[4 ] + R0(W[12]); E += R_E(B) + (D ^ (B & (C ^ D))) + 0x8CC70208 + W[11]; A += E; E += R_A(F) + ((F & G) | (H & (F | G))); W[12] += R1(W[10]) + W[5 ] + R0(W[13]); D += R_E(A) + (C ^ (A & (B ^ C))) + 0x90BEFFFA + W[12]; H += D; D += R_A(E) + ((E & F) | (G & (E | F))); W[13] += R1(W[11]) + W[6 ] + R0(W[14]); C += R_E(H) + (B ^ (H & (A ^ B))) + 0xA4506CEB + W[13]; G += C; C += R_A(D) + ((D & E) | (F & (D | E))); W[14] += R1(W[12]) + W[7 ] + R0(W[15]); B += R_E(G) + (A ^ (G & (H ^ A))) + 0xBEF9A3F7 + W[14]; F += B; B += R_A(C) + ((C & D) | (E & (C | D))); W[15] += R1(W[13]) + W[8 ] + R0(W[0 ]); A += R_E(F) + (H ^ (F & (G ^ H))) + 0xC67178F2 + W[15]; E += A; A += R_A(B) + ((B & C) | (D & (B | C))); state[0] += A; state[1] += B; state[2] += C; state[3] += D; state[4] += E; state[5] += F; state[6] += G; state[7] += H; } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // AVX2 intrinsics ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #include #define VECTOR_WORD __m256i #define VECTOR_XOR(a,b) _mm256_xor_si256(a,b) #define VECTOR_AND(a,b) _mm256_and_si256(a,b) #define VECTOR_OR(a,b) _mm256_or_si256(a,b) #define VECTOR_ZERO _mm256_setzero_si256() #define VECTOR_CONST(u32_const) _mm256_broadcastd_epi32(_mm_set1_epi32(u32_const)) #define VECTOR_SR(a,shift) _mm256_srli_epi32(a,shift) #define VECTOR_SL(a,shift) _mm256_slli_epi32(a,shift) ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // SSE2 intrinsics ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //#include // //#define VECTOR_WORD __m128i //#define VECTOR_XOR(a,b) _mm_xor_si128(a,b) //#define VECTOR_AND(a,b) _mm_and_si128(a,b) //#define VECTOR_OR(a,b) _mm_or_si128(a,b) //#define VECTOR_ZERO _mm_setzero_si128() //#define VECTOR_CONST(u32_const) _mm_set1_epi32(u32_const) //#define VECTOR_SR(a,shift) _mm_srli_epi32(a,shift) //#define VECTOR_SL(a,shift) _mm_slli_epi32(a,shift) ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // 64-bits intrinsics ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //#define VECTOR_WORD uint64_t //#define VECTOR_XOR(a,b) ((a) ^ (b)) //#define VECTOR_AND(a,b) ((a) & (b)) //#define VECTOR_OR(a,b) ((a) | (b)) //#define VECTOR_ZERO 0 //#define VECTOR_CONST(u32_const) (u32_const | (((uint64_t)u32_const)<<32)) //#define VECTOR_SR(a,shift) ((a) >> (shift)) //#define VECTOR_SL(a,shift) ((a) << (shift)) // Common definitions #define VECTOR_NUM_KEYS sizeof(VECTOR_WORD)*8 // The numer of keys tried in every bitslice SHA256 compress call #define VECTOR_3XOR(a,b,c) VECTOR_XOR(VECTOR_XOR(a,b),c) #define BS_SHA256_UNROLL // Convert a constant to bitslice representation PRIVATE void bs_const32(VECTOR_WORD bs_value[32], uint32_t value) { VECTOR_WORD ones = VECTOR_CONST(UINT32_MAX); VECTOR_WORD zero = VECTOR_ZERO; for (uint32_t i = 0; i < 32; i++, value >>= 1u) bs_value[i] = (value & 1u) ? ones : zero; } // Sum two values in bitslice representation: result=x+y PRIVATE void bs_add32(VECTOR_WORD result[32], VECTOR_WORD x[32], VECTOR_WORD y[32]) { VECTOR_WORD carries = VECTOR_ZERO; for (uint32_t i = 0; i < 32; i++) { VECTOR_WORD a = x[i]; VECTOR_WORD b = y[i]; VECTOR_WORD p = VECTOR_XOR(a, b); result[i] = VECTOR_XOR(p, carries); carries = VECTOR_OR(VECTOR_AND(p, carries), VECTOR_AND(a, b)); } } #define BS_SHA256_STEP_UNROLL(i) \ /*Calculate BITSELECT*/\ sum1 = VECTOR_XOR(F[i], G[i]);\ sum1 = VECTOR_AND(sum1, E[i]);\ sum1 = VECTOR_XOR(sum1, G[i]);\ /*Sum BITSELECT*/\ sum0 = H[i];\ p = VECTOR_XOR(sum0, sum1);\ reg_H = VECTOR_XOR(p, carries_bs);\ carries_bs = VECTOR_OR(VECTOR_AND(p, carries_bs), VECTOR_AND(sum0, sum1));\ /*Sum RE*/\ sum0 = VECTOR_3XOR(E[(32 - 26 + i) & 31], E[(32 - 21 + i) & 31], E[(32 - 7 + i) & 31]);\ p = VECTOR_XOR(sum0, reg_H);\ sum0 = VECTOR_AND(sum0, reg_H);\ reg_H = VECTOR_XOR(p, carries_RE);\ carries_RE = VECTOR_OR(VECTOR_AND(p, carries_RE), sum0);\ /*Sum CONST*/\ sum0 = (const_step & (1u<>= 1, m ^= m << i) for (k = 0; k < 32; k = (k + i + 1) & ~i) { VECTOR_WORD tmp = VECTOR_AND(VECTOR_XOR(bs_W[j][k + i], VECTOR_SR(bs_W[j][k], i)), VECTOR_CONST(m)); bs_W[j][k + i] = VECTOR_XOR(bs_W[j][k + i], tmp); bs_W[j][k] = VECTOR_XOR(bs_W[j][k], VECTOR_SL(tmp, i)); } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Bitslice SHA256 invocation //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Init bitslice SHA256 VECTOR_WORD bs_state[8][32]; bs_const32(bs_state[0], INIT_A); bs_const32(bs_state[1], INIT_B); bs_const32(bs_state[2], INIT_C); bs_const32(bs_state[3], INIT_D); bs_const32(bs_state[4], INIT_E); bs_const32(bs_state[5], INIT_F); bs_const32(bs_state[6], INIT_G); bs_const32(bs_state[7], INIT_H); // Perform compresss function bs_sha256_process_block(bs_state, bs_W); // Transform from bitslice representation to a direct usable one for (uint32_t j = 0; j < 8; j++) { // Transpose 32x32 bit matrix uint32_t m = 0x0000ffff, i, k; for (i = 16; i != 0; i >>= 1, m ^= m << i) for (k = 0; k < 32; k = (k + i + 1) & ~i) { VECTOR_WORD tmp = VECTOR_AND(VECTOR_XOR(bs_state[j][k + i], VECTOR_SR(bs_state[j][k], i)), VECTOR_CONST(m)); bs_state[j][k + i] = VECTOR_XOR(bs_state[j][k + i], tmp); bs_state[j][k] = VECTOR_XOR(bs_state[j][k], VECTOR_SL(tmp, i)); } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Test that bitslice code works without problems //////////////////////////////////////////////////////////////////////////////////////////////////////////////// for (uint32_t i = 0; i < VECTOR_NUM_KEYS; i++) { // Init normal SHA256 uint32_t state[8]; state[0] = INIT_A; state[1] = INIT_B; state[2] = INIT_C; state[3] = INIT_D; state[4] = INIT_E; state[5] = INIT_F; state[6] = INIT_G; state[7] = INIT_H; // Perform compress function sha256_process_block(state, W[i]); // Compare normal and bitslice results for (uint32_t j = 0; j < 8; j++) if (state[j] != ((uint32_t*)(bs_state[j]))[(i & 31)*(sizeof(VECTOR_WORD) / 4) + i / 32]) { had_bs_error = 1; printf("Bitslice SHA256 implementation had a programming error\n"); } } if (!had_bs_error) printf("Bitslice algorithm executes successfully\n\n"); //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Perform a minimal benchmark //////////////////////////////////////////////////////////////////////////////////////////////////////////////// printf("//////////////////////////////////////////////\n"); printf("Benchmarking bitslice SHA256 compress function\n"); printf("//////////////////////////////////////////////\n"); clock_t bs_init_time = clock(); for (uint32_t i = 0; i < MAX_NUM_REPETITIONS; i++) bs_sha256_process_block(bs_state, bs_W); uint64_t duration = clock() - bs_init_time; uint64_t keys_proccessed_per_sec = MAX_NUM_REPETITIONS*VECTOR_NUM_KEYS*CLOCKS_PER_SEC / duration; printf("Benchmark duration: %u ms\n", duration*1000/CLOCKS_PER_SEC); printf("Performance: %llu keys per second\n", keys_proccessed_per_sec); // Wait for one keystroke char c; scanf("%c", &c); }