diff -urp escrypt-24/crypto_scrypt-nosse.c escrypt-26/crypto_scrypt-nosse.c --- escrypt-24/crypto_scrypt-nosse.c 2013-03-17 03:07:41 +0000 +++ escrypt-26/crypto_scrypt-nosse.c 2013-03-17 20:24:44 +0000 @@ -209,8 +209,10 @@ smix(uint8_t * B, size_t r, uint64_t N, /* 8: X <-- H(X \xor V_j) */ blkxor(X, &V[j * (32 * r)], 128 * r); blockmix_salsa8(X, Y, Z, r); - if (defeat_tmto) - blkcpy(&V[(~j & (N - 1)) * (32 * r)], Y, 128 * r); + if (defeat_tmto) { + blkxor(Y, &V[(j ^ 1) * (32 * r)], 128 * r); + blkcpy(&V[(j ^ 1) * (32 * r)], Y, 128 * r); + } /* 7: j <-- Integerify(X) mod N */ j = integerify(Y, r) & (N - 1); @@ -218,8 +220,10 @@ smix(uint8_t * B, size_t r, uint64_t N, /* 8: X <-- H(X \xor V_j) */ blkxor(Y, &V[j * (32 * r)], 128 * r); blockmix_salsa8(Y, X, Z, r); - if (defeat_tmto) - blkcpy(&V[(~j & (N - 1)) * (32 * r)], X, 128 * r); + if (defeat_tmto) { + blkxor(X, &V[(j ^ 1) * (32 * r)], 128 * r); + blkcpy(&V[(j ^ 1) * (32 * r)], X, 128 * r); + } } /* 10: B' <-- X */ @@ -259,7 +263,7 @@ crypto_escrypt(const uint8_t * passwd, s errno = EFBIG; goto err0; } - if (((N & (N - 1)) != 0) || (N == 0)) { + if (((N & (N - 1)) != 0) || (N < 2)) { errno = EINVAL; goto err0; } diff -urp escrypt-24/crypto_scrypt-ref.c escrypt-26/crypto_scrypt-ref.c --- escrypt-24/crypto_scrypt-ref.c 2013-03-17 03:06:17 +0000 +++ escrypt-26/crypto_scrypt-ref.c 2013-03-17 20:20:55 +0000 @@ -199,8 +199,10 @@ smix(uint8_t * B, size_t r, uint64_t N, /* 8: X <-- H(X \xor V_j) */ blkxor(X, &V[j * (128 * r)], 128 * r); blockmix_salsa8(X, Y, r); - if (defeat_tmto) - blkcpy(&V[(~j & (N - 1)) * (128 * r)], X, 128 * r); + if (defeat_tmto) { + blkxor(X, &V[(j ^ 1) * (128 * r)], 128 * r); + blkcpy(&V[(j ^ 1) * (128 * r)], X, 128 * r); + } } /* 10: B' <-- X */ @@ -238,7 +240,7 @@ crypto_escrypt(const uint8_t * passwd, s errno = EFBIG; goto err0; } - if (((N & (N - 1)) != 0) || (N == 0)) { + if (((N & (N - 1)) != 0) || (N < 2)) { errno = EINVAL; goto err0; } diff -urp escrypt-24/crypto_scrypt-sse.c escrypt-26/crypto_scrypt-sse.c --- escrypt-24/crypto_scrypt-sse.c 2013-03-17 03:04:40 +0000 +++ escrypt-26/crypto_scrypt-sse.c 2013-03-17 21:01:56 +0000 @@ -85,7 +85,7 @@ /** * Apply the salsa20/8 core to the block provided in (X0 ... X3) ^ in. */ -#define SALSA20_8_XOR(in, out) \ +#define SALSA20_8_XOR(in) \ { \ __m128i Y0 = X0 = _mm_xor_si128(X0, (in)[0]); \ __m128i Y1 = X1 = _mm_xor_si128(X1, (in)[1]); \ @@ -95,12 +95,18 @@ SALSA20_2ROUNDS \ SALSA20_2ROUNDS \ SALSA20_2ROUNDS \ - (out)[0] = X0 = _mm_add_epi32(X0, Y0); \ - (out)[1] = X1 = _mm_add_epi32(X1, Y1); \ - (out)[2] = X2 = _mm_add_epi32(X2, Y2); \ - (out)[3] = X3 = _mm_add_epi32(X3, Y3); \ + X0 = _mm_add_epi32(X0, Y0); \ + X1 = _mm_add_epi32(X1, Y1); \ + X2 = _mm_add_epi32(X2, Y2); \ + X3 = _mm_add_epi32(X3, Y3); \ } +#define OUT(out) \ + (out)[0] = X0; \ + (out)[1] = X1; \ + (out)[2] = X2; \ + (out)[3] = X3; + /** * blockmix_salsa8(Bin, Bout, r): * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r @@ -121,7 +127,8 @@ blockmix_salsa8(__m128i * Bin, __m128i * /* 3: X <-- H(X \xor B_i) */ /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR(Bin, Bout) + SALSA20_8_XOR(Bin) + OUT(Bout) /* 2: for i = 0 to 2r - 1 do */ r--; @@ -129,20 +136,23 @@ blockmix_salsa8(__m128i * Bin, __m128i * /* 3: X <-- H(X \xor B_i) */ /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR(&Bin[i * 8 + 4], &Bout[(r + i) * 4 + 4]) + SALSA20_8_XOR(&Bin[i * 8 + 4]) + OUT(&Bout[(r + i) * 4 + 4]) i++; /* 3: X <-- H(X \xor B_i) */ /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR(&Bin[i * 8], &Bout[i * 4]) + SALSA20_8_XOR(&Bin[i * 8]) + OUT(&Bout[i * 4]) } /* 3: X <-- H(X \xor B_i) */ /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - SALSA20_8_XOR(&Bin[i * 8 + 4], &Bout[(r + i) * 4 + 4]) + SALSA20_8_XOR(&Bin[i * 8 + 4]) + OUT(&Bout[(r + i) * 4 + 4]) } #define XOR4(in) \ @@ -151,17 +161,32 @@ blockmix_salsa8(__m128i * Bin, __m128i * X2 = _mm_xor_si128(X2, (in)[2]); \ X3 = _mm_xor_si128(X3, (in)[3]); -#define CONDOUT(do_out, out) \ - if (do_out) { \ - (out)[0] = X0; \ - (out)[1] = X1; \ - (out)[2] = X2; \ - (out)[3] = X3; \ +#define CONDOUT(out, do_xor, xor) \ + if (do_xor) { \ + (xor)[0] = (out)[0] = _mm_xor_si128(X0, (xor)[0]); \ + (xor)[1] = (out)[1] = _mm_xor_si128(X1, (xor)[1]); \ + (xor)[2] = (out)[2] = _mm_xor_si128(X2, (xor)[2]); \ + (xor)[3] = (out)[3] = _mm_xor_si128(X3, (xor)[3]); \ + } else { \ + OUT(out) \ + } + +#define CONDOUT_RET(out, do_xor, xor) \ + if (do_xor) { \ + uint32_t ret = _mm_cvtsi128_si32( \ + (xor)[0] = (out)[0] = _mm_xor_si128(X0, (xor)[0])); \ + (xor)[1] = (out)[1] = _mm_xor_si128(X1, (xor)[1]); \ + (xor)[2] = (out)[2] = _mm_xor_si128(X2, (xor)[2]); \ + (xor)[3] = (out)[3] = _mm_xor_si128(X3, (xor)[3]); \ + return ret; \ + } else { \ + OUT(out) \ + return _mm_cvtsi128_si32(X0); \ } static inline uint32_t blockmix_salsa8_xor(__m128i * Bin1, __m128i * Bin2, __m128i * Bout, - __m128i * Bdup, size_t r) + __m128i * Bxor, size_t r) { __m128i X0, X1, X2, X3; size_t i; @@ -176,8 +201,8 @@ blockmix_salsa8_xor(__m128i * Bin1, __m1 /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ XOR4(Bin1) - SALSA20_8_XOR(Bin2, Bout) - CONDOUT(Bdup, Bdup) + SALSA20_8_XOR(Bin2) + CONDOUT(Bout, Bxor, Bxor) /* 2: for i = 0 to 2r - 1 do */ r--; @@ -186,8 +211,8 @@ blockmix_salsa8_xor(__m128i * Bin1, __m1 /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ XOR4(&Bin1[i * 8 + 4]) - SALSA20_8_XOR(&Bin2[i * 8 + 4], &Bout[(r + i) * 4 + 4]) - CONDOUT(Bdup, &Bdup[(r + i) * 4 + 4]) + SALSA20_8_XOR(&Bin2[i * 8 + 4]) + CONDOUT(&Bout[(r + i) * 4 + 4], Bxor, &Bxor[(r + i) * 4 + 4]) i++; @@ -195,25 +220,25 @@ blockmix_salsa8_xor(__m128i * Bin1, __m1 /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ XOR4(&Bin1[i * 8]) - SALSA20_8_XOR(&Bin2[i * 8], &Bout[i * 4]) - CONDOUT(Bdup, &Bdup[i * 4]) + SALSA20_8_XOR(&Bin2[i * 8]) + CONDOUT(&Bout[i * 4], Bxor, &Bxor[i * 4]) } /* 3: X <-- H(X \xor B_i) */ /* 4: Y_i <-- X */ /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ XOR4(&Bin1[i * 8 + 4]) - SALSA20_8_XOR(&Bin2[i * 8 + 4], &Bout[(r + i) * 4 + 4]) - CONDOUT(Bdup, &Bdup[(r + i) * 4 + 4]) - - return _mm_cvtsi128_si32(X0); + SALSA20_8_XOR(&Bin2[i * 8 + 4]) + CONDOUT_RET(&Bout[(r + i) * 4 + 4], Bxor, &Bxor[(r + i) * 4 + 4]) } #undef ARX #undef SALSA20_2ROUNDS #undef SALSA20_8_XOR +#undef OUT #undef XOR4 #undef CONDOUT +#undef CONDOUT_RET /** * integerify(B, r): @@ -285,20 +310,16 @@ smix(uint8_t * B, size_t r, uint32_t N, V_j = (void *)((uintptr_t)(V) + j * 128 * r); - if (defeat_tmto) { - uint32_t nj = ~j & (N - 1); - V_nj = (void *)((uintptr_t)(V) + nj * 128 * r); - } + if (defeat_tmto) + V_nj = (void *)((uintptr_t)(V) + (j ^ 1) * 128 * r); /* 8: X <-- H(X \xor V_j) */ /* 7: j <-- Integerify(X) mod N */ j = blockmix_salsa8_xor(X, V_j, Y, V_nj, r) & (N - 1); V_j = (void *)((uintptr_t)(V) + j * 128 * r); - if (defeat_tmto) { - uint32_t nj = ~j & (N - 1); - V_nj = (void *)((uintptr_t)(V) + nj * 128 * r); - } + if (defeat_tmto) + V_nj = (void *)((uintptr_t)(V) + (j ^ 1) * 128 * r); /* 8: X <-- H(X \xor V_j) */ /* 7: j <-- Integerify(X) mod N */ @@ -350,7 +371,7 @@ crypto_escrypt(const uint8_t * passwd, s errno = EFBIG; goto err0; } - if (((N & (N - 1)) != 0) || (N == 0)) { + if (((N & (N - 1)) != 0) || (N < 2)) { errno = EINVAL; goto err0; }