epiphany-scrypt: table-driven SHA-256 rounds, static scratchpad, no real exit()

* Derive SCRATCHBUF_SIZE from TMTO_RATIO instead of hard-coding the value.
* Drop the DIV2/MOD2/DIV8/MOD8 helpers; the call sites use plain bit masks.
* Move the 64 SHA-256 round constants into sha256_c[] and run the rounds
  from a loop.  RNDr becomes a macro again; it is wrapped in
  do { } while (0) because RND expands to several statements and the
  loop that invokes RNDr has an unbraced body.
* Initialise the SHA-256 state with a single memcpy from a constant table.
* Make the scratchpad a file-scope static buffer aligned to 8 bytes.
* Mark main() noreturn and replace exit() with an infinite-loop stub.

diff --git a/epiphany-scrypt.c b/epiphany-scrypt.c
index fc4437c..a22423c 100644
--- a/epiphany-scrypt.c
+++ b/epiphany-scrypt.c
@@ -40,18 +40,12 @@
 #include "e_lib.h"
 #include "epiphany_mailbox.h"
 
-// ((1023 / TMTO_RATIO) + 1) * 128
-#define SCRATCHBUF_SIZE 26317
-#define TMTO_RATIO 5 // Must be > 0
+#define TMTO_RATIO 5
+#define SCRATCHBUF_SIZE (((1024 + TMTO_RATIO - 1) / TMTO_RATIO) * 128)
 
 // This aproximation to division works fine up to a = 43694
 #define DIVTMTO(a) ((26215 * (a))>>17) // If TMTO_RATIO changes you need redefine this macro
-
-#define DIV2(a) ((a)>>1)
-#define MOD2(a) ((a) - (DIV2(a) << 1)) // This can be optimiced in ASM using carry
-
-#define DIV8(a) ((a)>>3)
-#define MOD8(a) ((a) - (DIV8(a) << 3)) // This can be optimiced in ASM using carry
+// Exact alternative without the approximation: #define DIVTMTO(a) ((a) / TMTO_RATIO)
 
 volatile shared_buf_t M[16] SECTION("shared_dram");
 
@@ -91,6 +85,74 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
 		dst[i] = htobe32(src[i]);
 }
 
+/* SHA256 round constants, indexed by round number (see SHA256_Transform) */
+static const uint32_t sha256_c[64] = {
+	0x428a2f98,
+	0x71374491,
+	0xb5c0fbcf,
+	0xe9b5dba5,
+	0x3956c25b,
+	0x59f111f1,
+	0x923f82a4,
+	0xab1c5ed5,
+	0xd807aa98,
+	0x12835b01,
+	0x243185be,
+	0x550c7dc3,
+	0x72be5d74,
+	0x80deb1fe,
+	0x9bdc06a7,
+	0xc19bf174,
+	0xe49b69c1,
+	0xefbe4786,
+	0x0fc19dc6,
+	0x240ca1cc,
+	0x2de92c6f,
+	0x4a7484aa,
+	0x5cb0a9dc,
+	0x76f988da,
+	0x983e5152,
+	0xa831c66d,
+	0xb00327c8,
+	0xbf597fc7,
+	0xc6e00bf3,
+	0xd5a79147,
+	0x06ca6351,
+	0x14292967,
+	0x27b70a85,
+	0x2e1b2138,
+	0x4d2c6dfc,
+	0x53380d13,
+	0x650a7354,
+	0x766a0abb,
+	0x81c2c92e,
+	0x92722c85,
+	0xa2bfe8a1,
+	0xa81a664b,
+	0xc24b8b70,
+	0xc76c51a3,
+	0xd192e819,
+	0xd6990624,
+	0xf40e3585,
+	0x106aa070,
+	0x19a4c116,
+	0x1e376c08,
+	0x2748774c,
+	0x34b0bcb5,
+	0x391c0cb3,
+	0x4ed8aa4a,
+	0x5b9cca4f,
+	0x682e6ff3,
+	0x748f82ee,
+	0x78a5636f,
+	0x84c87814,
+	0x8cc70208,
+	0x90befffa,
+	0xa4506ceb,
+	0xbef9a3f7,
+	0xc67178f2
+};
+
 /* Elementary functions used by SHA256 */
 #define Ch(x, y, z) ((x & (y ^ z)) ^ z)
 #define Maj(x, y, z) ((x & (y | z)) | (y & z))
@@ -109,22 +171,13 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
 	h = t0 + t1;
 
 /* Adjusted round function for rotating state */
-// #define RNDr(S, W, i, k) \
-//	RND(S[(64 - i) % 8], S[(65 - i) % 8], \
-//	    S[(66 - i) % 8], S[(67 - i) % 8], \
-//	    S[(68 - i) % 8], S[(69 - i) % 8], \
-//	    S[(70 - i) % 8], S[(71 - i) % 8], \
-//	    W[i] + k)
+#define RNDr(S, W, i, k) do { /* RND expands to several statements */ \
+	RND(S[(64 - (i)) & 7], S[(65 - (i)) & 7], \
+	    S[(66 - (i)) & 7], S[(67 - (i)) & 7], \
+	    S[(68 - (i)) & 7], S[(69 - (i)) & 7], \
+	    S[(70 - (i)) & 7], S[(71 - (i)) & 7], \
+	    W[(i)] + (k)) } while (0)
 
-static void
-RNDr (uint32_t *S, uint32_t *W, int i, uint32_t k) {
-	uint32_t t0, t1;
-	RND(S[MOD8(64 - i)], S[MOD8(65 - i)],
-	    S[MOD8(66 - i)], S[MOD8(67 - i)],
-	    S[MOD8(68 - i)], S[MOD8(69 - i)],
-	    S[MOD8(70 - i)], S[MOD8(71 - i)],
-	    W[i] + k)
-}
 /*
  * SHA256 block compression function. The 256-bit state is transformed via
  * the 512-bit input block to produce a new state.
@@ -152,70 +205,8 @@ SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap)
 	memcpy(S, state, 32);
 
 	/* 3. Mix. */
-	RNDr(S, W, 0, 0x428a2f98);
-	RNDr(S, W, 1, 0x71374491);
-	RNDr(S, W, 2, 0xb5c0fbcf);
-	RNDr(S, W, 3, 0xe9b5dba5);
-	RNDr(S, W, 4, 0x3956c25b);
-	RNDr(S, W, 5, 0x59f111f1);
-	RNDr(S, W, 6, 0x923f82a4);
-	RNDr(S, W, 7, 0xab1c5ed5);
-	RNDr(S, W, 8, 0xd807aa98);
-	RNDr(S, W, 9, 0x12835b01);
-	RNDr(S, W, 10, 0x243185be);
-	RNDr(S, W, 11, 0x550c7dc3);
-	RNDr(S, W, 12, 0x72be5d74);
-	RNDr(S, W, 13, 0x80deb1fe);
-	RNDr(S, W, 14, 0x9bdc06a7);
-	RNDr(S, W, 15, 0xc19bf174);
-	RNDr(S, W, 16, 0xe49b69c1);
-	RNDr(S, W, 17, 0xefbe4786);
-	RNDr(S, W, 18, 0x0fc19dc6);
-	RNDr(S, W, 19, 0x240ca1cc);
-	RNDr(S, W, 20, 0x2de92c6f);
-	RNDr(S, W, 21, 0x4a7484aa);
-	RNDr(S, W, 22, 0x5cb0a9dc);
-	RNDr(S, W, 23, 0x76f988da);
-	RNDr(S, W, 24, 0x983e5152);
-	RNDr(S, W, 25, 0xa831c66d);
-	RNDr(S, W, 26, 0xb00327c8);
-	RNDr(S, W, 27, 0xbf597fc7);
-	RNDr(S, W, 28, 0xc6e00bf3);
-	RNDr(S, W, 29, 0xd5a79147);
-	RNDr(S, W, 30, 0x06ca6351);
-	RNDr(S, W, 31, 0x14292967);
-	RNDr(S, W, 32, 0x27b70a85);
-	RNDr(S, W, 33, 0x2e1b2138);
-	RNDr(S, W, 34, 0x4d2c6dfc);
-	RNDr(S, W, 35, 0x53380d13);
-	RNDr(S, W, 36, 0x650a7354);
-	RNDr(S, W, 37, 0x766a0abb);
-	RNDr(S, W, 38, 0x81c2c92e);
-	RNDr(S, W, 39, 0x92722c85);
-	RNDr(S, W, 40, 0xa2bfe8a1);
-	RNDr(S, W, 41, 0xa81a664b);
-	RNDr(S, W, 42, 0xc24b8b70);
-	RNDr(S, W, 43, 0xc76c51a3);
-	RNDr(S, W, 44, 0xd192e819);
-	RNDr(S, W, 45, 0xd6990624);
-	RNDr(S, W, 46, 0xf40e3585);
-	RNDr(S, W, 47, 0x106aa070);
-	RNDr(S, W, 48, 0x19a4c116);
-	RNDr(S, W, 49, 0x1e376c08);
-	RNDr(S, W, 50, 0x2748774c);
-	RNDr(S, W, 51, 0x34b0bcb5);
-	RNDr(S, W, 52, 0x391c0cb3);
-	RNDr(S, W, 53, 0x4ed8aa4a);
-	RNDr(S, W, 54, 0x5b9cca4f);
-	RNDr(S, W, 55, 0x682e6ff3);
-	RNDr(S, W, 56, 0x748f82ee);
-	RNDr(S, W, 57, 0x78a5636f);
-	RNDr(S, W, 58, 0x84c87814);
-	RNDr(S, W, 59, 0x8cc70208);
-	RNDr(S, W, 60, 0x90befffa);
-	RNDr(S, W, 61, 0xa4506ceb);
-	RNDr(S, W, 62, 0xbef9a3f7);
-	RNDr(S, W, 63, 0xc67178f2);
+	for (i = 0; i < 64; i++)
+		RNDr(S, W, i, sha256_c[i]);
 
 	/* 4. Mix local working variables into global state */
 	for (i = 0; i < 8; i++)
@@ -226,14 +217,17 @@ static inline void
 SHA256_InitState(uint32_t * state)
 {
 	/* Magic initialization constants */
-	state[0] = 0x6A09E667;
-	state[1] = 0xBB67AE85;
-	state[2] = 0x3C6EF372;
-	state[3] = 0xA54FF53A;
-	state[4] = 0x510E527F;
-	state[5] = 0x9B05688C;
-	state[6] = 0x1F83D9AB;
-	state[7] = 0x5BE0CD19;
+	static const uint32_t sha256_i[8] = {
+		0x6A09E667,
+		0xBB67AE85,
+		0x3C6EF372,
+		0xA54FF53A,
+		0x510E527F,
+		0x9B05688C,
+		0x1F83D9AB,
+		0x5BE0CD19
+	};
+	memcpy(state, sha256_i, sizeof(sha256_i));
 }
 
 static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000};
@@ -407,19 +401,19 @@ salsa20_8(const uint32_t B[16], const uint32_t Bx[16], uint32_t Bout[16])
 	Bout[15] += Bor[15];
 }
 
+static char scratchpad[SCRATCHBUF_SIZE] __attribute__ ((aligned(8)));
+
 /* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
    scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
  */
-static void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *ostate)
+static inline void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *ostate)
 {
+	uint32_t V_TMP[64] __attribute__ ((aligned(8)));
 	uint32_t * V, *X, *Z, *Y;
-	uint32_t V_TMP[64];
 	uint32_t i;
 	uint32_t j;
 	uint32_t k;
 
-	char scratchpad[SCRATCHBUF_SIZE];
-
 	X = V = (uint32_t *) scratchpad;
 
 	PBKDF2_SHA256_80_128(input, X);
@@ -429,7 +423,7 @@ static void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *ostate)
 		if (!((i+1) - ibase * TMTO_RATIO))
 			Y = &V[ibase * 32];
 		else
-			Y = &V_TMP[32*(MOD2(i+1))];
+			Y = &V_TMP[32*((i+1) & 1)];
 
 		salsa20_8(&X[ 0], &X[16], &Y[ 0]);
 		salsa20_8(&X[16], &Y[ 0], &Y[16]);
@@ -445,7 +439,7 @@ static void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *ostate)
 		Z = &V[jbase * 32];
 
 		while (jmod--) {
-			Y = &Vz_TMP[32*(MOD2(jmod+1))];
+			Y = &Vz_TMP[32*((jmod+1) & 1)];
 			salsa20_8(&Z[ 0], &Z[16], &Y[ 0]);
 			salsa20_8(&Z[16], &Y[ 0], &Y[16]);
 			Z = Y;
@@ -465,6 +459,7 @@ static void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *ostate)
 	PBKDF2_SHA256_80_128_32(input, X, ostate);
 }
 
+__attribute__ ((noreturn))
 int main(void) {
 
 	uint32_t core_n = e_group_config.core_row * e_group_config.group_cols
@@ -478,6 +473,8 @@ int main(void) {
 		M[core_n].ostate = ostate[7];
 		M[core_n].working = 0;
 	}
-
-	return 0;
 }
+
+/* Save ~300 bytes by getting rid of the real exit() */
+__attribute__ ((noreturn))
+void exit(int status) { while(1) continue; }