Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Sun, 25 Aug 2013 03:37:36 +0400
From: Solar Designer <solar@...nwall.com>
To: john-dev@...ts.openwall.com
Subject: Re: Parallella: Litecoin mining

Rafael,

On Sun, Aug 25, 2013 at 03:36:06AM +0400, Solar Designer wrote:
> I took a look at your committed code - it tries to use TMTO 5, but it
> just gets stuck somewhere.  So I've just spent an hour playing around
> with it, optimizing its memory usage.  Please see the attached patch.

Oops, I forgot to re-attach the patch when moving the message body to
the proper thread.  I've attached the patch now.

> With this patch, the code + read-only data size is reduced by about 1700
> bytes, and it pretends to work, but when I enable the debugging output
> in driver-epiphany.c, the hashes computed on ARM and Epiphany don't
> match.  Moreover, they don't match even if I reduce TMTO to 6 (and
> adjust DIVTMTO accordingly).  My guess is that you had introduced some
> bug, so I am leaving it up to you to debug it. ;-)  It is, of course,
> also possible that the bug is in my patch.

Alexander

diff --git a/epiphany-scrypt.c b/epiphany-scrypt.c
index fc4437c..a22423c 100644
--- a/epiphany-scrypt.c
+++ b/epiphany-scrypt.c
@@ -40,18 +40,12 @@
 #include "e_lib.h"
 #include "epiphany_mailbox.h"
 
-// ((1023 / TMTO_RATIO) + 1) * 128
-#define SCRATCHBUF_SIZE	26317
-#define TMTO_RATIO 5 // Must be > 0
+#define TMTO_RATIO 5
+#define SCRATCHBUF_SIZE	(((1024 + TMTO_RATIO - 1) / TMTO_RATIO) * 128)
 
 // This aproximation to division works fine up to a = 43694
 #define DIVTMTO(a) ((26215 * (a))>>17) // If TMTO_RATIO changes you need redefine this macro
-
-#define DIV2(a) ((a)>>1)
-#define MOD2(a) ((a) - (DIV2(a) << 1)) // This can be optimiced in ASM using carry
-
-#define DIV8(a) ((a)>>3)
-#define MOD8(a) ((a) - (DIV8(a) << 3)) // This can be optimiced in ASM using carry
+//#define DIVTMTO(a) ((a) / TMTO_RATIO)
 
 volatile shared_buf_t M[16] SECTION("shared_dram");
 
@@ -91,6 +85,74 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
 		dst[i] = htobe32(src[i]);
 }
 
+/* SHA256 constants */
+static const uint32_t sha256_c[64] = {
+	0x428a2f98,
+	0x71374491,
+	0xb5c0fbcf,
+	0xe9b5dba5,
+	0x3956c25b,
+	0x59f111f1,
+	0x923f82a4,
+	0xab1c5ed5,
+	0xd807aa98,
+	0x12835b01,
+	0x243185be,
+	0x550c7dc3,
+	0x72be5d74,
+	0x80deb1fe,
+	0x9bdc06a7,
+	0xc19bf174,
+	0xe49b69c1,
+	0xefbe4786,
+	0x0fc19dc6,
+	0x240ca1cc,
+	0x2de92c6f,
+	0x4a7484aa,
+	0x5cb0a9dc,
+	0x76f988da,
+	0x983e5152,
+	0xa831c66d,
+	0xb00327c8,
+	0xbf597fc7,
+	0xc6e00bf3,
+	0xd5a79147,
+	0x06ca6351,
+	0x14292967,
+	0x27b70a85,
+	0x2e1b2138,
+	0x4d2c6dfc,
+	0x53380d13,
+	0x650a7354,
+	0x766a0abb,
+	0x81c2c92e,
+	0x92722c85,
+	0xa2bfe8a1,
+	0xa81a664b,
+	0xc24b8b70,
+	0xc76c51a3,
+	0xd192e819,
+	0xd6990624,
+	0xf40e3585,
+	0x106aa070,
+	0x19a4c116,
+	0x1e376c08,
+	0x2748774c,
+	0x34b0bcb5,
+	0x391c0cb3,
+	0x4ed8aa4a,
+	0x5b9cca4f,
+	0x682e6ff3,
+	0x748f82ee,
+	0x78a5636f,
+	0x84c87814,
+	0x8cc70208,
+	0x90befffa,
+	0xa4506ceb,
+	0xbef9a3f7,
+	0xc67178f2
+};
+
 /* Elementary functions used by SHA256 */
 #define Ch(x, y, z)	((x & (y ^ z)) ^ z)
 #define Maj(x, y, z)	((x & (y | z)) | (y & z))
@@ -109,22 +171,13 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
 	h  = t0 + t1;
 
 /* Adjusted round function for rotating state */
-// #define RNDr(S, W, i, k)			\
-// 	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
-// 	    S[(66 - i) % 8], S[(67 - i) % 8],	\
-// 	    S[(68 - i) % 8], S[(69 - i) % 8],	\
-// 	    S[(70 - i) % 8], S[(71 - i) % 8],	\
-// 	    W[i] + k)
+#define RNDr(S, W, i, k)			\
+ 	RND(S[(64 - i) & 7], S[(65 - i) & 7],	\
+ 	    S[(66 - i) & 7], S[(67 - i) & 7],	\
+ 	    S[(68 - i) & 7], S[(69 - i) & 7],	\
+ 	    S[(70 - i) & 7], S[(71 - i) & 7],	\
+ 	    W[i] + k)
 
-static void
-RNDr (uint32_t *S, uint32_t *W, int i, uint32_t k) {
-	uint32_t t0, t1;
-	RND(S[MOD8(64 - i)], S[MOD8(65 - i)],
-	    S[MOD8(66 - i)], S[MOD8(67 - i)],
-	    S[MOD8(68 - i)], S[MOD8(69 - i)],
-	    S[MOD8(70 - i)], S[MOD8(71 - i)],
-	    W[i] + k)
-}
 /*
  * SHA256 block compression function.  The 256-bit state is transformed via
  * the 512-bit input block to produce a new state.
@@ -152,70 +205,8 @@ SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap)
 	memcpy(S, state, 32);
 
 	/* 3. Mix. */
-	RNDr(S, W, 0, 0x428a2f98);
-	RNDr(S, W, 1, 0x71374491);
-	RNDr(S, W, 2, 0xb5c0fbcf);
-	RNDr(S, W, 3, 0xe9b5dba5);
-	RNDr(S, W, 4, 0x3956c25b);
-	RNDr(S, W, 5, 0x59f111f1);
-	RNDr(S, W, 6, 0x923f82a4);
-	RNDr(S, W, 7, 0xab1c5ed5);
-	RNDr(S, W, 8, 0xd807aa98);
-	RNDr(S, W, 9, 0x12835b01);
-	RNDr(S, W, 10, 0x243185be);
-	RNDr(S, W, 11, 0x550c7dc3);
-	RNDr(S, W, 12, 0x72be5d74);
-	RNDr(S, W, 13, 0x80deb1fe);
-	RNDr(S, W, 14, 0x9bdc06a7);
-	RNDr(S, W, 15, 0xc19bf174);
-	RNDr(S, W, 16, 0xe49b69c1);
-	RNDr(S, W, 17, 0xefbe4786);
-	RNDr(S, W, 18, 0x0fc19dc6);
-	RNDr(S, W, 19, 0x240ca1cc);
-	RNDr(S, W, 20, 0x2de92c6f);
-	RNDr(S, W, 21, 0x4a7484aa);
-	RNDr(S, W, 22, 0x5cb0a9dc);
-	RNDr(S, W, 23, 0x76f988da);
-	RNDr(S, W, 24, 0x983e5152);
-	RNDr(S, W, 25, 0xa831c66d);
-	RNDr(S, W, 26, 0xb00327c8);
-	RNDr(S, W, 27, 0xbf597fc7);
-	RNDr(S, W, 28, 0xc6e00bf3);
-	RNDr(S, W, 29, 0xd5a79147);
-	RNDr(S, W, 30, 0x06ca6351);
-	RNDr(S, W, 31, 0x14292967);
-	RNDr(S, W, 32, 0x27b70a85);
-	RNDr(S, W, 33, 0x2e1b2138);
-	RNDr(S, W, 34, 0x4d2c6dfc);
-	RNDr(S, W, 35, 0x53380d13);
-	RNDr(S, W, 36, 0x650a7354);
-	RNDr(S, W, 37, 0x766a0abb);
-	RNDr(S, W, 38, 0x81c2c92e);
-	RNDr(S, W, 39, 0x92722c85);
-	RNDr(S, W, 40, 0xa2bfe8a1);
-	RNDr(S, W, 41, 0xa81a664b);
-	RNDr(S, W, 42, 0xc24b8b70);
-	RNDr(S, W, 43, 0xc76c51a3);
-	RNDr(S, W, 44, 0xd192e819);
-	RNDr(S, W, 45, 0xd6990624);
-	RNDr(S, W, 46, 0xf40e3585);
-	RNDr(S, W, 47, 0x106aa070);
-	RNDr(S, W, 48, 0x19a4c116);
-	RNDr(S, W, 49, 0x1e376c08);
-	RNDr(S, W, 50, 0x2748774c);
-	RNDr(S, W, 51, 0x34b0bcb5);
-	RNDr(S, W, 52, 0x391c0cb3);
-	RNDr(S, W, 53, 0x4ed8aa4a);
-	RNDr(S, W, 54, 0x5b9cca4f);
-	RNDr(S, W, 55, 0x682e6ff3);
-	RNDr(S, W, 56, 0x748f82ee);
-	RNDr(S, W, 57, 0x78a5636f);
-	RNDr(S, W, 58, 0x84c87814);
-	RNDr(S, W, 59, 0x8cc70208);
-	RNDr(S, W, 60, 0x90befffa);
-	RNDr(S, W, 61, 0xa4506ceb);
-	RNDr(S, W, 62, 0xbef9a3f7);
-	RNDr(S, W, 63, 0xc67178f2);
+	for (i = 0; i < 64; i++)
+		RNDr(S, W, i, sha256_c[i]);
 
 	/* 4. Mix local working variables into global state */
 	for (i = 0; i < 8; i++)
@@ -226,14 +217,17 @@ static inline void
 SHA256_InitState(uint32_t * state)
 {
 	/* Magic initialization constants */
-	state[0] = 0x6A09E667;
-	state[1] = 0xBB67AE85;
-	state[2] = 0x3C6EF372;
-	state[3] = 0xA54FF53A;
-	state[4] = 0x510E527F;
-	state[5] = 0x9B05688C;
-	state[6] = 0x1F83D9AB;
-	state[7] = 0x5BE0CD19;
+	static const uint32_t sha256_i[8] = {
+		0x6A09E667,
+		0xBB67AE85,
+		0x3C6EF372,
+		0xA54FF53A,
+		0x510E527F,
+		0x9B05688C,
+		0x1F83D9AB,
+		0x5BE0CD19
+	};
+	memcpy(state, sha256_i, 32);
 }
 
 static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000};
@@ -407,19 +401,19 @@ salsa20_8(const uint32_t B[16], const uint32_t Bx[16], uint32_t Bout[16])
 	Bout[15] += Bor[15];
 }
 
+static char scratchpad[SCRATCHBUF_SIZE] __attribute__ ((aligned(8)));
+
 /* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
    scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
  */
-static void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *ostate)
+static inline void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *ostate)
 {
+	uint32_t V_TMP[64] __attribute__ ((aligned(8)));
 	uint32_t * V, *X, *Z, *Y;
-	uint32_t V_TMP[64];
 	uint32_t i;
 	uint32_t j;
 	uint32_t k;
 
-	char scratchpad[SCRATCHBUF_SIZE];
-
 	X = V = (uint32_t *) scratchpad;
 
 	PBKDF2_SHA256_80_128(input, X);
@@ -429,7 +423,7 @@ static void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *ostate)
 		if (!((i+1) - ibase * TMTO_RATIO))
 			Y = &V[ibase * 32];
 		else
-			Y = &V_TMP[32*(MOD2(i+1))];
+			Y = &V_TMP[32*((i+1) & 1)];
 
 		salsa20_8(&X[ 0], &X[16], &Y[ 0]);
 		salsa20_8(&X[16], &Y[ 0], &Y[16]);
@@ -445,7 +439,7 @@ static void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *ostate)
 
 		Z  = &V[jbase * 32];
 		while (jmod--) {
-			Y = &Vz_TMP[32*(MOD2(jmod+1))];
+			Y = &Vz_TMP[32*((jmod+1) & 1)];
 			salsa20_8(&Z[ 0], &Z[16], &Y[ 0]);
 			salsa20_8(&Z[16], &Y[ 0], &Y[16]);
 			Z = Y;
@@ -465,6 +459,7 @@ static void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *ostate)
 	PBKDF2_SHA256_80_128_32(input, X, ostate);
 }
 
+__attribute__ ((noreturn))
 int main(void) {
 
 	uint32_t core_n = e_group_config.core_row * e_group_config.group_cols
@@ -478,6 +473,8 @@ int main(void) {
 		M[core_n].ostate = ostate[7];
 		M[core_n].working = 0;
 	}
-
-	return 0;
 }
+
+/* Save ~300 bytes by getting rid of the real exit() */
+__attribute__ ((noreturn))
+void exit(int status) { while(1) continue; }

Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ