Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date: Sat, 8 May 2010 18:56:20 +0400
From: Solar Designer <solar@...nwall.com>
To: john-users@...ts.openwall.com
Subject: OpenMP

Hi,

Some of you might recall my past comments on OpenMP:

http://www.openwall.com/lists/john-users/2006/05/23/1
http://openwall.info/wiki/internal/gcc-local-build#Application-to-John-the-Ripper

While this is not the most efficient way to parallelize JtR, it does
have its advantages - most notably, simplicity of code changes (when we
talk about a single hash type) and ease of use.  So I have sort of
temporarily "given up" (procrastinating a "proper" parallel processing
implementation for JtR) and implemented OpenMP support for one of the
hash types.

Attached is a patch against JtR 1.7.5 to crack OpenBSD bcrypt hashes
fast. ;-)  I've also uploaded the patch to the wiki:

http://openwall.info/wiki/john/patches

It uses OpenMP - tested with gcc 4.5.0 (on Linux) and Sun Studio 12 (on
Solaris).  The only new requirement is an OpenMP-capable C compiler,
such as recent gcc.  Overall, this is easier and more reliable to use
than "MPI-patched" builds of JtR are, but drawbacks do exist as well
(per-hash-type code and a performance hit - see below).

I've measured the efficiency (vs. multiple separate-process instances of
JtR) to be as high as 98.5% for a build with gcc on otherwise idle Linux
systems with Intel CPUs.  With Sun Studio, Solaris, and Opterons, it was
down to between 78% and 91% (I did not investigate why).  Multi-threaded
code involves more complicated addressing modes and synchronization
between the threads, so some performance hit (vs. multiple separate
processes) is to be expected - but the ease of use is great.

Core i7 920 2.67 GHz:

Benchmarking: OpenBSD Blowfish (x32) [32/64 X2]... DONE
Raw:    3698 c/s real, 462 c/s virtual

The first number is actual speed, the second is per-thread (8 threads,
so the benchmark uses roughly 8 times more CPU time than real time).

That's 5.27x the speed of a non-threaded build (which does 702 c/s) -
not bad for a quad-core with SMT.

Dual Xeon X5460 3.16 GHz:

Benchmarking: OpenBSD Blowfish (x32) [32/64 X2]... DONE
Raw:    6528 c/s real, 816 c/s virtual

Now this is a "true" 8-core system, and we get 7.91x the speed of a
non-threaded build (which does 825 c/s).

In both cases, each thread was computing 2 hashes in parallel (for
greater instruction-level parallelism), so there were a total of 16
hashes being computed in parallel.

With gcc/Linux, the number of threads to run equals the number of
logical CPUs by default (e.g., 8 on a Core i7 with Hyperthreading).
It may be adjusted with the OMP_NUM_THREADS environment variable.
With Sun Studio, setting this variable is mandatory (otherwise only a
single thread is run).  For example:

OMP_NUM_THREADS=4 ../run/john --test=1 --format=bf

It may also make sense to set OMP_WAIT_POLICY=PASSIVE to free up a
little bit of CPU time (make it idle).  At least in theory, this may
improve the real c/s rate on CPUs with SMT, but make it a little bit
worse on others.

Here's how to quickly build gcc 4.5.0 as a user:

http://openwall.info/wiki/internal/gcc-local-build

The first (simpler) build shown on this wiki page will do.

Have fun.  As usual, any feedback is welcome.

Alexander

diff -urp john-1.7.5/src/BF_fmt.c john-1.7.5-omp-1/src/BF_fmt.c
--- john-1.7.5/src/BF_fmt.c	2010-01-16 17:20:38 +0000
+++ john-1.7.5-omp-1/src/BF_fmt.c	2010-05-08 12:29:58 +0000
@@ -149,7 +149,13 @@ static void crypt_all(int count)
 
 static int cmp_all(void *binary, int count)
 {
-#if BF_X2
+#if BF_N > 2
+	int i;
+	for (i = 0; i < count; i++)
+		if (*(BF_word *)binary == BF_out[i][0])
+			return 1;
+	return 0;
+#elif BF_N == 2
 	return
 	    *(BF_word *)binary == BF_out[0][0] ||
 	    *(BF_word *)binary == BF_out[1][0];
diff -urp john-1.7.5/src/BF_std.c john-1.7.5-omp-1/src/BF_std.c
--- john-1.7.5/src/BF_std.c	2008-06-22 00:37:57 +0000
+++ john-1.7.5-omp-1/src/BF_std.c	2010-05-08 13:37:31 +0000
@@ -1,6 +1,6 @@
 /*
  * This file is part of John the Ripper password cracker,
- * Copyright (c) 1996-2001,2008 by Solar Designer
+ * Copyright (c) 1996-2001,2008,2010 by Solar Designer
  *
  * A public domain version of this code, with reentrant and crypt(3)
  * interfaces added, but optimizations specific to password cracking
@@ -41,7 +41,7 @@ struct BF_ctx {
 	BF_key P;
 };
 
-#if BF_X2
+#if BF_N > 1
 #define INDICES				[BF_N]
 #define INDEX				[index]
 #define INDEX0				[index]
@@ -54,6 +54,35 @@ struct BF_ctx {
 #define for_each_index()
 #endif
 
+#if BF_X2
+#if BF_mt > 1
+#define INDEX2				[index & 1]
+#else
+#define INDEX2				[index]
+#endif
+#else
+#define INDEX2
+#endif
+
+#if BF_mt > 1
+#if BF_X2
+#define for_each_t() \
+	for (t = 0; t < BF_N; t += 2)
+#define for_each_ti() \
+	for (index = t; index <= t + 1; index++)
+#else
+#define for_each_t() \
+	for (t = 0; t < BF_N; t++)
+#define for_each_ti() \
+	index = t;
+#endif
+#else
+#define for_each_t()
+#define for_each_ti() \
+	for_each_index()
+#endif
+
+#if BF_mt == 1
 /* Current Blowfish context */
 #if BF_ASM
 extern
@@ -61,6 +90,7 @@ extern
 static
 #endif
 struct BF_ctx CC_CACHE_ALIGN BF_current INDICES;
+#endif
 
 /* Current Blowfish key */
 static BF_key CC_CACHE_ALIGN BF_exp_key INDICES;
@@ -569,120 +599,163 @@ void BF_std_set_key(char *key, int index
 
 void BF_std_crypt(BF_salt salt)
 {
-	BF_word L0, R0;
-	BF_word u1, u2, u3, u4;
+#if BF_mt > 1
+	int t;
+#endif
+
+#if BF_mt > 1 && defined(_OPENMP)
+#pragma omp parallel for default(none) private(t) shared(BF_init_state, BF_init_key, BF_exp_key, salt, BF_magic_w, BF_out)
+#endif
+	for_each_t() {
+#if BF_mt > 1
 #if BF_X2
-	BF_word L1, R1;
-	BF_word v1, v2, v3, v4;
-	int index;
+		struct BF_ctx BF_current[2];
+#else
+		struct BF_ctx BF_current;
+#endif
 #endif
-	BF_word *ptr;
-	BF_word count;
-	int i;
 
-	for_each_index() {
-		memcpy(BF_current INDEX.S,
-		    BF_init_state.S, sizeof(BF_current INDEX.S));
-		memcpy(BF_current INDEX.P,
-		    BF_init_key INDEX, sizeof(BF_current INDEX.P));
-	}
+		BF_word L0, R0;
+		BF_word u1, u2, u3, u4;
+#if BF_X2
+		BF_word L1, R1;
+		BF_word v1, v2, v3, v4;
+#endif
+		BF_word *ptr;
+		BF_word count;
+#if BF_N > 1
+		int index;
+#endif
+
+		for_each_ti() {
+			int i;
 
-	for_each_index() {
-		L0 = R0 = 0;
-		for (i = 0; i < BF_ROUNDS + 2; i += 2) {
-			L0 ^= salt[i & 2];
-			R0 ^= salt[(i & 2) + 1];
-			BF_ENCRYPT(BF_current INDEX, L0, R0);
-			BF_current INDEX.P[i] = L0;
-			BF_current INDEX.P[i + 1] = R0;
+			memcpy(BF_current INDEX2.S,
+			    BF_init_state.S, sizeof(BF_current INDEX2.S));
+			memcpy(BF_current INDEX2.P,
+			    BF_init_key INDEX, sizeof(BF_current INDEX2.P));
+
+			L0 = R0 = 0;
+			for (i = 0; i < BF_ROUNDS + 2; i += 2) {
+				L0 ^= salt[i & 2];
+				R0 ^= salt[(i & 2) + 1];
+				BF_ENCRYPT(BF_current INDEX2, L0, R0);
+				BF_current INDEX2.P[i] = L0;
+				BF_current INDEX2.P[i + 1] = R0;
+			}
+
+			ptr = BF_current INDEX2.S[0];
+			do {
+				ptr += 4;
+				L0 ^= salt[(BF_ROUNDS + 2) & 3];
+				R0 ^= salt[(BF_ROUNDS + 3) & 3];
+				BF_ENCRYPT(BF_current INDEX2, L0, R0);
+				*(ptr - 4) = L0;
+				*(ptr - 3) = R0;
+
+				L0 ^= salt[(BF_ROUNDS + 4) & 3];
+				R0 ^= salt[(BF_ROUNDS + 5) & 3];
+				BF_ENCRYPT(BF_current INDEX2, L0, R0);
+				*(ptr - 2) = L0;
+				*(ptr - 1) = R0;
+			} while (ptr < &BF_current INDEX2.S[3][0xFF]);
 		}
 
-		ptr = BF_current INDEX.S[0];
+		count = salt[4];
 		do {
-			ptr += 4;
-			L0 ^= salt[(BF_ROUNDS + 2) & 3];
-			R0 ^= salt[(BF_ROUNDS + 3) & 3];
-			BF_ENCRYPT(BF_current INDEX, L0, R0);
-			*(ptr - 4) = L0;
-			*(ptr - 3) = R0;
-
-			L0 ^= salt[(BF_ROUNDS + 4) & 3];
-			R0 ^= salt[(BF_ROUNDS + 5) & 3];
-			BF_ENCRYPT(BF_current INDEX, L0, R0);
-			*(ptr - 2) = L0;
-			*(ptr - 1) = R0;
-		} while (ptr < &BF_current INDEX.S[3][0xFF]);
-	}
+			for_each_ti() {
+				BF_current INDEX2.P[0] ^= BF_exp_key INDEX[0];
+				BF_current INDEX2.P[1] ^= BF_exp_key INDEX[1];
+				BF_current INDEX2.P[2] ^= BF_exp_key INDEX[2];
+				BF_current INDEX2.P[3] ^= BF_exp_key INDEX[3];
+				BF_current INDEX2.P[4] ^= BF_exp_key INDEX[4];
+				BF_current INDEX2.P[5] ^= BF_exp_key INDEX[5];
+				BF_current INDEX2.P[6] ^= BF_exp_key INDEX[6];
+				BF_current INDEX2.P[7] ^= BF_exp_key INDEX[7];
+				BF_current INDEX2.P[8] ^= BF_exp_key INDEX[8];
+				BF_current INDEX2.P[9] ^= BF_exp_key INDEX[9];
+				BF_current INDEX2.P[10] ^= BF_exp_key INDEX[10];
+				BF_current INDEX2.P[11] ^= BF_exp_key INDEX[11];
+				BF_current INDEX2.P[12] ^= BF_exp_key INDEX[12];
+				BF_current INDEX2.P[13] ^= BF_exp_key INDEX[13];
+				BF_current INDEX2.P[14] ^= BF_exp_key INDEX[14];
+				BF_current INDEX2.P[15] ^= BF_exp_key INDEX[15];
+				BF_current INDEX2.P[16] ^= BF_exp_key INDEX[16];
+				BF_current INDEX2.P[17] ^= BF_exp_key INDEX[17];
+			}
+
+			BF_body();
+
+			u1 = salt[0];
+			u2 = salt[1];
+			u3 = salt[2];
+			u4 = salt[3];
+			for_each_ti() {
+				BF_current INDEX2.P[0] ^= u1;
+				BF_current INDEX2.P[1] ^= u2;
+				BF_current INDEX2.P[2] ^= u3;
+				BF_current INDEX2.P[3] ^= u4;
+				BF_current INDEX2.P[4] ^= u1;
+				BF_current INDEX2.P[5] ^= u2;
+				BF_current INDEX2.P[6] ^= u3;
+				BF_current INDEX2.P[7] ^= u4;
+				BF_current INDEX2.P[8] ^= u1;
+				BF_current INDEX2.P[9] ^= u2;
+				BF_current INDEX2.P[10] ^= u3;
+				BF_current INDEX2.P[11] ^= u4;
+				BF_current INDEX2.P[12] ^= u1;
+				BF_current INDEX2.P[13] ^= u2;
+				BF_current INDEX2.P[14] ^= u3;
+				BF_current INDEX2.P[15] ^= u4;
+				BF_current INDEX2.P[16] ^= u1;
+				BF_current INDEX2.P[17] ^= u2;
+			}
 
-	count = salt[4];
-	do {
-		for_each_index() {
-			BF_current INDEX.P[0] ^= BF_exp_key INDEX[0];
-			BF_current INDEX.P[1] ^= BF_exp_key INDEX[1];
-			BF_current INDEX.P[2] ^= BF_exp_key INDEX[2];
-			BF_current INDEX.P[3] ^= BF_exp_key INDEX[3];
-			BF_current INDEX.P[4] ^= BF_exp_key INDEX[4];
-			BF_current INDEX.P[5] ^= BF_exp_key INDEX[5];
-			BF_current INDEX.P[6] ^= BF_exp_key INDEX[6];
-			BF_current INDEX.P[7] ^= BF_exp_key INDEX[7];
-			BF_current INDEX.P[8] ^= BF_exp_key INDEX[8];
-			BF_current INDEX.P[9] ^= BF_exp_key INDEX[9];
-			BF_current INDEX.P[10] ^= BF_exp_key INDEX[10];
-			BF_current INDEX.P[11] ^= BF_exp_key INDEX[11];
-			BF_current INDEX.P[12] ^= BF_exp_key INDEX[12];
-			BF_current INDEX.P[13] ^= BF_exp_key INDEX[13];
-			BF_current INDEX.P[14] ^= BF_exp_key INDEX[14];
-			BF_current INDEX.P[15] ^= BF_exp_key INDEX[15];
-			BF_current INDEX.P[16] ^= BF_exp_key INDEX[16];
-			BF_current INDEX.P[17] ^= BF_exp_key INDEX[17];
-		}
+			BF_body();
+		} while (--count);
 
-		BF_body();
+#if BF_mt == 1
+		for_each_ti() {
+			L0 = BF_magic_w[0];
+			R0 = BF_magic_w[1];
+
+			count = 64;
+			do {
+				BF_ENCRYPT(BF_current INDEX, L0, R0);
+			} while (--count);
 
-		u1 = salt[0];
-		u2 = salt[1];
-		u3 = salt[2];
-		u4 = salt[3];
-		for_each_index() {
-			BF_current INDEX.P[0] ^= u1;
-			BF_current INDEX.P[1] ^= u2;
-			BF_current INDEX.P[2] ^= u3;
-			BF_current INDEX.P[3] ^= u4;
-			BF_current INDEX.P[4] ^= u1;
-			BF_current INDEX.P[5] ^= u2;
-			BF_current INDEX.P[6] ^= u3;
-			BF_current INDEX.P[7] ^= u4;
-			BF_current INDEX.P[8] ^= u1;
-			BF_current INDEX.P[9] ^= u2;
-			BF_current INDEX.P[10] ^= u3;
-			BF_current INDEX.P[11] ^= u4;
-			BF_current INDEX.P[12] ^= u1;
-			BF_current INDEX.P[13] ^= u2;
-			BF_current INDEX.P[14] ^= u3;
-			BF_current INDEX.P[15] ^= u4;
-			BF_current INDEX.P[16] ^= u1;
-			BF_current INDEX.P[17] ^= u2;
+			BF_out INDEX0[0] = L0;
+			BF_out INDEX0[1] = R0;
 		}
+#else
+		for_each_ti() {
+			BF_word L, R;
+			BF_word u1, u2, u3, u4;
+			BF_word count;
+			int i;
+
+			memcpy(&BF_out[index], &BF_magic_w, sizeof(BF_out[index]));
+
+			count = 64;
+			do
+			for (i = 0; i < 6; i += 2) {
+				L = BF_out[index][i];
+				R = BF_out[index][i + 1];
+				BF_ENCRYPT(BF_current INDEX2, L, R);
+				BF_out[index][i] = L;
+				BF_out[index][i + 1] = R;
+			} while (--count);
 
-		BF_body();
-	} while (--count);
-
-	for_each_index() {
-		L0 = BF_magic_w[0];
-		R0 = BF_magic_w[1];
-
-		count = 64;
-		do {
-			BF_ENCRYPT(BF_current INDEX, L0, R0);
-		} while (--count);
-
-		BF_out INDEX0[0] = L0;
-		BF_out INDEX0[1] = R0;
+/* This has to be bug-compatible with the original implementation :-) */
+			BF_out[index][5] &= ~(BF_word)0xFF;
+		}
+#endif
 	}
 }
 
 void BF_std_crypt_exact(int index)
 {
+#if BF_mt == 1
 	BF_word L, R;
 	BF_word u1, u2, u3, u4;
 	BF_word count;
@@ -702,6 +775,7 @@ void BF_std_crypt_exact(int index)
 
 /* This has to be bug-compatible with the original implementation :-) */
 	BF_out[index][5] &= ~(BF_word)0xFF;
+#endif
 }
 
 /*
diff -urp john-1.7.5/src/BF_std.h john-1.7.5-omp-1/src/BF_std.h
--- john-1.7.5/src/BF_std.h	2008-06-21 23:24:37 +0000
+++ john-1.7.5-omp-1/src/BF_std.h	2010-05-08 13:38:32 +0000
@@ -1,6 +1,6 @@
 /*
  * This file is part of John the Ripper password cracker,
- * Copyright (c) 1996-2001,2008 by Solar Designer
+ * Copyright (c) 1996-2001,2008,2010 by Solar Designer
  */
 
 /*
@@ -25,10 +25,16 @@ typedef BF_word BF_salt[4 + 1];
  */
 typedef BF_word BF_binary[6];
 
+#ifdef _OPENMP
+#define BF_mt				24
+#else
+#define BF_mt				1
+#endif
+
 #if BF_X2
-#define BF_N				2
+#define BF_N				(2 * BF_mt)
 #else
-#define BF_N				1
+#define BF_N				(BF_mt)
 #endif
 
 /*
diff -urp john-1.7.5/src/Makefile john-1.7.5-omp-1/src/Makefile
--- john-1.7.5/src/Makefile	2009-12-17 19:11:03 +0000
+++ john-1.7.5-omp-1/src/Makefile	2010-05-08 13:47:59 +0000
@@ -1,6 +1,6 @@
 #
 # This file is part of John the Ripper password cracker,
-# Copyright (c) 1996-2009 by Solar Designer
+# Copyright (c) 1996-2010 by Solar Designer
 #
 
 CC = gcc
@@ -15,9 +15,9 @@ SED = sed
 PERL = perl
 NULL = /dev/null
 CPPFLAGS = -E
-CFLAGS = -c -Wall -O2 -fomit-frame-pointer
+CFLAGS = -c -Wall -O2 -fomit-frame-pointer -fopenmp
 ASFLAGS = -c
-LDFLAGS = -s
+LDFLAGS = -s -fopenmp
 OPT_NORMAL = -funroll-loops
 OPT_INLINE = -finline-functions
 
@@ -441,9 +441,9 @@ solaris-x86-64-cc:
 	$(MAKE) $(PROJ) \
 		JOHN_OBJS="$(JOHN_OBJS_MINIMAL) x86-64.o" \
 		CC=cc \
-		CFLAGS="-c -fast -xarch=native64" \
+		CFLAGS="-c -fast -xarch=native64 -xopenmp" \
 		ASFLAGS="-c -xarch=native64" \
-		LDFLAGS="-s -xarch=native64 -lrt" \
+		LDFLAGS="-s -xarch=native64 -xopenmp -lrt" \
 		OPT_NORMAL="" \
 		OPT_INLINE="-xinline=s1,s2,s3,s4,s5,s6,s7,s8"
 
diff -urp john-1.7.5/src/bench.c john-1.7.5-omp-1/src/bench.c
--- john-1.7.5/src/bench.c	2009-09-09 04:10:40 +0000
+++ john-1.7.5-omp-1/src/bench.c	2010-05-08 13:39:44 +0000
@@ -1,6 +1,6 @@
 /*
  * This file is part of John the Ripper password cracker,
- * Copyright (c) 1996-2001,2003,2004,2006,2008,2009 by Solar Designer
+ * Copyright (c) 1996-2001,2003,2004,2006,2008-2010 by Solar Designer
  */
 
 #ifdef __ultrix__
@@ -158,6 +158,7 @@ char *benchmark_format(struct fmt_main *
 
 	start_real = times(&buf);
 	start_virtual = buf.tms_utime + buf.tms_stime;
+	start_virtual += buf.tms_cutime + buf.tms_cstime;
 	count = 0;
 
 	index = salts;
@@ -182,6 +183,7 @@ char *benchmark_format(struct fmt_main *
 
 	end_real = times(&buf);
 	end_virtual = buf.tms_utime + buf.tms_stime;
+	end_virtual += buf.tms_cutime + buf.tms_cstime;
 	if (end_virtual == start_virtual) end_virtual++;
 
 	results->real = end_real - start_real;
diff -urp john-1.7.5/src/params.h john-1.7.5-omp-1/src/params.h
--- john-1.7.5/src/params.h	2010-02-24 18:20:16 +0000
+++ john-1.7.5-omp-1/src/params.h	2010-05-08 13:43:22 +0000
@@ -15,7 +15,7 @@
 /*
  * John's version number.
  */
-#define JOHN_VERSION			"1.7.5"
+#define JOHN_VERSION			"1.7.5-omp-1"
 
 /*
  * Notes to packagers of John for *BSD "ports", Linux distributions, etc.:

Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ