[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date: Sat, 8 May 2010 18:56:20 +0400
From: Solar Designer <solar@...nwall.com>
To: john-users@...ts.openwall.com
Subject: OpenMP
Hi,
Some of you might recall my past comments on OpenMP:
http://www.openwall.com/lists/john-users/2006/05/23/1
http://openwall.info/wiki/internal/gcc-local-build#Application-to-John-the-Ripper
While this is not the most efficient way to parallelize JtR, it does
have its advantages - most notably, simplicity of code changes (when we
talk about a single hash type) and ease of use. So I have sort of
temporarily "given up" (procrastinating a "proper" parallel processing
implementation for JtR) and implemented OpenMP support for one of the
hash types.
Attached is a patch against JtR 1.7.5 to crack OpenBSD bcrypt hashes
fast. ;-) I've also uploaded the patch to the wiki:
http://openwall.info/wiki/john/patches
It uses OpenMP - tested with gcc 4.5.0 (on Linux) and Sun Studio 12 (on
Solaris). The only new requirement is an OpenMP-capable C compiler,
such as recent gcc. Overall, this is easier and more reliable to use
than "MPI-patched" builds of JtR are, but drawbacks do exist as well
(per-hash-type code and a performance hit - see below).
I've measured the efficiency (vs. multiple separate-process instances of
JtR) to be as high as 98.5% for a build with gcc on otherwise idle Linux
systems with Intel CPUs. With Sun Studio, Solaris, and Opterons, it was
down to between 78% and 91% (I did not investigate why). Multi-threaded
code involves more complicated addressing modes and synchronization
between the threads, so some performance hit (vs. multiple separate
processes) is to be expected - but the ease of use is great.
Core i7 920 2.67 GHz:
Benchmarking: OpenBSD Blowfish (x32) [32/64 X2]... DONE
Raw: 3698 c/s real, 462 c/s virtual
The first number is actual speed, the second is per-thread (8 threads,
so the benchmark uses roughly 8 times more CPU time than real time).
That's 5.27x the speed of a non-threaded build (which does 702 c/s) -
not bad for a quad-core with SMT.
Dual Xeon X5460 3.16 GHz:
Benchmarking: OpenBSD Blowfish (x32) [32/64 X2]... DONE
Raw: 6528 c/s real, 816 c/s virtual
Now this is a "true" 8-core system, and we get 7.91x the speed of a
non-threaded build (which does 825 c/s).
In both cases, each thread was computing 2 hashes in parallel (for
greater instruction-level parallelism), so there were a total of 16
hashes being computed in parallel.
With gcc/Linux, the number of threads to run equals the number of
logical CPUs by default (e.g., 8 on a Core i7 with Hyperthreading).
It may be adjusted with the OMP_NUM_THREADS environment variable.
With Sun Studio, setting this variable is mandatory (otherwise only a
single thread is run). For example:
OMP_NUM_THREADS=4 ../run/john --test=1 --format=bf
It may also make sense to set OMP_WAIT_POLICY=PASSIVE to free up a
little bit of CPU time (make it idle). At least in theory, this may
improve the real c/s rate on CPUs with SMT, but make it a little bit
worse on others.
Here's how to quickly build gcc 4.5.0 as a user:
http://openwall.info/wiki/internal/gcc-local-build
The first (simpler) build shown on this wiki page will do.
Have fun. As usual, any feedback is welcome.
Alexander
diff -urp john-1.7.5/src/BF_fmt.c john-1.7.5-omp-1/src/BF_fmt.c
--- john-1.7.5/src/BF_fmt.c 2010-01-16 17:20:38 +0000
+++ john-1.7.5-omp-1/src/BF_fmt.c 2010-05-08 12:29:58 +0000
@@ -149,7 +149,13 @@ static void crypt_all(int count)
static int cmp_all(void *binary, int count)
{
-#if BF_X2
+#if BF_N > 2
+ int i;
+ for (i = 0; i < count; i++)
+ if (*(BF_word *)binary == BF_out[i][0])
+ return 1;
+ return 0;
+#elif BF_N == 2
return
*(BF_word *)binary == BF_out[0][0] ||
*(BF_word *)binary == BF_out[1][0];
diff -urp john-1.7.5/src/BF_std.c john-1.7.5-omp-1/src/BF_std.c
--- john-1.7.5/src/BF_std.c 2008-06-22 00:37:57 +0000
+++ john-1.7.5-omp-1/src/BF_std.c 2010-05-08 13:37:31 +0000
@@ -1,6 +1,6 @@
/*
* This file is part of John the Ripper password cracker,
- * Copyright (c) 1996-2001,2008 by Solar Designer
+ * Copyright (c) 1996-2001,2008,2010 by Solar Designer
*
* A public domain version of this code, with reentrant and crypt(3)
* interfaces added, but optimizations specific to password cracking
@@ -41,7 +41,7 @@ struct BF_ctx {
BF_key P;
};
-#if BF_X2
+#if BF_N > 1
#define INDICES [BF_N]
#define INDEX [index]
#define INDEX0 [index]
@@ -54,6 +54,35 @@ struct BF_ctx {
#define for_each_index()
#endif
+#if BF_X2
+#if BF_mt > 1
+#define INDEX2 [index & 1]
+#else
+#define INDEX2 [index]
+#endif
+#else
+#define INDEX2
+#endif
+
+#if BF_mt > 1
+#if BF_X2
+#define for_each_t() \
+ for (t = 0; t < BF_N; t += 2)
+#define for_each_ti() \
+ for (index = t; index <= t + 1; index++)
+#else
+#define for_each_t() \
+ for (t = 0; t < BF_N; t++)
+#define for_each_ti() \
+ index = t;
+#endif
+#else
+#define for_each_t()
+#define for_each_ti() \
+ for_each_index()
+#endif
+
+#if BF_mt == 1
/* Current Blowfish context */
#if BF_ASM
extern
@@ -61,6 +90,7 @@ extern
static
#endif
struct BF_ctx CC_CACHE_ALIGN BF_current INDICES;
+#endif
/* Current Blowfish key */
static BF_key CC_CACHE_ALIGN BF_exp_key INDICES;
@@ -569,120 +599,163 @@ void BF_std_set_key(char *key, int index
void BF_std_crypt(BF_salt salt)
{
- BF_word L0, R0;
- BF_word u1, u2, u3, u4;
+#if BF_mt > 1
+ int t;
+#endif
+
+#if BF_mt > 1 && defined(_OPENMP)
+#pragma omp parallel for default(none) private(t) shared(BF_init_state, BF_init_key, BF_exp_key, salt, BF_magic_w, BF_out)
+#endif
+ for_each_t() {
+#if BF_mt > 1
#if BF_X2
- BF_word L1, R1;
- BF_word v1, v2, v3, v4;
- int index;
+ struct BF_ctx BF_current[2];
+#else
+ struct BF_ctx BF_current;
+#endif
#endif
- BF_word *ptr;
- BF_word count;
- int i;
- for_each_index() {
- memcpy(BF_current INDEX.S,
- BF_init_state.S, sizeof(BF_current INDEX.S));
- memcpy(BF_current INDEX.P,
- BF_init_key INDEX, sizeof(BF_current INDEX.P));
- }
+ BF_word L0, R0;
+ BF_word u1, u2, u3, u4;
+#if BF_X2
+ BF_word L1, R1;
+ BF_word v1, v2, v3, v4;
+#endif
+ BF_word *ptr;
+ BF_word count;
+#if BF_N > 1
+ int index;
+#endif
+
+ for_each_ti() {
+ int i;
- for_each_index() {
- L0 = R0 = 0;
- for (i = 0; i < BF_ROUNDS + 2; i += 2) {
- L0 ^= salt[i & 2];
- R0 ^= salt[(i & 2) + 1];
- BF_ENCRYPT(BF_current INDEX, L0, R0);
- BF_current INDEX.P[i] = L0;
- BF_current INDEX.P[i + 1] = R0;
+ memcpy(BF_current INDEX2.S,
+ BF_init_state.S, sizeof(BF_current INDEX2.S));
+ memcpy(BF_current INDEX2.P,
+ BF_init_key INDEX, sizeof(BF_current INDEX2.P));
+
+ L0 = R0 = 0;
+ for (i = 0; i < BF_ROUNDS + 2; i += 2) {
+ L0 ^= salt[i & 2];
+ R0 ^= salt[(i & 2) + 1];
+ BF_ENCRYPT(BF_current INDEX2, L0, R0);
+ BF_current INDEX2.P[i] = L0;
+ BF_current INDEX2.P[i + 1] = R0;
+ }
+
+ ptr = BF_current INDEX2.S[0];
+ do {
+ ptr += 4;
+ L0 ^= salt[(BF_ROUNDS + 2) & 3];
+ R0 ^= salt[(BF_ROUNDS + 3) & 3];
+ BF_ENCRYPT(BF_current INDEX2, L0, R0);
+ *(ptr - 4) = L0;
+ *(ptr - 3) = R0;
+
+ L0 ^= salt[(BF_ROUNDS + 4) & 3];
+ R0 ^= salt[(BF_ROUNDS + 5) & 3];
+ BF_ENCRYPT(BF_current INDEX2, L0, R0);
+ *(ptr - 2) = L0;
+ *(ptr - 1) = R0;
+ } while (ptr < &BF_current INDEX2.S[3][0xFF]);
}
- ptr = BF_current INDEX.S[0];
+ count = salt[4];
do {
- ptr += 4;
- L0 ^= salt[(BF_ROUNDS + 2) & 3];
- R0 ^= salt[(BF_ROUNDS + 3) & 3];
- BF_ENCRYPT(BF_current INDEX, L0, R0);
- *(ptr - 4) = L0;
- *(ptr - 3) = R0;
-
- L0 ^= salt[(BF_ROUNDS + 4) & 3];
- R0 ^= salt[(BF_ROUNDS + 5) & 3];
- BF_ENCRYPT(BF_current INDEX, L0, R0);
- *(ptr - 2) = L0;
- *(ptr - 1) = R0;
- } while (ptr < &BF_current INDEX.S[3][0xFF]);
- }
+ for_each_ti() {
+ BF_current INDEX2.P[0] ^= BF_exp_key INDEX[0];
+ BF_current INDEX2.P[1] ^= BF_exp_key INDEX[1];
+ BF_current INDEX2.P[2] ^= BF_exp_key INDEX[2];
+ BF_current INDEX2.P[3] ^= BF_exp_key INDEX[3];
+ BF_current INDEX2.P[4] ^= BF_exp_key INDEX[4];
+ BF_current INDEX2.P[5] ^= BF_exp_key INDEX[5];
+ BF_current INDEX2.P[6] ^= BF_exp_key INDEX[6];
+ BF_current INDEX2.P[7] ^= BF_exp_key INDEX[7];
+ BF_current INDEX2.P[8] ^= BF_exp_key INDEX[8];
+ BF_current INDEX2.P[9] ^= BF_exp_key INDEX[9];
+ BF_current INDEX2.P[10] ^= BF_exp_key INDEX[10];
+ BF_current INDEX2.P[11] ^= BF_exp_key INDEX[11];
+ BF_current INDEX2.P[12] ^= BF_exp_key INDEX[12];
+ BF_current INDEX2.P[13] ^= BF_exp_key INDEX[13];
+ BF_current INDEX2.P[14] ^= BF_exp_key INDEX[14];
+ BF_current INDEX2.P[15] ^= BF_exp_key INDEX[15];
+ BF_current INDEX2.P[16] ^= BF_exp_key INDEX[16];
+ BF_current INDEX2.P[17] ^= BF_exp_key INDEX[17];
+ }
+
+ BF_body();
+
+ u1 = salt[0];
+ u2 = salt[1];
+ u3 = salt[2];
+ u4 = salt[3];
+ for_each_ti() {
+ BF_current INDEX2.P[0] ^= u1;
+ BF_current INDEX2.P[1] ^= u2;
+ BF_current INDEX2.P[2] ^= u3;
+ BF_current INDEX2.P[3] ^= u4;
+ BF_current INDEX2.P[4] ^= u1;
+ BF_current INDEX2.P[5] ^= u2;
+ BF_current INDEX2.P[6] ^= u3;
+ BF_current INDEX2.P[7] ^= u4;
+ BF_current INDEX2.P[8] ^= u1;
+ BF_current INDEX2.P[9] ^= u2;
+ BF_current INDEX2.P[10] ^= u3;
+ BF_current INDEX2.P[11] ^= u4;
+ BF_current INDEX2.P[12] ^= u1;
+ BF_current INDEX2.P[13] ^= u2;
+ BF_current INDEX2.P[14] ^= u3;
+ BF_current INDEX2.P[15] ^= u4;
+ BF_current INDEX2.P[16] ^= u1;
+ BF_current INDEX2.P[17] ^= u2;
+ }
- count = salt[4];
- do {
- for_each_index() {
- BF_current INDEX.P[0] ^= BF_exp_key INDEX[0];
- BF_current INDEX.P[1] ^= BF_exp_key INDEX[1];
- BF_current INDEX.P[2] ^= BF_exp_key INDEX[2];
- BF_current INDEX.P[3] ^= BF_exp_key INDEX[3];
- BF_current INDEX.P[4] ^= BF_exp_key INDEX[4];
- BF_current INDEX.P[5] ^= BF_exp_key INDEX[5];
- BF_current INDEX.P[6] ^= BF_exp_key INDEX[6];
- BF_current INDEX.P[7] ^= BF_exp_key INDEX[7];
- BF_current INDEX.P[8] ^= BF_exp_key INDEX[8];
- BF_current INDEX.P[9] ^= BF_exp_key INDEX[9];
- BF_current INDEX.P[10] ^= BF_exp_key INDEX[10];
- BF_current INDEX.P[11] ^= BF_exp_key INDEX[11];
- BF_current INDEX.P[12] ^= BF_exp_key INDEX[12];
- BF_current INDEX.P[13] ^= BF_exp_key INDEX[13];
- BF_current INDEX.P[14] ^= BF_exp_key INDEX[14];
- BF_current INDEX.P[15] ^= BF_exp_key INDEX[15];
- BF_current INDEX.P[16] ^= BF_exp_key INDEX[16];
- BF_current INDEX.P[17] ^= BF_exp_key INDEX[17];
- }
+ BF_body();
+ } while (--count);
- BF_body();
+#if BF_mt == 1
+ for_each_ti() {
+ L0 = BF_magic_w[0];
+ R0 = BF_magic_w[1];
+
+ count = 64;
+ do {
+ BF_ENCRYPT(BF_current INDEX, L0, R0);
+ } while (--count);
- u1 = salt[0];
- u2 = salt[1];
- u3 = salt[2];
- u4 = salt[3];
- for_each_index() {
- BF_current INDEX.P[0] ^= u1;
- BF_current INDEX.P[1] ^= u2;
- BF_current INDEX.P[2] ^= u3;
- BF_current INDEX.P[3] ^= u4;
- BF_current INDEX.P[4] ^= u1;
- BF_current INDEX.P[5] ^= u2;
- BF_current INDEX.P[6] ^= u3;
- BF_current INDEX.P[7] ^= u4;
- BF_current INDEX.P[8] ^= u1;
- BF_current INDEX.P[9] ^= u2;
- BF_current INDEX.P[10] ^= u3;
- BF_current INDEX.P[11] ^= u4;
- BF_current INDEX.P[12] ^= u1;
- BF_current INDEX.P[13] ^= u2;
- BF_current INDEX.P[14] ^= u3;
- BF_current INDEX.P[15] ^= u4;
- BF_current INDEX.P[16] ^= u1;
- BF_current INDEX.P[17] ^= u2;
+ BF_out INDEX0[0] = L0;
+ BF_out INDEX0[1] = R0;
}
+#else
+ for_each_ti() {
+ BF_word L, R;
+ BF_word u1, u2, u3, u4;
+ BF_word count;
+ int i;
+
+ memcpy(&BF_out[index], &BF_magic_w, sizeof(BF_out[index]));
+
+ count = 64;
+ do
+ for (i = 0; i < 6; i += 2) {
+ L = BF_out[index][i];
+ R = BF_out[index][i + 1];
+ BF_ENCRYPT(BF_current INDEX2, L, R);
+ BF_out[index][i] = L;
+ BF_out[index][i + 1] = R;
+ } while (--count);
- BF_body();
- } while (--count);
-
- for_each_index() {
- L0 = BF_magic_w[0];
- R0 = BF_magic_w[1];
-
- count = 64;
- do {
- BF_ENCRYPT(BF_current INDEX, L0, R0);
- } while (--count);
-
- BF_out INDEX0[0] = L0;
- BF_out INDEX0[1] = R0;
+/* This has to be bug-compatible with the original implementation :-) */
+ BF_out[index][5] &= ~(BF_word)0xFF;
+ }
+#endif
}
}
void BF_std_crypt_exact(int index)
{
+#if BF_mt == 1
BF_word L, R;
BF_word u1, u2, u3, u4;
BF_word count;
@@ -702,6 +775,7 @@ void BF_std_crypt_exact(int index)
/* This has to be bug-compatible with the original implementation :-) */
BF_out[index][5] &= ~(BF_word)0xFF;
+#endif
}
/*
diff -urp john-1.7.5/src/BF_std.h john-1.7.5-omp-1/src/BF_std.h
--- john-1.7.5/src/BF_std.h 2008-06-21 23:24:37 +0000
+++ john-1.7.5-omp-1/src/BF_std.h 2010-05-08 13:38:32 +0000
@@ -1,6 +1,6 @@
/*
* This file is part of John the Ripper password cracker,
- * Copyright (c) 1996-2001,2008 by Solar Designer
+ * Copyright (c) 1996-2001,2008,2010 by Solar Designer
*/
/*
@@ -25,10 +25,16 @@ typedef BF_word BF_salt[4 + 1];
*/
typedef BF_word BF_binary[6];
+#ifdef _OPENMP
+#define BF_mt 24
+#else
+#define BF_mt 1
+#endif
+
#if BF_X2
-#define BF_N 2
+#define BF_N (2 * BF_mt)
#else
-#define BF_N 1
+#define BF_N (BF_mt)
#endif
/*
diff -urp john-1.7.5/src/Makefile john-1.7.5-omp-1/src/Makefile
--- john-1.7.5/src/Makefile 2009-12-17 19:11:03 +0000
+++ john-1.7.5-omp-1/src/Makefile 2010-05-08 13:47:59 +0000
@@ -1,6 +1,6 @@
#
# This file is part of John the Ripper password cracker,
-# Copyright (c) 1996-2009 by Solar Designer
+# Copyright (c) 1996-2010 by Solar Designer
#
CC = gcc
@@ -15,9 +15,9 @@ SED = sed
PERL = perl
NULL = /dev/null
CPPFLAGS = -E
-CFLAGS = -c -Wall -O2 -fomit-frame-pointer
+CFLAGS = -c -Wall -O2 -fomit-frame-pointer -fopenmp
ASFLAGS = -c
-LDFLAGS = -s
+LDFLAGS = -s -fopenmp
OPT_NORMAL = -funroll-loops
OPT_INLINE = -finline-functions
@@ -441,9 +441,9 @@ solaris-x86-64-cc:
$(MAKE) $(PROJ) \
JOHN_OBJS="$(JOHN_OBJS_MINIMAL) x86-64.o" \
CC=cc \
- CFLAGS="-c -fast -xarch=native64" \
+ CFLAGS="-c -fast -xarch=native64 -xopenmp" \
ASFLAGS="-c -xarch=native64" \
- LDFLAGS="-s -xarch=native64 -lrt" \
+ LDFLAGS="-s -xarch=native64 -xopenmp -lrt" \
OPT_NORMAL="" \
OPT_INLINE="-xinline=s1,s2,s3,s4,s5,s6,s7,s8"
diff -urp john-1.7.5/src/bench.c john-1.7.5-omp-1/src/bench.c
--- john-1.7.5/src/bench.c 2009-09-09 04:10:40 +0000
+++ john-1.7.5-omp-1/src/bench.c 2010-05-08 13:39:44 +0000
@@ -1,6 +1,6 @@
/*
* This file is part of John the Ripper password cracker,
- * Copyright (c) 1996-2001,2003,2004,2006,2008,2009 by Solar Designer
+ * Copyright (c) 1996-2001,2003,2004,2006,2008-2010 by Solar Designer
*/
#ifdef __ultrix__
@@ -158,6 +158,7 @@ char *benchmark_format(struct fmt_main *
start_real = times(&buf);
start_virtual = buf.tms_utime + buf.tms_stime;
+ start_virtual += buf.tms_cutime + buf.tms_cstime;
count = 0;
index = salts;
@@ -182,6 +183,7 @@ char *benchmark_format(struct fmt_main *
end_real = times(&buf);
end_virtual = buf.tms_utime + buf.tms_stime;
+ end_virtual += buf.tms_cutime + buf.tms_cstime;
if (end_virtual == start_virtual) end_virtual++;
results->real = end_real - start_real;
diff -urp john-1.7.5/src/params.h john-1.7.5-omp-1/src/params.h
--- john-1.7.5/src/params.h 2010-02-24 18:20:16 +0000
+++ john-1.7.5-omp-1/src/params.h 2010-05-08 13:43:22 +0000
@@ -15,7 +15,7 @@
/*
* John's version number.
*/
-#define JOHN_VERSION "1.7.5"
+#define JOHN_VERSION "1.7.5-omp-1"
/*
* Notes to packagers of John for *BSD "ports", Linux distributions, etc.:
Powered by blists - more mailing lists
Powered by Openwall GNU/*/Linux -
Powered by OpenVZ