[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Fri, 25 Feb 2011 11:26:46 +0300
From: Solar Designer <solar@...nwall.com>
To: john-users@...ts.openwall.com
Subject: Re: bitslice DES on AVX
Hi,
Here's another update. This time, I've added make targets for 32-bit
Linux systems (linux-x86-avx and linux-x86-xop). Like before, they're
tested under Intel's Software Development Emulator - the AVX one works
(approx. 40% slower than it does with linux-x86-64-avx under 64-bit
Linux on the same machine), the XOP one crashes on vpcmov with "Illegal
instruction", as expected.
Make targets for non-Linux systems may be added easily, but I did not
bother doing it yet.
The new patch (john-1.7.6-avx-3.diff) is attached to this message and
uploaded to the wiki:
http://openwall.info/wiki/john/patches
A more detailed announcement of the previous revision (most of what I
wrote there still applies):
http://www.openwall.com/lists/john-users/2011/02/23/10
If you try this out on a real CPU or on a XOP emulator, please post your
results to john-users. Seriously, with 1000+ subscribers we ought to
have someone with an AVX-capable CPU already. These have been available for
purchase since early January. Anyone?
A week ago, Phoronix published benchmarks of JtR 1.7.3.1 on a Core i7
2820QM mobile "Sandy Bridge" CPU clocked at 2.3 GHz (perhaps with Turbo
Boost to a higher frequency). According to those, JtR does over
3.8M c/s on a single core with its pre-generated SSE2 assembly code
(which is why there's almost no difference across C compilers):
http://www.phoronix.com/scan.php?page=article&item=intel_snb_llvm&num=2
This is quite impressive. I wonder how a CPU like this will perform
when we go from SSE2 instructions (2-op 128-bit) to AVX (3-op 256-bit
or 128-bit).
Thanks,
Alexander
diff -urp john-1.7.6.orig/src/DES_bs_b.c john-1.7.6/src/DES_bs_b.c
--- john-1.7.6.orig/src/DES_bs_b.c 2010-06-13 18:17:29 +0000
+++ john-1.7.6/src/DES_bs_b.c 2011-02-23 07:45:53 +0000
@@ -1,6 +1,6 @@
/*
* This file is part of John the Ripper password cracker,
- * Copyright (c) 1996-2001,2003,2010 by Solar Designer
+ * Copyright (c) 1996-2001,2003,2010,2011 by Solar Designer
*/
#include "arch.h"
@@ -127,14 +127,56 @@ typedef struct {
(dst).f = vec_sel((a).f, (b).f, (c).f); \
(dst).g = vec_sel((a).g, (b).g, (c).g)
+#elif defined(__AVX__) && DES_BS_DEPTH == 256
+#undef DES_BS_VECTOR
+
+#include <immintrin.h>
+
+/* Not __m256i because bitwise ops are "floating-point" with AVX */
+typedef __m256 vtype;
+
+#define vst(dst, ofs, src) \
+ _mm256_store_ps((float *)((DES_bs_vector *)&(dst) + (ofs)), (src))
+
+#define vxorf(a, b) \
+ _mm256_xor_ps((a), (b))
+
+#define vnot(dst, a) \
+ (dst) = _mm256_xor_ps((a), *(vtype *)&DES_bs_all.ones)
+#define vand(dst, a, b) \
+ (dst) = _mm256_and_ps((a), (b))
+#define vor(dst, a, b) \
+ (dst) = _mm256_or_ps((a), (b))
+#define vandn(dst, a, b) \
+ (dst) = _mm256_andnot_ps((b), (a))
+#define vxorn(dst, a, b) \
+ (dst) = _mm256_xor_ps(_mm256_xor_ps((a), (b)), \
+ *(vtype *)&DES_bs_all.ones)
+
+#ifdef __XOP__
+#define vnor(dst, a, b) \
+ (dst) = _mm256_xor_ps(_mm256_or_ps((a), (b)), \
+ *(vtype *)&DES_bs_all.ones)
+/* This could be _mm256_cmov_ps(), but it does not exist (yet?) */
+#define vsel(dst, a, b, c) \
+ (dst) = __builtin_ia32_vpcmov_v8sf256((b), (a), (c))
+#endif
+
#elif defined(__SSE2__) && DES_BS_DEPTH == 128
#undef DES_BS_VECTOR
-#ifdef __GNUC__
+#if defined(__GNUC__) && !defined(__AVX__)
#warning Notice: with recent versions of gcc, we are currently using SSE2 intrinsics instead of the supplied SSE2 assembly code. This choice is made in the x86-*.h file.
#endif
+#ifdef __AVX__
+#include <immintrin.h>
+#ifdef __XOP__
+#include <x86intrin.h>
+#endif
+#else
#include <emmintrin.h>
+#endif
typedef __m128i vtype;
@@ -156,6 +198,14 @@ typedef __m128i vtype;
(dst) = _mm_xor_si128(_mm_xor_si128((a), (b)), \
*(vtype *)&DES_bs_all.ones)
+#ifdef __XOP__
+#define vnor(dst, a, b) \
+ (dst) = _mm_xor_si128(_mm_or_si128((a), (b)), \
+ *(vtype *)&DES_bs_all.ones)
+#define vsel(dst, a, b, c) \
+ (dst) = _mm_cmov_si128((b), (a), (c))
+#endif
+
#elif defined(__SSE2__) && defined(__MMX__) && DES_BS_DEPTH == 192 && \
!defined(DES_BS_NO_MMX)
#undef DES_BS_VECTOR
diff -urp john-1.7.6.orig/src/Makefile john-1.7.6/src/Makefile
--- john-1.7.6.orig/src/Makefile 2010-06-13 21:12:37 +0000
+++ john-1.7.6/src/Makefile 2011-02-25 07:19:33 +0000
@@ -1,6 +1,6 @@
#
# This file is part of John the Ripper password cracker,
-# Copyright (c) 1996-2010 by Solar Designer
+# Copyright (c) 1996-2011 by Solar Designer
#
CC = gcc
@@ -73,12 +73,16 @@ default:
@echo "To build John the Ripper, type:"
@echo " make clean SYSTEM"
@echo "where SYSTEM can be one of the following:"
- @echo "linux-x86-64 Linux, x86-64 with SSE2 (best)"
+ @echo "linux-x86-64 Linux, x86-64 with SSE2 (best tested)"
+ @echo "linux-x86-64-avx Linux, x86-64 with AVX (experimental)"
+ @echo "linux-x86-64-xop Linux, x86-64 with AVX and XOP (experimental)"
# @echo "linux-x86-64-32-sse2 Linux, x86-64, 32-bit with SSE2"
# @echo "linux-x86-64-32-mmx Linux, x86-64, 32-bit with MMX"
- @echo "linux-x86-sse2 Linux, x86 with SSE2 (best if 32-bit)"
- @echo "linux-x86-mmx Linux, x86 with MMX"
- @echo "linux-x86-any Linux, x86"
+ @echo "linux-x86-sse2 Linux, x86 32-bit with SSE2 (best tested if 32-bit)"
+ @echo "linux-x86-mmx Linux, x86 32-bit with MMX (for old computers)"
+ @echo "linux-x86-any Linux, x86 32-bit (for truly ancient computers)"
+ @echo "linux-x86-avx Linux, x86 32-bit with AVX (experimental)"
+ @echo "linux-x86-xop Linux, x86 32-bit with AVX and XOP (experimental)"
@echo "linux-alpha Linux, Alpha"
@echo "linux-sparc Linux, SPARC 32-bit"
@echo "linux-ppc32-altivec Linux, PowerPC w/AltiVec (best)"
@@ -150,6 +154,20 @@ linux-x86-64:
CFLAGS="$(CFLAGS) -DHAVE_CRYPT" \
LDFLAGS="$(LDFLAGS) -lcrypt"
+linux-x86-64-avx:
+ $(LN) x86-64.h arch.h
+ $(MAKE) $(PROJ) \
+ JOHN_OBJS="$(JOHN_OBJS) c3_fmt.o" \
+ CFLAGS="$(CFLAGS) -mavx -DHAVE_CRYPT" \
+ LDFLAGS="$(LDFLAGS) -lcrypt"
+
+linux-x86-64-xop:
+ $(LN) x86-64.h arch.h
+ $(MAKE) $(PROJ) \
+ JOHN_OBJS="$(JOHN_OBJS) c3_fmt.o" \
+ CFLAGS="$(CFLAGS) -mxop -DHAVE_CRYPT" \
+ LDFLAGS="$(LDFLAGS) -lcrypt"
+
linux-x86-64-32-sse2:
$(LN) x86-sse.h arch.h
$(MAKE) $(PROJ) \
@@ -187,6 +205,20 @@ linux-x86-any:
CFLAGS="$(CFLAGS) -DHAVE_CRYPT" \
LDFLAGS="$(LDFLAGS) -lcrypt"
+linux-x86-avx:
+ $(LN) x86-sse.h arch.h
+ $(MAKE) $(PROJ) \
+ JOHN_OBJS="$(JOHN_OBJS) c3_fmt.o x86.o" \
+ CFLAGS="$(CFLAGS) -mavx -DHAVE_CRYPT" \
+ LDFLAGS="$(LDFLAGS) -lcrypt"
+
+linux-x86-xop:
+ $(LN) x86-sse.h arch.h
+ $(MAKE) $(PROJ) \
+ JOHN_OBJS="$(JOHN_OBJS) c3_fmt.o x86.o" \
+ CFLAGS="$(CFLAGS) -mxop -DHAVE_CRYPT" \
+ LDFLAGS="$(LDFLAGS) -lcrypt"
+
linux-alpha:
$(LN) alpha.h arch.h
$(MAKE) $(PROJ) \
diff -urp john-1.7.6.orig/src/params.h john-1.7.6/src/params.h
--- john-1.7.6.orig/src/params.h 2010-06-14 02:38:55 +0000
+++ john-1.7.6/src/params.h 2011-02-25 07:32:15 +0000
@@ -15,7 +15,7 @@
/*
* John's version number.
*/
-#define JOHN_VERSION "1.7.6"
+#define JOHN_VERSION "1.7.6-avx-3"
/*
* Notes to packagers of John for *BSD "ports", Linux distributions, etc.:
Only in john-1.7.6/src: sde-log.txt
diff -urp john-1.7.6.orig/src/x86-64.h john-1.7.6/src/x86-64.h
--- john-1.7.6.orig/src/x86-64.h 2010-06-13 00:33:38 +0000
+++ john-1.7.6/src/x86-64.h 2011-02-25 07:13:38 +0000
@@ -1,6 +1,6 @@
/*
* This file is part of John the Ripper password cracker,
- * Copyright (c) 2003,2006,2008,2010 by Solar Designer
+ * Copyright (c) 2003,2006,2008,2010,2011 by Solar Designer
*/
/*
@@ -32,7 +32,30 @@
#define DES_SCALE 1
#define DES_EXTB 1
#define DES_COPY 0
-#if defined(__SSE2__) && \
+#define DES_BS 1
+#ifdef __AVX__
+#define DES_BS_ASM 0
+#if 1
+#define DES_BS_VECTOR 4
+#if defined(__XOP__) && defined(__GNUC__)
+/* Require gcc for 256-bit XOP because of __builtin_ia32_vpcmov_v8sf256() */
+#undef DES_BS
+#define DES_BS 3
+#define DES_BS_ALGORITHM_NAME "256/256 BS XOP"
+#else
+#define DES_BS_ALGORITHM_NAME "256/256 BS AVX"
+#endif
+#else
+#define DES_BS_VECTOR 2
+#ifdef __XOP__
+#undef DES_BS
+#define DES_BS 3
+#define DES_BS_ALGORITHM_NAME "128/256 BS XOP"
+#else
+#define DES_BS_ALGORITHM_NAME "128/256 BS AVX"
+#endif
+#endif
+#elif defined(__SSE2__) && \
((__GNUC__ == 4 && __GNUC_MINOR__ >= 4) || __GNUC__ > 4)
#define DES_BS_ASM 0
#if 1
@@ -56,7 +79,6 @@
#define DES_BS_VECTOR 2
#define DES_BS_ALGORITHM_NAME "128/128 BS SSE2-16"
#endif
-#define DES_BS 1
#define DES_BS_EXPAND 1
#define MD5_ASM 0
diff -urp john-1.7.6.orig/src/x86-sse.h john-1.7.6/src/x86-sse.h
--- john-1.7.6.orig/src/x86-sse.h 2010-06-09 20:03:53 +0000
+++ john-1.7.6/src/x86-sse.h 2011-02-25 07:27:32 +0000
@@ -1,6 +1,6 @@
/*
* This file is part of John the Ripper password cracker,
- * Copyright (c) 1996-2002,2005,2006,2008,2010 by Solar Designer
+ * Copyright (c) 1996-2002,2005,2006,2008,2010,2011 by Solar Designer
*/
/*
@@ -45,7 +45,30 @@
#define DES_EXTB 0
#define DES_COPY 1
#define DES_STD_ALGORITHM_NAME "48/64 4K MMX"
-#if defined(__SSE2__) && 0
+#define DES_BS 1
+#ifdef __AVX__
+#define DES_BS_ASM 0
+#if 1
+#define DES_BS_VECTOR 8
+#if defined(__XOP__) && defined(__GNUC__)
+/* Require gcc for 256-bit XOP because of __builtin_ia32_vpcmov_v8sf256() */
+#undef DES_BS
+#define DES_BS 3
+#define DES_BS_ALGORITHM_NAME "256/256 BS XOP"
+#else
+#define DES_BS_ALGORITHM_NAME "256/256 BS AVX"
+#endif
+#else
+#define DES_BS_VECTOR 4
+#ifdef __XOP__
+#undef DES_BS
+#define DES_BS 3
+#define DES_BS_ALGORITHM_NAME "128/256 BS XOP"
+#else
+#define DES_BS_ALGORITHM_NAME "128/256 BS AVX"
+#endif
+#endif
+#elif defined(__SSE2__) && 0
#define DES_BS_ASM 0
#if 1
#define DES_BS_VECTOR 4
@@ -68,7 +91,6 @@
#define DES_BS_VECTOR 4
#define DES_BS_ALGORITHM_NAME "128/128 BS SSE2"
#endif
-#define DES_BS 1
#define DES_BS_EXPAND 1
#define MD5_ASM 1
Powered by blists - more mailing lists
Powered by Openwall GNU/*/Linux -
Powered by OpenVZ