![]() |
|
Message-ID: <cc6e296990663cdca418e853a914e66b@smtp.hushmail.com> Date: Mon, 15 Apr 2013 01:53:32 +0200 From: magnum <john.magnum@...hmail.com> To: john-dev@...ts.openwall.com Subject: Re: [patch] sse/xop implementation of raw-sha512 On 15 Apr, 2013, at 0:41 , magnum <john.magnum@...hmail.com> wrote: > With the fixed version I do get a 2x speedup on Intel, and this despite it being only SSE2 (because Apple's silly OSX assembler can't handle AVX and stuff). > > $ ../run/john -t -fo:raw-sha512* > Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... DONE > Raw: 2890K c/s real, 2890K c/s virtual > > Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... DONE > Raw: 1449K c/s real, 1449K c/s virtual > > ...and just over 2x on Bull: > > $ ../run/john -t -fo:raw-sha512* > Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... DONE > Raw: 3813K c/s real, 3778K c/s virtual > > Benchmarking: Raw SHA-512 [64/64 OpenSSL]... DONE > Raw: 1889K c/s real, 1889K c/s virtual I added OMP support. It scales well on Intel, but worse on AMD: magnum@...r-osx:src [bleeding-jumbo]$ for i in 1 4 8; do OMP_NUM_THREADS=$i ../run/john -t -fo:raw-sha512* ; done Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... DONE Raw: 2985K c/s real, 2985K c/s virtual Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... DONE Raw: 1499K c/s real, 1484K c/s virtual All 2 formats passed self-tests! Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... (4xOMP) DONE Raw: 8011K c/s real, 2644K c/s virtual Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... (4xOMP) DONE Raw: 4014K c/s real, 1238K c/s virtual All 2 formats passed self-tests! Benchmarking: Raw SHA-512 [128/128 SSE2 intrinsics 2x]... (8xOMP) DONE Raw: 8093K c/s real, 1574K c/s virtual Benchmarking: Raw SHA-512 [64/64 CommonCrypto]... (8xOMP) DONE Raw: 5079K c/s real, 843694 c/s virtual All 2 formats passed self-tests! The drop at 8x is because it's HT and there are only 4 real cores. 
magnum@...l:src [bleeding-jumbo]$ for i in 1 4 8; do OMP_NUM_THREADS=$i ../run/john -t -fo:raw-sha512* ; done Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... DONE Raw: 3207K c/s real, 3207K c/s virtual Benchmarking: Raw SHA-512 [64/64 OpenSSL]... DONE Raw: 1822K c/s real, 1822K c/s virtual All 2 formats passed self-tests! Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... (4xOMP) DONE Raw: 7012K c/s real, 1753K c/s virtual Benchmarking: Raw SHA-512 [64/64 OpenSSL]... (4xOMP) DONE Raw: 5215K c/s real, 1316K c/s virtual All 2 formats passed self-tests! Benchmarking: Raw SHA-512 [128/128 XOP intrinsics 2x]... (8xOMP) DONE Raw: 8077K c/s real, 1008K c/s virtual Benchmarking: Raw SHA-512 [64/64 OpenSSL]... (8xOMP) DONE Raw: 7072K c/s real, 891813 c/s virtual All 2 formats passed self-tests! Maybe it can be tweaked. I already use a lower OMP_SCALE for XOP but I haven't looked much into it. This is committed already but here's a diff -w: diff --git a/src/rawSHA512_ng_fmt.c b/src/rawSHA512_ng_fmt.c index 6bcb3da..c58ada5 100644 --- a/src/rawSHA512_ng_fmt.c +++ b/src/rawSHA512_ng_fmt.c @@ -7,6 +7,15 @@ */ +#ifdef _OPENMP +#include <omp.h> +#if defined __XOP__ +#define OMP_SCALE 1024 /* AMD */ +#else +#define OMP_SCALE 2048 /* Intel */ +#endif +#endif + #include "arch.h" #ifdef MMX_COEF @@ -95,7 +104,7 @@ #define GATHER(x,y,z) \ { \ x = _mm_setzero_si128 (); \ - x = _mm_set_epi64x (y[1][z], y[0][z]); \ + x = _mm_set_epi64x (y[index + 1][z], y[index][z]); \ } #define S0(x) \ @@ -176,13 +185,25 @@ static struct fmt_tests tests[] = { {NULL} }; -#ifdef _MSC_VER -__declspec(align(16)) static uint64_t saved_key[VWIDTH][80]; -__declspec(align(16)) static uint64_t crypt_key[ 8][VWIDTH]; -#else -static uint64_t saved_key[VWIDTH][80] __attribute__ ((aligned(16))); -static uint64_t crypt_key[ 8][VWIDTH] __attribute__ ((aligned(16))); +static uint64_t (*saved_key)[80]; +static uint64_t *crypt_key[ 8]; + + +static void init(struct fmt_main *self) +{ + int i; +#ifdef 
_OPENMP + int omp_t; + + omp_t = omp_get_max_threads(); + self->params.min_keys_per_crypt *= omp_t; + omp_t *= OMP_SCALE; + self->params.max_keys_per_crypt *= omp_t; #endif + saved_key = mem_calloc_tiny(sizeof(*saved_key) * self->params.max_keys_per_crypt, MEM_ALIGN_SIMD); + for (i = 0; i < 8; i++) + crypt_key[i] = mem_calloc_tiny(sizeof(uint64_t) * self->params.max_keys_per_crypt, MEM_ALIGN_SIMD); +} static inline void alter_endianity_64 (void *_x, unsigned int size) @@ -306,6 +327,16 @@ static int crypt_all (int *pcount, struct db_salt *salt) static void crypt_all (int count) #endif { +#if FMT_MAIN_VERSION > 10 + int count = *pcount; +#endif + int index = 0; + +#ifdef _OPENMP +#pragma omp parallel for + for (index = 0; index < count; index += 2) +#endif + { int i; __m128i a, b, c, d, e, f, g, h; @@ -419,17 +450,18 @@ static void crypt_all (int count) g = _mm_add_epi64 (g, _mm_set1_epi64x (0x1f83d9abfb41bd6b)); h = _mm_add_epi64 (h, _mm_set1_epi64x (0x5be0cd19137e2179)); - _mm_store_si128 ((__m128i *) crypt_key[0], a); - _mm_store_si128 ((__m128i *) crypt_key[1], b); - _mm_store_si128 ((__m128i *) crypt_key[2], c); - _mm_store_si128 ((__m128i *) crypt_key[3], d); - _mm_store_si128 ((__m128i *) crypt_key[4], e); - _mm_store_si128 ((__m128i *) crypt_key[5], f); - _mm_store_si128 ((__m128i *) crypt_key[6], g); - _mm_store_si128 ((__m128i *) crypt_key[7], h); + _mm_store_si128 ((__m128i *) &crypt_key[0][index], a); + _mm_store_si128 ((__m128i *) &crypt_key[1][index], b); + _mm_store_si128 ((__m128i *) &crypt_key[2][index], c); + _mm_store_si128 ((__m128i *) &crypt_key[3][index], d); + _mm_store_si128 ((__m128i *) &crypt_key[4][index], e); + _mm_store_si128 ((__m128i *) &crypt_key[5][index], f); + _mm_store_si128 ((__m128i *) &crypt_key[6][index], g); + _mm_store_si128 ((__m128i *) &crypt_key[7][index], h); + } #if FMT_MAIN_VERSION > 10 - return *pcount; + return count; #endif } @@ -438,7 +470,11 @@ static int cmp_all (void *binary, int count) { int i; +#ifdef _OPENMP 
+ for (i=0; i < count; i++) +#else for (i=0; i < 2; i++) +#endif if (((uint64_t *) binary)[0] == crypt_key[0][i]) return 1; @@ -485,10 +521,10 @@ struct fmt_main fmt_rawSHA512_ng = { #endif MIN_KEYS_PER_CRYPT, MAX_KEYS_PER_CRYPT, - FMT_CASE | FMT_8_BIT | FMT_SPLIT_UNIFIES_CASE, + FMT_CASE | FMT_8_BIT | FMT_SPLIT_UNIFIES_CASE | FMT_OMP, tests }, { - fmt_default_init, + init, #if FMT_MAIN_VERSION > 10 fmt_default_done, fmt_default_reset, magnum
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.