diff -ruN src/Makefile src_nt/Makefile --- src/Makefile 2006-05-15 18:38:00.000000000 +0200 +++ src_nt/Makefile 2007-03-07 20:29:45.000000000 +0100 @@ -28,6 +28,7 @@ BF_fmt.o BF_std.o \ AFS_fmt.o \ LM_fmt.o \ + NT_fmt.o \ batch.o bench.o charset.o common.o compiler.o config.o cracker.o \ crc32.o external.o formats.o getopt.o idle.o inc.o john.o list.o \ loader.o logger.o math.o memory.o misc.o options.o params.o path.o \ diff -ruN src/NT_fmt.c src_nt/NT_fmt.c --- src/NT_fmt.c 1970-01-01 01:00:00.000000000 +0100 +++ src_nt/NT_fmt.c 2007-03-16 13:29:07.000000000 +0100 @@ -0,0 +1,613 @@ +/* NTLM patch for john (performance improvement) + * + * Written by Alain Espinosa in 2007 + * and placed in the public domain. + */ + +#include <string.h> /* memset, memcpy and strncmp are used below */ +#include "arch.h" +#include "misc.h" +#include "memory.h" +#include "common.h" +#include "formats.h" + +//Init values +#define INIT_A 0x67452301 +#define INIT_B 0xefcdab89 +#define INIT_C 0x98badcfe +#define INIT_D 0x10325476 /* INIT_A..INIT_D are subtracted from the decoded hash in get_binary() */ + +#define SQRT_2 0x5a827999 +#define SQRT_3 0x6ed9eba1 /* SQRT_2 and SQRT_3 are the constants added in the Round 2 and Round 3 steps */ + + +#define FORMAT_LABEL "nt" +#define FORMAT_NAME "NT MD4" + +#define BENCHMARK_COMMENT "" +#define BENCHMARK_LENGTH -1 + +#define PLAINTEXT_LENGTH 27 +#define CIPHERTEXT_LENGTH 36 /* "$NT$" (4 characters) plus 32 hex digits */ + +static struct fmt_tests tests[] = { + {"$NT$b7e4b9022cd45f275334bbdb83bb5be5", "John the Ripper"}, + {"$NT$8846f7eaee8fb117ad06bdd830b7586c", "password"}, + {"$NT$0cb6948805f797bf2a82807973b89537", "test"}, + {"$NT$31d6cfe0d16ae931b73c59d7e0c089c0", ""}, + {NULL} +}; + +#define BINARY_SIZE 16 /* get_binary() returns four words */ +#define SALT_SIZE 0 + +#if defined (NT_X86_64) + #define NT_NUM_KEYS 32 /* candidates per crypt call, used for MIN/MAX_KEYS_PER_CRYPT */ + + unsigned int nt_buffer8x[16*NT_NUM_KEYS] __attribute__ ((aligned(16))); + unsigned int output8x[4*NT_NUM_KEYS] __attribute__ ((aligned(16))); + + #define ALGORITHM_NAME "X86-64 SSE2 8x" + #define NT_CRYPT_FUN nt_crypt_all_x86_64 + extern void nt_crypt_all_x86_64(int count); +#elif defined (NT_SSE2) + #define NT_NUM_KEYS 40 + #define NT_NUM_KEYS1 8 + #define NT_NUM_KEYS4 32 /* 8 groups of 4 keys use the 4x buffers, 8 more keys use the 1x buffers */ + + unsigned int nt_buffer4x[64*NT_NUM_KEYS1] 
__attribute__ ((aligned(16))); + unsigned int output4x[16*NT_NUM_KEYS1] __attribute__ ((aligned(16))); + + unsigned int nt_buffer1x[16*NT_NUM_KEYS1]; + unsigned int output1x[4*NT_NUM_KEYS1]; + + #define ALGORITHM_NAME "X86 SSE2 5x" + #define NT_CRYPT_FUN nt_crypt_all_sse2 + extern void nt_crypt_all_sse2(int count); +#else + #define NT_NUM_KEYS 64 + unsigned int nt_buffer1x[16*NT_NUM_KEYS]; /* 16 input words per key */ + unsigned int output1x[4*NT_NUM_KEYS]; /* 4 result words per key */ + + #define ALGORITHM_NAME "Generic 1x" + #define NT_CRYPT_FUN nt_crypt_all_generic + static void nt_crypt_all_generic(int count) + { /* Portable C fallback: turns each 16-word block of nt_buffer1x into the four words a, b, c, d stored in output1x. */ + unsigned int a; + unsigned int b; + unsigned int c; + unsigned int d; + unsigned int i=0; + /* NOTE(review): the loop header and the first Round 1 step are truncated on the next line ("for(;i>29);"); the text between a less-than sign and the following greater-than sign appears to have been lost in extraction. Restore it from the upstream patch before applying. */ + for(;i>29); + d = INIT_D+(INIT_C ^ (a & 0x77777777)) +nt_buffer1x[i*16+1];d=(d<<7 )|(d>>25); /* 0x77777777 equals INIT_B ^ INIT_C, folded in by hand */ + c = INIT_C+(INIT_B ^ (d & (a ^ INIT_B))) +nt_buffer1x[i*16+2];c=(c<<11)|(c>>21); + b = INIT_B + (a ^ (c & (d ^ a))) +nt_buffer1x[i*16+3];b=(b<<19)|(b>>13); + + a += (d ^ (b & (c ^ d))) + nt_buffer1x[i*16+4] ;a = (a << 3 ) | (a >> 29); + d += (c ^ (a & (b ^ c))) + nt_buffer1x[i*16+5] ;d = (d << 7 ) | (d >> 25); + c += (b ^ (d & (a ^ b))) + nt_buffer1x[i*16+6] ;c = (c << 11) | (c >> 21); + b += (a ^ (c & (d ^ a))) + nt_buffer1x[i*16+7] ;b = (b << 19) | (b >> 13); + + a += (d ^ (b & (c ^ d))) + nt_buffer1x[i*16+8] ;a = (a << 3 ) | (a >> 29); + d += (c ^ (a & (b ^ c))) + nt_buffer1x[i*16+9] ;d = (d << 7 ) | (d >> 25); + c += (b ^ (d & (a ^ b))) + nt_buffer1x[i*16+10] ;c = (c << 11) | (c >> 21); + b += (a ^ (c & (d ^ a))) + nt_buffer1x[i*16+11] ;b = (b << 19) | (b >> 13); + + a += (d ^ (b & (c ^ d))) + nt_buffer1x[i*16+12] ;a = (a << 3 ) | (a >> 29); + d += (c ^ (a & (b ^ c))) + nt_buffer1x[i*16+13] ;d = (d << 7 ) | (d >> 25); + c += (b ^ (d & (a ^ b))) + nt_buffer1x[i*16+14] ;c = (c << 11) | (c >> 21); + b += (a ^ (c & (d ^ a)));b = (b << 19) | (b >> 13); /* no buffer word is added here: word 15 is never read by this function */ + + /* Round 2 */ + a += ((b & (c | d)) | (c & d))+nt_buffer1x[i*16+0] +SQRT_2;a = (a<<3 ) | (a>>29); + d += ((a & (b | c)) | (b & c))+nt_buffer1x[i*16+4] +SQRT_2;d = (d<<5 
) | (d>>27); + c += ((d & (a | b)) | (a & b))+nt_buffer1x[i*16+8] +SQRT_2;c = (c<<9 ) | (c>>23); + b += ((c & (d | a)) | (d & a))+nt_buffer1x[i*16+12]+SQRT_2;b = (b<<13) | (b>>19); /* Round 2 walks the words in the order 0,4,8,12, 1,5,9,13, ... with rotations 3, 5, 9, 13 */ + + a += ((b & (c | d)) | (c & d))+nt_buffer1x[i*16+1] +SQRT_2;a = (a<<3 ) | (a>>29); + d += ((a & (b | c)) | (b & c))+nt_buffer1x[i*16+5] +SQRT_2;d = (d<<5 ) | (d>>27); + c += ((d & (a | b)) | (a & b))+nt_buffer1x[i*16+9] +SQRT_2;c = (c<<9 ) | (c>>23); + b += ((c & (d | a)) | (d & a))+nt_buffer1x[i*16+13]+SQRT_2;b = (b<<13) | (b>>19); + + a += ((b & (c | d)) | (c & d))+nt_buffer1x[i*16+2] +SQRT_2;a = (a<<3 ) | (a>>29); + d += ((a & (b | c)) | (b & c))+nt_buffer1x[i*16+6] +SQRT_2;d = (d<<5 ) | (d>>27); + c += ((d & (a | b)) | (a & b))+nt_buffer1x[i*16+10]+SQRT_2;c = (c<<9 ) | (c>>23); + b += ((c & (d | a)) | (d & a))+nt_buffer1x[i*16+14]+SQRT_2;b = (b<<13) | (b>>19); + + a += ((b & (c | d)) | (c & d))+nt_buffer1x[i*16+3] +SQRT_2;a = (a<<3 ) | (a>>29); + d += ((a & (b | c)) | (b & c))+nt_buffer1x[i*16+7] +SQRT_2;d = (d<<5 ) | (d>>27); + c += ((d & (a | b)) | (a & b))+nt_buffer1x[i*16+11]+SQRT_2;c = (c<<9 ) | (c>>23); + b += ((c & (d | a)) | (d & a)) +SQRT_2;b = (b<<13) | (b>>19); /* no buffer word is added in this step: word 15 is never read by this function */ + + /* Round 3 */ + a += (d ^ c ^ b) + nt_buffer1x[i*16+0] + SQRT_3; a = (a << 3 ) | (a >> 29); + d += (c ^ b ^ a) + nt_buffer1x[i*16+8] + SQRT_3; d = (d << 9 ) | (d >> 23); + c += (b ^ a ^ d) + nt_buffer1x[i*16+4] + SQRT_3; c = (c << 11) | (c >> 21); + b += (a ^ d ^ c) + nt_buffer1x[i*16+12] + SQRT_3; b = (b << 15) | (b >> 17); /* Round 3 uses rotations 3, 9, 11, 15 */ + + a += (d ^ c ^ b) + nt_buffer1x[i*16+2] + SQRT_3; a = (a << 3 ) | (a >> 29); + d += (c ^ b ^ a) + nt_buffer1x[i*16+10] + SQRT_3; d = (d << 9 ) | (d >> 23); + c += (b ^ a ^ d) + nt_buffer1x[i*16+6] + SQRT_3; c = (c << 11) | (c >> 21); + b += (a ^ d ^ c) + nt_buffer1x[i*16+14] + SQRT_3; b = (b << 15) | (b >> 17); + + a += (d ^ c ^ b) + nt_buffer1x[i*16+1] + SQRT_3; a = (a << 3 ) | (a >> 29); + d += (c ^ b ^ a) + nt_buffer1x[i*16+9] + SQRT_3; d = (d << 9 ) | (d >> 23); + c += (b ^ a ^ d) 
+ nt_buffer1x[i*16+5] + SQRT_3; c = (c << 11) | (c >> 21); + b += (a ^ d ^ c) + nt_buffer1x[i*16+13]; /* this b step is left unfinished on purpose: no SQRT_3 and no rotate; get_binary() rewinds the target to this point and cmp_one() finishes the remaining steps */ + + output1x[4*i+0]=a; + output1x[4*i+1]=b; + output1x[4*i+2]=c; + output1x[4*i+3]=d; + } + } +#endif + +unsigned int last_i[NT_NUM_KEYS]; +char saved_plain[32*NT_NUM_KEYS]; /* one 32-byte slot per key, see get_key() */ + +#define MIN_KEYS_PER_CRYPT NT_NUM_KEYS +#define MAX_KEYS_PER_CRYPT NT_NUM_KEYS + /* fmt_NT_init: zeroes last_i and the input buffers of the active build variant. */ +static void fmt_NT_init(void) +{ + memset(last_i,0,4*NT_NUM_KEYS); /* byte counts here are written as 4 * element count, i.e. they assume a 4-byte unsigned int */ +#if defined(NT_X86_64) + memset(nt_buffer8x,0,16*4*NT_NUM_KEYS); +#elif defined(NT_SSE2) + memset(nt_buffer4x,0,64*4*NT_NUM_KEYS1); + memset(nt_buffer1x,0,16*4*NT_NUM_KEYS1); +#else + memset(nt_buffer1x,0,16*4*NT_NUM_KEYS); +#endif +} + /* nt_split: returns a static buffer holding "$NT$" followed by the 32 hash characters lower-cased; a leading "$NT$" in the input is skipped first. The index argument is not used. */ +static char * nt_split(char *ciphertext, int index) +{ + static char out[37]; + + if (!strncmp(ciphertext, "$NT$", 4)) + ciphertext += 4; + + out[0] = '$'; + out[1] = 'N'; + out[2] = 'T'; + out[3] = '$'; + + memcpy(&out[4], ciphertext, 32); /* copies 32 bytes unconditionally; presumably valid() has already checked the length -- confirm against the caller */ + out[36] = 0; + + strlwr(&out[4]); + + return out; +} + /* valid: returns 1 only for "$NT$" followed by characters accepted by atoi16 up to the terminating NUL, with a total length of CIPHERTEXT_LENGTH; 0 otherwise. */ +static int valid(char *ciphertext) +{ + char *pos; + + if (strncmp(ciphertext, "$NT$", 4)!=0) return 0; + + for (pos = &ciphertext[4]; atoi16[ARCH_INDEX(*pos)] != 0x7F; pos++); /* stops at the first character whose atoi16 entry is 0x7F (presumably the non-hex marker) */ + + if (!*pos && pos - ciphertext == CIPHERTEXT_LENGTH) + return 1; + else + return 0; + +} + /* get_binary: decodes the 32 hex digits after "$NT$" into four words held in a static buffer, then rewinds them to the point where the crypt routines stop. */ +static void *get_binary(char *ciphertext) +{ + static unsigned int out[4]; + unsigned int i=0; + unsigned int temp; + + ciphertext+=4; + for (; i<4; i++) + { + temp = (atoi16[ARCH_INDEX(ciphertext[i*8+0])])<<4; /* each pair of digits is one byte; the first byte of a group lands in the low 8 bits of the word */ + temp |= (atoi16[ARCH_INDEX(ciphertext[i*8+1])]); + + temp |= (atoi16[ARCH_INDEX(ciphertext[i*8+2])])<<12; + temp |= (atoi16[ARCH_INDEX(ciphertext[i*8+3])])<<8; + + temp |= (atoi16[ARCH_INDEX(ciphertext[i*8+4])])<<20; + temp |= (atoi16[ARCH_INDEX(ciphertext[i*8+5])])<<16; + + temp |= (atoi16[ARCH_INDEX(ciphertext[i*8+6])])<<28; + temp |= (atoi16[ARCH_INDEX(ciphertext[i*8+7])])<<24; + + out[i]=temp; + } + + out[0] -= INIT_A; + out[1] -= INIT_B; + out[2] -= INIT_C; + out[3] -= INIT_D; + /* undo the last b step (rotate right by 15, then subtract), then the rotate and constant of the b step before it */ + out[1] = (out[1] >> 15) | (out[1] << 17); + out[1] -= 
SQRT_3 + (out[2] ^ out[3] ^ out[0]); + out[1] = (out[1] >> 15) | (out[1] << 17); + out[1] -= SQRT_3; /* out[1] is now directly comparable with the b word stored by the crypt routines */ + + return out; +} + /* binary_hash_0/1/2: low 4, 8 and 12 bits of word 1 (the b word) of a decoded hash. */ +static int binary_hash_0(void *binary) +{ + return ((unsigned int *)binary)[1] & 0x0F; +} + +static int binary_hash_1(void *binary) +{ + return ((unsigned int *)binary)[1] & 0xFF; +} + +static int binary_hash_2(void *binary) +{ + return ((unsigned int *)binary)[1] & 0x0FFF; +} + /* get_hash_0/1/2: the same masks applied to the computed b word of candidate number index, located according to the output layout of the active build variant. */ +static int get_hash_0(int index) +{ +#if defined(NT_X86_64) + return output8x[32*(index>>3)+8+index%8] & 0x0F; /* 32 output words per group of 8 keys; the b words start at offset 8 */ +#elif defined(NT_SSE2) /* NOTE(review): the if-branch on the next line is truncated ("if(index>2)+4+..."); the same damage repeats in get_hash_1 and get_hash_2. Restore from the upstream patch. */ + if(index>2)+4+index%4] & 0x0F; + else + return output1x[(index-NT_NUM_KEYS4)*4+1] & 0x0F; +#else + return output1x[(index<<2)+1] & 0x0F; +#endif +} + +static int get_hash_1(int index) +{ +#if defined(NT_X86_64) + return output8x[32*(index>>3)+8+index%8] & 0xFF; +#elif defined(NT_SSE2) + if(index>2)+4+index%4] & 0xFF; + else + return output1x[(index-NT_NUM_KEYS4)*4+1] & 0xFF; +#else + return output1x[(index<<2)+1] & 0xFF; +#endif +} + +static int get_hash_2(int index) +{ +#if defined(NT_X86_64) + return output8x[32*(index>>3)+8+index%8] & 0x0FFF; +#elif defined(NT_SSE2) + if(index>2)+4+index%4] & 0x0FFF; + else + return output1x[(index-NT_NUM_KEYS4)*4+1] & 0x0FFF; +#else + return output1x[(index<<2)+1] & 0x0FFF; +#endif +} + /* cmp_all: quick filter that compares word 1 of binary against stored output words and returns 1 on the first equal one. The count argument is not used in the visible part. */ +static int cmp_all(void *binary, int count) +{ + unsigned int i=0; + unsigned int b=((unsigned int *)binary)[1]; + +#if defined(NT_X86_64) + for(;i<(NT_NUM_KEYS/2);i+=4) + if(b==output8x[i] || b==output8x[i+1] || b==output8x[i+2] || b==output8x[i+3] || b==output8x[i+4] || b==output8x[i+5] || b==output8x[i+6] || b==output8x[i+7]) + return 1; /* NOTE(review): with NT_NUM_KEYS 32 this loop reads output8x[0..19] only, while get_hash_0() finds the b word of a key at 32*(index>>3)+8+index%8 -- confirm that the groups after the first are really covered */ +#elif defined(NT_SSE2) + unsigned int pos=4; + /* NOTE(review): a large span is missing after "for(;i" on the next line: the rest of cmp_all and the head of cmp_one (declarations of t, a, b, c, d, buffer, pos1..pos3) are not present. Restore from the upstream patch. */ + for(;i>3)+index%8; + + a=output8x[temp]; + b=output8x[temp+8]; + c=output8x[temp+16]; + d=output8x[temp+24]; + + pos1=24+index%8+128*(index>>3); /* 128 input words per group of 8 keys, 8 per word position: pos1 is word 3 of this key */ + pos2=64+pos1; + pos3=32+pos1; +#elif defined(NT_SSE2) + int temp; + + if(index>2)+index%4; + + a=output4x[temp]; + b=output4x[temp+4]; + c=output4x[temp+8]; + d=output4x[temp+12]; + + 
pos1=12+index%4+64*(index>>2); + pos2=32+pos1; + pos3=16+pos1; /* pos1, pos2 and pos3 locate words 3, 11 and 7 of this key in the interleaved 4x buffer */ + } + else + { + buffer=nt_buffer1x; + + temp=4*(index-NT_NUM_KEYS4); + + a=output1x[temp]; + b=output1x[temp+1]; + c=output1x[temp+2]; + d=output1x[temp+3]; + + pos1=3+4*temp; + pos2=8+pos1; + pos3=4+pos1; + } +#else + buffer=nt_buffer1x; + + a=output1x[(index<<2)]; + b=output1x[(index<<2)+1]; + c=output1x[(index<<2)+2]; + d=output1x[(index<<2)+3]; + + pos1=(index<<4)+3; /* words 3, 11 (pos2) and 7 (pos3) of this key */ + pos2=8+pos1; + pos3=4+pos1; +#endif /* from here on: compare the stored b word as it is, then finish the remaining Round 3 steps one at a time and return 0 at the first mismatch */ + if(b!=t[1]) + return 0; + b += SQRT_3;b = (b << 15) | (b >> 17); /* completes the b step that the crypt routines left unfinished */ + + a += (b ^ c ^ d) + buffer[pos1] + SQRT_3; a = (a << 3 ) | (a >> 29); + if(a!=t[0]) + return 0; + + d += (a ^ b ^ c) + buffer[pos2] + SQRT_3; d = (d << 9 ) | (d >> 23); + if(d!=t[3]) + return 0; + + c += (d ^ a ^ b) + buffer[pos3] + SQRT_3; c = (c << 11) | (c >> 21); + return c==t[2]; +} + +static int cmp_exact(char *source, int index) +{ + return 1; /* unconditional match: cmp_one() has already compared all four words */ +} + +static void set_salt(void *salt) +{ /* nothing to do: SALT_SIZE is 0 */ +} + /* set_key: stores candidate number index into the input buffer of the active build variant. NOTE(review): most of this function is truncated in this copy (see the broken "for(;key[md4_size] && md4_size>..." fragments); restore from the upstream patch. */ +static void set_key(char *key, int index) +{ + unsigned int i=0; + unsigned int md4_size=0; + unsigned int saved_base=index<<5; + unsigned int temp; + int buff_base; +#if defined(NT_X86_64) + unsigned int last_lenght=last_i[index]<<2; + + buff_base=128*(index>>3)+index%8; + + for(;key[md4_size] && md4_size>2)+index%4; + + for(;key[md4_size] && md4_size>=1; + + for(;i<=last_lenght;i++) + nt_buffer1x[i+buff_base]=0; + + last_i[index]=md4_size>>1; + + nt_buffer1x[14+buff_base] = md4_size << 4; /* word 14 receives md4_size * 16, presumably the message length in bits at 16 bits per character -- confirm */ + } +#else + buff_base=index<<4; + + for(;key[md4_size] && md4_size>1; + + nt_buffer1x[buff_base+14] = md4_size << 4; +#endif +} + +static char *get_key(int index) +{ + return saved_plain+(index<<5); /* 32-byte slot per key, matching saved_plain[32*NT_NUM_KEYS] */ +} + /* Format descriptor; john.c registers it with john_register_one(&fmt_NT). */ +struct fmt_main fmt_NT = { + { + FORMAT_LABEL, + FORMAT_NAME, + ALGORITHM_NAME, + BENCHMARK_COMMENT, + BENCHMARK_LENGTH, + PLAINTEXT_LENGTH, + BINARY_SIZE, + SALT_SIZE, + MIN_KEYS_PER_CRYPT, + MAX_KEYS_PER_CRYPT, + FMT_CASE | FMT_8_BIT | FMT_SPLIT_UNIFIES_CASE, + tests + }, { + fmt_NT_init, + valid, + nt_split, + get_binary, + fmt_default_salt, + { 
+ binary_hash_0, + binary_hash_1, + binary_hash_2 + }, + fmt_default_salt_hash, + set_salt, + set_key, + get_key, + fmt_default_clear_keys, + NT_CRYPT_FUN, /* crypt routine selected per build variant by the NT_X86_64 / NT_SSE2 conditionals */ + { + get_hash_0, + get_hash_1, + get_hash_2 + }, + cmp_all, + cmp_one, + cmp_exact + } +}; diff -ruN src/john.c src_nt/john.c --- src/john.c 2006-05-08 16:48:48.000000000 +0200 +++ src_nt/john.c 2007-02-21 22:49:51.000000000 +0100 @@ -38,6 +38,7 @@ extern struct fmt_main fmt_DES, fmt_BSDI, fmt_MD5, fmt_BF; extern struct fmt_main fmt_AFS, fmt_LM; +extern struct fmt_main fmt_NT; extern int unshadow(int argc, char **argv); extern int unafs(int argc, char **argv); @@ -64,6 +65,8 @@ john_register_one(&fmt_BF); john_register_one(&fmt_AFS); john_register_one(&fmt_LM); + john_register_one(&fmt_NT); + if (!fmt_list) { fprintf(stderr, "Unknown ciphertext format name requested\n"); diff -ruN src/loader.c src_nt/loader.c --- src/loader.c 2005-11-08 14:03:20.000000000 +0100 +++ src_nt/loader.c 2007-02-21 22:49:51.000000000 +0100 @@ -18,6 +18,7 @@ #include "signals.h" #include "formats.h" #include "loader.h" +#include "options.h" /* * Flags for read_file(). @@ -183,7 +184,7 @@ static int ldr_split_line(char **login, char **ciphertext, char **gecos, char **home, char *source, struct fmt_main **format, - struct db_options *options, char *line) + struct db_options *db_options, char *line) { char *uid = NULL, *gid = NULL, *shell = NULL; char *tmp; @@ -205,11 +206,32 @@ if (!strncmp(*ciphertext, "NO PASSWORD", 11)) *ciphertext = ""; + + /* NT loader hack starts here ! */ + /* When the NT format is requested, the next field of the line replaces the ciphertext picked above. */ + if (options.format && (strncmp(options.format, "nt", 2)==0)) { /* prefix match: any requested format name starting with "nt" takes this path */ + + tmp = ldr_get_field(&line); + *ciphertext = tmp; + + if (!strncmp(*ciphertext, "NO PASSWORD", 11)) + *ciphertext = ""; + else if(strlen(*ciphertext) == 32) { + *ciphertext -= 4; /* NOTE(review): steps back 4 bytes in front of the field so that "$NT$" can be written there; this overwrites whatever precedes the field, presumably the tail of the previous field in the same line buffer -- confirm that nothing reads it afterwards */ + strncpy(*ciphertext,"$NT$",4); /* copies exactly 4 bytes, so no terminator is written into the hash digits */ + } + else { + return 0; /* any other field length rejects the whole line */ + } + + } + + /* NT loader hack ends here ! 
*/ if (source) sprintf(source, "%s:%s", uid, line); } - if (options->flags & DB_WORDS || options->shells->head) { + if (db_options->flags & DB_WORDS || db_options->shells->head) { /* the parameter is renamed to db_options so that the name options can refer to the object used as options.format by the NT hack */ gid = ldr_get_field(&line); do { *gecos = ldr_get_field(&line); @@ -218,13 +240,13 @@ } while (!**gecos && !strcmp(*home, "0") && !strcmp(shell, "0")); } else - if (options->groups->head) { + if (db_options->groups->head) { gid = ldr_get_field(&line); } - if (ldr_check_list(options->users, *login, uid)) return 0; - if (ldr_check_list(options->groups, gid, gid)) return 0; - if (ldr_check_shells(options->shells, shell)) return 0; + if (ldr_check_list(db_options->users, *login, uid)) return 0; + if (ldr_check_list(db_options->groups, gid, gid)) return 0; + if (ldr_check_shells(db_options->shells, shell)) return 0; if (*format) return (*format)->methods.valid(*ciphertext); diff -ruN src/options.c src_nt/options.c --- src/options.c 2006-01-09 15:35:00.000000000 +0100 +++ src_nt/options.c 2007-02-21 22:49:52.000000000 +0100 @@ -60,7 +60,7 @@ {"salts", FLG_SALTS, FLG_SALTS, FLG_PASSWD, OPT_REQ_PARAM, "%d", &options.loader.min_pps}, {"format", FLG_FORMAT, FLG_FORMAT, - FLG_CRACKING_SUP, + 0, /* NOTE(review): FLG_CRACKING_SUP is replaced by 0 in the "format" entry, presumably so that the option is accepted in more modes -- confirm against the option table layout */ FLG_MAKECHR_CHK | FLG_STDOUT | OPT_REQ_PARAM, OPT_FMT_STR_ALLOC, &options.format}, {"save-memory", FLG_SAVEMEM, FLG_SAVEMEM, 0, OPT_REQ_PARAM, @@ -101,7 +101,7 @@ "--salts=[-]COUNT load salts with[out] at least COUNT passwords " \ "only\n" \ "--format=NAME force ciphertext format NAME: " \ - "DES/BSDI/MD5/BF/AFS/LM\n" \ + "DES/BSDI/MD5/BF/AFS/LM/NT\n" \ "--save-memory=LEVEL enable memory saving, at LEVEL 1..3\n" void opt_init(char *name, int argc, char **argv) diff -ruN src/x86-64.S src_nt/x86-64.S --- src/x86-64.S 2006-05-21 03:28:10.000000000 +0200 +++ src_nt/x86-64.S 2007-03-16 13:39:20.000000000 +0100 @@ -13,6 +13,9 @@ #define DES_bs_crypt _DES_bs_crypt #define DES_bs_crypt_25 _DES_bs_crypt_25 #define DES_bs_crypt_LM _DES_bs_crypt_LM +#define nt_buffer8x _nt_buffer8x /* underscore-prefixed aliases for the NT symbols, same scheme as the DES_bs_* entries */ +#define output8x _output8x +#define 
nt_crypt_all_x86_64 _nt_crypt_all_x86_64 #endif #ifdef ALIGN_LOG @@ -1040,3 +1043,232 @@ subl $1,rounds jnz DES_bs_crypt_LM_loop ret + + +/* +extern nt_crypt_all_x86_64(int count); +*/ + +.globl nt_crypt_all_x86_64 + /* Each constant below is repeated four times so that it fills one 128-bit register. */ +.data +.align(16) +const_init_a: /* 0xFFFFFFFF = 0x67452301 + F(init_b, init_c, init_d) = 0x67452301 + 0x98badcfe, so the first step only has to add word 0 */ +.long 0xFFFFFFFF +.long 0xFFFFFFFF +.long 0xFFFFFFFF +.long 0xFFFFFFFF +const_init_b: +.long 0xefcdab89 +.long 0xefcdab89 +.long 0xefcdab89 +.long 0xefcdab89 +const_init_c: +.long 0x98badcfe +.long 0x98badcfe +.long 0x98badcfe +.long 0x98badcfe +const_init_d: +.long 0x10325476 +.long 0x10325476 +.long 0x10325476 +.long 0x10325476 + +const_stage2: +.long 0x5a827999 +.long 0x5a827999 +.long 0x5a827999 +.long 0x5a827999 +const_stage3: +.long 0x6ed9eba1 +.long 0x6ed9eba1 +.long 0x6ed9eba1 +.long 0x6ed9eba1 + +#define a %xmm0 +#define b %xmm1 +#define c %xmm2 +#define d %xmm3 +#define t1 %xmm4 +#define t2 %xmm5 +#define t3 %xmm6 +#define t4 %xmm7 + /* xmm8-xmm13 mirror a, b, c, d, t1 and t2 for a second vector of four keys handled inside the same macro */ +#define a3 %xmm8 +#define b3 %xmm9 +#define c3 %xmm10 +#define d3 %xmm11 +#define t13 %xmm12 +#define t23 %xmm13 + +/* +#define F(x, y, z) (z ^ (x & (y ^ z))) +#define G(x, y, z) ((x & (y | z)) | (y & z)) +#define H(x, y, z) (x ^ y ^ z) + +#define STEP(f, a, b, c, d, x, s) + a += f(b, c, d) + x; + a = (a << s) | (a >> (32 - s)); +*/ +#define STEP1(aa, bb, cc, dd, aa3, bb3, cc3, dd3, x, s, base) /* Round 1 step on two vectors at once: aa += word x + F(bb, cc, dd), then rotate left by s */ \ + paddd (512*base)+(x*32)+nt_buffer8x, aa; /* 512 bytes per group of 8 keys, 32 bytes per word position */ \ + paddd (512*base)+(x*32)+16+nt_buffer8x, aa3; \ + movdqa cc, t1; \ + movdqa cc3, t13; \ + pxor dd, t1; \ + pxor dd3, t13; \ + pand bb, t1; \ + pand bb3, t13; \ + pxor dd, t1; \ + pxor dd3, t13; \ + paddd t1, aa; \ + paddd t13, aa3; \ + movdqa aa, t2; /* rotate built from a left shift, a right shift of the copy and an OR */ \ + movdqa aa3, t23; \ + pslld $s, aa; \ + pslld $s, aa3; \ + psrld $(32-s), t2; \ + psrld $(32-s), t23; \ + por t2, aa; \ + por t23, aa3; + +#define STEP2(aa, bb, cc, dd, aa3, bb3, cc3, dd3, x, s, base) /* Round 2 step: aa += word x + G(bb, cc, dd) + t3, then rotate left by s */ \ + paddd (512*base)+(x*32)+nt_buffer8x, aa; \ + paddd (512*base)+(x*32)+16+nt_buffer8x, aa3; \ + movdqa cc, t1; \ + movdqa cc3, t13; \ + movdqa cc, t2; \ + movdqa cc3, t23; \ + por dd, t1; \ + 
por dd3, t13; \ + pand dd, t2; \ + pand dd3, t23; \ + pand bb, t1; \ + pand bb3, t13; \ + paddd t3, aa; /* t3 holds const_stage2, loaded once at the top of nt_crypt_all_x86_64 */ \ + paddd t3, aa3; \ + por t2, t1; \ + por t23, t13; \ + paddd t1, aa; \ + paddd t13, aa3; \ + movdqa aa, t1; \ + movdqa aa3, t13; \ + pslld $s, aa; \ + pslld $s, aa3; \ + psrld $(32-s), t1; \ + psrld $(32-s), t13; \ + por t1, aa; \ + por t13, aa3; + +#define STEP3(aa, bb, cc, dd, aa3, bb3, cc3, dd3, x, s, base) /* Round 3 step: aa += word x + (bb ^ cc ^ dd) + t4, then rotate left by s */ \ + paddd (512*base)+(x*32)+nt_buffer8x, aa; \ + paddd (512*base)+(x*32)+16+nt_buffer8x, aa3; \ + movdqa dd, t1; \ + movdqa dd3, t13; \ + pxor cc, t1; \ + pxor cc3, t13; \ + paddd t4, aa; /* t4 holds const_stage3 */ \ + paddd t4, aa3; \ + pxor bb, t1; \ + pxor bb3, t13; \ + paddd t1, aa; \ + paddd t13, aa3; \ + movdqa aa, t1; \ + movdqa aa3, t13; \ + pslld $s, aa; \ + pslld $s, aa3; \ + psrld $(32-s), t1; \ + psrld $(32-s), t13; \ + por t1, aa; \ + por t13, aa3; + +#define NT_CRYPT_BODY(base) /* Hashes the 8 keys of input group base and stores their a, b, c, d words into output8x. */ \ + movdqa const_init_a, a; \ + movdqa const_init_a, a3; \ + movdqa const_init_b, b; \ + movdqa const_init_b, b3; \ + movdqa const_init_c, c; \ + movdqa const_init_c, c3; \ + movdqa const_init_d, d; \ + movdqa const_init_d, d3; \ + \ + paddd (512*base)+nt_buffer8x, a; \ + paddd (512*base)+16+nt_buffer8x, a3; \ + pslld $3, a; /* NOTE(review): plain shift, not a rotate; this matches a rotate only while the top three bits of (0xFFFFFFFF + word 0) are zero -- confirm against what set_key stores in word 0 */ \ + pslld $3, a3; \ + \ + STEP1(d, a, b, c, d3, a3, b3, c3, 1 , 7 , base) \ + STEP1(c, d, a, b, c3, d3, a3, b3, 2 , 11, base) \ + STEP1(b, c, d, a, b3, c3, d3, a3, 3 , 19, base) \ + STEP1(a, b, c, d, a3, b3, c3, d3, 4 , 3 , base) \ + STEP1(d, a, b, c, d3, a3, b3, c3, 5 , 7 , base) \ + STEP1(c, d, a, b, c3, d3, a3, b3, 6 , 11, base) \ + STEP1(b, c, d, a, b3, c3, d3, a3, 7 , 19, base) \ + STEP1(a, b, c, d, a3, b3, c3, d3, 8 , 3 , base) \ + STEP1(d, a, b, c, d3, a3, b3, c3, 9 , 7 , base) \ + STEP1(c, d, a, b, c3, d3, a3, b3, 10, 11, base) \ + STEP1(b, c, d, a, b3, c3, d3, a3, 11, 19, base) \ + STEP1(a, b, c, d, a3, b3, c3, d3, 12, 3 , base) \ + STEP1(d, a, b, c, d3, a3, b3, c3, 13, 7 , base) \ + STEP1(c, d, a, b, c3, d3, a3, b3, 14, 11, base) \ + STEP1(b, c, d, a, b3, c3, d3, 
a3, 15, 19, base) \ + /* Round 2: G steps, words taken in the order 0,4,8,12, 1,5,9,13, ... */ \ + STEP2(a, b, c, d, a3, b3, c3, d3, 0 , 3 , base) \ + STEP2(d, a, b, c, d3, a3, b3, c3, 4 , 5 , base) \ + STEP2(c, d, a, b, c3, d3, a3, b3, 8 , 9 , base) \ + STEP2(b, c, d, a, b3, c3, d3, a3, 12, 13, base) \ + STEP2(a, b, c, d, a3, b3, c3, d3, 1 , 3 , base) \ + STEP2(d, a, b, c, d3, a3, b3, c3, 5 , 5 , base) \ + STEP2(c, d, a, b, c3, d3, a3, b3, 9 , 9 , base) \ + STEP2(b, c, d, a, b3, c3, d3, a3, 13, 13, base) \ + STEP2(a, b, c, d, a3, b3, c3, d3, 2 , 3 , base) \ + STEP2(d, a, b, c, d3, a3, b3, c3, 6 , 5 , base) \ + STEP2(c, d, a, b, c3, d3, a3, b3, 10, 9 , base) \ + STEP2(b, c, d, a, b3, c3, d3, a3, 14, 13, base) \ + STEP2(a, b, c, d, a3, b3, c3, d3, 3 , 3 , base) \ + STEP2(d, a, b, c, d3, a3, b3, c3, 7 , 5 , base) \ + STEP2(c, d, a, b, c3, d3, a3, b3, 11, 9 , base) \ + STEP2(b, c, d, a, b3, c3, d3, a3, 15, 13, base) \ + /* Round 3: H steps; only the first eleven are complete */ \ + STEP3(a, b, c, d, a3, b3, c3, d3, 0 , 3 , base) \ + STEP3(d, a, b, c, d3, a3, b3, c3, 8 , 9 , base) \ + STEP3(c, d, a, b, c3, d3, a3, b3, 4 , 11, base) \ + STEP3(b, c, d, a, b3, c3, d3, a3, 12, 15, base) \ + STEP3(a, b, c, d, a3, b3, c3, d3, 2 , 3 , base) \ + STEP3(d, a, b, c, d3, a3, b3, c3, 10, 9 , base) \ + STEP3(c, d, a, b, c3, d3, a3, b3, 6 , 11, base) \ + STEP3(b, c, d, a, b3, c3, d3, a3, 14, 15, base) \ + STEP3(a, b, c, d, a3, b3, c3, d3, 1 , 3 , base) \ + STEP3(d, a, b, c, d3, a3, b3, c3, 9 , 9 , base) \ + STEP3(c, d, a, b, c3, d3, a3, b3, 5 , 11, base) \ + movdqa a, t1; /* partial last step: b += word 13 + (a ^ d ^ c), with no constant and no rotate; 416 = 13*32 */ \ + movdqa a3, t13; \ + paddd (512*base)+416+nt_buffer8x, b; \ + paddd (512*base)+416+16+nt_buffer8x, b3; \ + pxor d, t1; \ + pxor d3,t13; \ + pxor c, t1; \ + pxor c3,t13; \ + paddd t1, b; \ + paddd t13,b3; \ + /* output layout per group: 8 a words, 8 b words, 8 c words, 8 d words (128 bytes) */ \ + movdqa a, (128*base)+output8x; \ + movdqa a3, (128*base)+16+output8x; \ + movdqa b, (128*base)+32+output8x; \ + movdqa b3, (128*base)+32+16+output8x; \ + movdqa c, (128*base)+64+output8x; \ + movdqa c3, (128*base)+64+16+output8x; \ + movdqa d, (128*base)+96+output8x; \ + movdqa d3, (128*base)+96+16+output8x; + +.text /* the constants above were placed after .data; switch back so that the routine is assembled into the code section instead of the data section */ +nt_crypt_all_x86_64: 
+ movdqa const_stage2, t3 /* t3 and t4 keep the Round 2 and Round 3 constants for STEP2 and STEP3 */ + movdqa const_stage3, t4 + + NT_CRYPT_BODY(0) + NT_CRYPT_BODY(1) + NT_CRYPT_BODY(2) + NT_CRYPT_BODY(3) /* four groups of 8 keys; the count argument is never read */ + + ret diff -ruN src/x86-64.h src_nt/x86-64.h --- src/x86-64.h 2006-05-15 18:38:00.000000000 +0200 +++ src_nt/x86-64.h 2007-03-15 16:24:26.000000000 +0100 @@ -45,4 +45,6 @@ #define BF_ASM 0 #define BF_SCALE 1 +#define NT_X86_64 /* enables the NT_X86_64 branches of NT_fmt.c */ + #endif diff -ruN src/x86-sse.S src_nt/x86-sse.S --- src/x86-sse.S 2006-05-10 07:50:52.000000000 +0200 +++ src_nt/x86-sse.S 2007-03-16 13:37:30.000000000 +0100 @@ -15,6 +15,11 @@ #define DES_bs_crypt _DES_bs_crypt #define DES_bs_crypt_25 _DES_bs_crypt_25 #define DES_bs_crypt_LM _DES_bs_crypt_LM +#define nt_crypt_all_sse2 _nt_crypt_all_sse2 /* underscore-prefixed aliases, same scheme as the DES_bs_* entries */ +#define nt_buffer4x _nt_buffer4x +#define nt_buffer1x _nt_buffer1x +#define output4x _output4x +#define output1x _output1x #endif /* @@ -1289,3 +1294,233 @@ jnz DES_bs_crypt_LM_loop popl %esi ret + + + +/* +extern nt_crypt_all_sse2(int count); +*/ + +.globl nt_crypt_all_sse2 + /* Each constant below is repeated four times so that it fills one 128-bit register. */ +.data +.align(16) +const_init_a: /* 0xFFFFFFFF = 0x67452301 + 0x98badcfe: the constant part of the first step, computed ahead of time */ +.long 0xFFFFFFFF +.long 0xFFFFFFFF +.long 0xFFFFFFFF +.long 0xFFFFFFFF +const_init_b: +.long 0xefcdab89 +.long 0xefcdab89 +.long 0xefcdab89 +.long 0xefcdab89 +const_init_c: +.long 0x98badcfe +.long 0x98badcfe +.long 0x98badcfe +.long 0x98badcfe +const_init_d: +.long 0x10325476 +.long 0x10325476 +.long 0x10325476 +.long 0x10325476 + +const_stage2: +.long 0x5a827999 +.long 0x5a827999 +.long 0x5a827999 +.long 0x5a827999 +const_stage3: +.long 0x6ed9eba1 +.long 0x6ed9eba1 +.long 0x6ed9eba1 +.long 0x6ed9eba1 + +#define a %xmm0 +#define b %xmm1 +#define c %xmm2 +#define d %xmm3 +#define t1 %xmm4 +#define t2 %xmm5 +#define t3 %xmm6 +#define t4 %xmm7 + /* the integer registers carry one extra key next to each group of four SSE2 lanes */ +#define a3 %eax +#define b3 %ebx +#define c3 %ecx +#define d3 %edx +#define t13 %esi +#define t23 %edi +#define Q2 $0x5a827999 +#define Q3 $0x6ed9eba1 /* Q2 and Q3 are the immediate forms of const_stage2 and const_stage3, used for the scalar key */ + +/* +#define F(x, y, z) (z ^ (x & (y ^ z))) +#define G(x, y, z) ((x & (y | z)) | (y & z)) +#define H(x, y, z) (x ^ y ^ z) + +#define STEP(f, a, b, c, d, x, s) + a += f(b, c, d) 
+ x; + a = (a << s) | (a >> (32 - s)); +*/ +#define STEP1(aa, bb, cc, dd, aa3, bb3, cc3, dd3, x, s, base) /* Round 1 step on four SSE2 lanes (aa) and one scalar key (aa3): add word x and F(bb, cc, dd), then rotate left by s */ \ + paddd (256*base)+(x*16)+nt_buffer4x, aa; /* 256 bytes per group of 4 keys, 16 bytes per word position */ \ + add (64*base)+(x*4)+nt_buffer1x, aa3; /* 64 bytes per scalar key */ \ + movdqa cc, t1; \ + mov cc3, t13; \ + pxor dd, t1; \ + xor dd3, t13; \ + pand bb, t1; \ + and bb3, t13; \ + pxor dd, t1; \ + xor dd3, t13; \ + paddd t1, aa; \ + add t13, aa3; \ + movdqa aa, t2; \ + rol $s, aa3; /* the scalar key uses a rotate instruction; the vector rotate is built from two shifts and an OR */ \ + pslld $s, aa; \ + psrld $(32-s), t2; \ + por t2, aa; + +#define STEP2(aa, bb, cc, dd, aa3, bb3, cc3, dd3, x, s, base) /* Round 2 step: add word x, G(bb, cc, dd) and the Round 2 constant, then rotate left by s */ \ + paddd (256*base)+(x*16)+nt_buffer4x, aa; \ + add (64*base)+(x*4)+nt_buffer1x, aa3; \ + movdqa cc, t1; \ + mov cc3, t13; \ + movdqa cc, t2; \ + mov cc3, t23; \ + por dd, t1; \ + or dd3, t13; \ + pand dd, t2; \ + and dd3, t23; \ + pand bb, t1; \ + and bb3, t13; \ + paddd t3, aa; /* t3 holds const_stage2, loaded in nt_crypt_all_sse2 */ \ + add Q2, aa3; \ + por t2, t1; \ + or t23, t13; \ + paddd t1, aa; \ + add t13, aa3; \ + movdqa aa, t1; \ + rol $s, aa3; \ + pslld $s, aa; \ + psrld $(32-s), t1; \ + por t1, aa; + +#define STEP3(aa, bb, cc, dd, aa3, bb3, cc3, dd3, x, s, base) /* Round 3 step: add word x, (bb ^ cc ^ dd) and the Round 3 constant, then rotate left by s */ \ + paddd (256*base)+(x*16)+nt_buffer4x, aa; \ + add (64*base)+(x*4)+nt_buffer1x, aa3; \ + movdqa dd, t1; \ + mov dd3, t13; \ + pxor cc, t1; \ + xor cc3, t13; \ + paddd t4, aa; /* t4 holds const_stage3 */ \ + add Q3, aa3; \ + pxor bb, t1; \ + xor bb3, t13; \ + paddd t1, aa; \ + add t13, aa3; \ + movdqa aa, t1; \ + rol $s, aa3; \ + pslld $s, aa; \ + psrld $(32-s), t1; \ + por t1, aa; + +#define NT_CRYPT_BODY(base) /* Hashes the four keys of 4x group base together with scalar key base. */ \ + movdqa const_init_a, a; \ + mov const_init_a, a3; /* scalar loads read the first 32-bit entry of each constant table */ \ + movdqa const_init_b, b; \ + mov const_init_b, b3; \ + movdqa const_init_c, c; \ + mov const_init_c, c3; \ + movdqa const_init_d, d; \ + mov const_init_d, d3; \ + \ + paddd (256*base)+nt_buffer4x, a; \ + add (64*base)+nt_buffer1x, a3; \ + pslld $3, a; /* NOTE(review): the vector side only shifts here while the scalar side rotates; the two agree only while the top three bits of (0xFFFFFFFF + word 0) are zero -- confirm against what set_key stores in word 0 */ \ + rol $3, a3; \ + \ + STEP1(d, a, b, c, d3, a3, b3, c3, 1 , 7 , base) \ + STEP1(c, d, a, b, c3, d3, a3, b3, 2 , 11, base) \ + STEP1(b, c, d, a, b3, c3, d3, a3, 3 , 19, base) \ + STEP1(a, b, c, d, a3, b3, c3, d3, 4 , 3 , base) \ + STEP1(d, a, b, 
c, d3, a3, b3, c3, 5 , 7 , base) \ + STEP1(c, d, a, b, c3, d3, a3, b3, 6 , 11, base) \ + STEP1(b, c, d, a, b3, c3, d3, a3, 7 , 19, base) \ + STEP1(a, b, c, d, a3, b3, c3, d3, 8 , 3 , base) \ + STEP1(d, a, b, c, d3, a3, b3, c3, 9 , 7 , base) \ + STEP1(c, d, a, b, c3, d3, a3, b3, 10, 11, base) \ + STEP1(b, c, d, a, b3, c3, d3, a3, 11, 19, base) \ + STEP1(a, b, c, d, a3, b3, c3, d3, 12, 3 , base) \ + STEP1(d, a, b, c, d3, a3, b3, c3, 13, 7 , base) \ + STEP1(c, d, a, b, c3, d3, a3, b3, 14, 11, base) \ + STEP1(b, c, d, a, b3, c3, d3, a3, 15, 19, base) \ + /* Round 2: G steps, words taken in the order 0,4,8,12, 1,5,9,13, ... with rotations 3, 5, 9, 13 */ \ + STEP2(a, b, c, d, a3, b3, c3, d3, 0 , 3 , base) \ + STEP2(d, a, b, c, d3, a3, b3, c3, 4 , 5 , base) \ + STEP2(c, d, a, b, c3, d3, a3, b3, 8 , 9 , base) \ + STEP2(b, c, d, a, b3, c3, d3, a3, 12, 13, base) \ + STEP2(a, b, c, d, a3, b3, c3, d3, 1 , 3 , base) \ + STEP2(d, a, b, c, d3, a3, b3, c3, 5 , 5 , base) \ + STEP2(c, d, a, b, c3, d3, a3, b3, 9 , 9 , base) \ + STEP2(b, c, d, a, b3, c3, d3, a3, 13, 13, base) \ + STEP2(a, b, c, d, a3, b3, c3, d3, 2 , 3 , base) \ + STEP2(d, a, b, c, d3, a3, b3, c3, 6 , 5 , base) \ + STEP2(c, d, a, b, c3, d3, a3, b3, 10, 9 , base) \ + STEP2(b, c, d, a, b3, c3, d3, a3, 14, 13, base) \ + STEP2(a, b, c, d, a3, b3, c3, d3, 3 , 3 , base) \ + STEP2(d, a, b, c, d3, a3, b3, c3, 7 , 5 , base) \ + STEP2(c, d, a, b, c3, d3, a3, b3, 11, 9 , base) \ + STEP2(b, c, d, a, b3, c3, d3, a3, 15, 13, base) \ + /* Round 3: H steps with rotations 3, 9, 11, 15; only the first eleven are complete */ \ + STEP3(a, b, c, d, a3, b3, c3, d3, 0 , 3 , base) \ + STEP3(d, a, b, c, d3, a3, b3, c3, 8 , 9 , base) \ + STEP3(c, d, a, b, c3, d3, a3, b3, 4 , 11, base) \ + STEP3(b, c, d, a, b3, c3, d3, a3, 12, 15, base) \ + STEP3(a, b, c, d, a3, b3, c3, d3, 2 , 3 , base) \ + STEP3(d, a, b, c, d3, a3, b3, c3, 10, 9 , base) \ + STEP3(c, d, a, b, c3, d3, a3, b3, 6 , 11, base) \ + STEP3(b, c, d, a, b3, c3, d3, a3, 14, 15, base) \ + STEP3(a, b, c, d, a3, b3, c3, d3, 1 , 3 , base) \ + STEP3(d, a, b, c, d3, a3, b3, c3, 9 , 9 , base) \ + STEP3(c, d, a, b, c3, d3, a3, b3, 5 , 11, base) \ + movdqa a, t1; /* start of the last, partial b step */ \ + mov a3, t13; 
\ + paddd (256*base)+208+nt_buffer4x, b; /* word 13: 208 = 13*16 for the 4x buffer, 52 = 13*4 for the scalar buffer */ \ + add (64*base)+52+nt_buffer1x, b3; \ + pxor d, t1; \ + xor d3,t13; \ + pxor c, t1; \ + xor c3,t13; \ + paddd t1, b; /* b += a ^ d ^ c, with no constant and no rotate: the step is left unfinished */ \ + add t13,b3; \ + \ + movdqa a, (64*base)+output4x; /* per group: 4 a words, 4 b words, 4 c words, 4 d words; the scalar key stores a, b, c, d contiguously */ \ + mov a3, (16*base)+output1x; \ + movdqa b, (64*base)+16+output4x; \ + mov b3, (16*base)+4+output1x; \ + movdqa c, (64*base)+32+output4x; \ + mov c3, (16*base)+8+output1x; \ + movdqa d, (64*base)+48+output4x; \ + mov d3, (16*base)+12+output1x; +.text /* the constants above were placed after .data; switch back so that the routine is assembled into the code section instead of the data section */ +nt_crypt_all_sse2: + pusha /* every general-purpose register is used for the scalar key, so all are saved and restored */ + + movdqa const_stage2, t3 + movdqa const_stage3, t4 + + NT_CRYPT_BODY(0) + NT_CRYPT_BODY(1) + NT_CRYPT_BODY(2) + NT_CRYPT_BODY(3) + NT_CRYPT_BODY(4) + NT_CRYPT_BODY(5) + NT_CRYPT_BODY(6) + NT_CRYPT_BODY(7) /* 8 groups of 4 SSE2 keys plus 8 scalar keys; the count argument is never read */ + + popa + + ret diff -ruN src/x86-sse.h src_nt/x86-sse.h --- src/x86-sse.h 2006-05-10 07:25:58.000000000 +0200 +++ src_nt/x86-sse.h 2007-03-07 20:30:57.000000000 +0100 @@ -58,4 +58,6 @@ #define BF_ASM 1 #define BF_SCALE 1 +#define NT_SSE2 /* enables the NT_SSE2 branches of NT_fmt.c */ + #endif