Openwall GNU/*/Linux - a small security-enhanced Linux distro for servers
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Thu, 3 Sep 2015 18:15:29 +0300
From: Solar Designer <solar@...nwall.com>
To: john-dev@...ts.openwall.com
Subject: MD5 I() (was: SHA-1 H())

On Wed, Sep 02, 2015 at 06:52:25PM +0300, Solar Designer wrote:
> Attached to this message is a program I used to search for possible
> optimized expressions like this.  [...]  I was hoping
> it might find a 2 operation expression for MD5's I(), but no luck.

I've enhanced the program, and had better luck today:

$ ./search3 y n n n 2>&1 | cut -d' ' -f7- | sort -u
165
sel(ff, 55, 0f) = f5; 33 ^ f5 = c6;
sel(ff, 55, 0f) = f5; f5 ^ 33 = c6;

#define I(x, y, z)	(bitselect(0xffffffffU, (x), (z)) ^ (y))

To remind, the original was:

#define I(x, y, z)	((y) ^ ((x) | ~(z)))

I think it's the first time MD5 I() has been shown to be implementable
with only 2 operations on architectures without OR-NOT (and without
XNOR, and of course also without "ternary" logic instructions such as
Maxwell's or AVX-512's, which would turn this into 1 operation with no
effort).

Now that I think of it, the expression is actually very simple and I
should have been able to arrive at it without a program.  bitselect()
with the all-ones constant is directly usable to implement OR-NOT. :-)

> It doesn't yet test two bitselect()'s per expression, though - this is
> worth adding and trying again (many possibilities to test there).

It does now, and it also tests constants as you can see.  New version
attached.  And here's a table produced with it:

$ ./search3.sh
SEL     XNOR    ORN     ANDN    COUNT   MD5_I
yes     yes     yes     yes     190     yes
yes     yes     yes     no      190     yes
yes     yes     no      yes     190     yes
yes     yes     no      no      178     yes
yes     no      yes     yes     177     yes
yes     no      yes     no      177     yes
yes     no      no      yes     177     yes
yes     no      no      no      165     yes
no      yes     yes     yes     144     yes
no      yes     yes     no      114     yes
no      yes     no      yes     114     yes
no      yes     no      no      72      no
no      no      yes     yes     131     yes
no      no      yes     no      95      yes
no      no      no      yes     95      no
no      no      no      no      59      no

This shows the number of different truth tables possible for 3 inputs
with at most 2 operations on different architectures.  (It is assumed
that AND, OR, NOT and constants are always available, so only possible
instruction set extensions are listed in the table.)  The theoretical
maximum (actually achieved with Maxwell's or AVX-512's "ternary" logic
instructions) is 256.

The last column shows whether MD5's I() is implementable with 2
operations or not.  Similar tables can be generated for other
expressions, such as those found in other MD4/MD5, SHA-1, and SHA-2
basic functions - but for those we readily knew seemingly optimal
expressions using bitselect(), so I focused on MD5's I() for now.

Unfortunately, the program became rather long.  I'm sure it can be
greatly simplified - and perhaps it should in fact be rewritten and
tested against this version in order for us to have greater confidence
it actually finds all possible truth tables rather than only a (very
large) subset.

As to machine code improvements from this change to I(), here's GCN
assembly from our md5crypt kernel.  Before the change:

  v_not_b32     v5, v7                                      // 00003DA8: 7E0A6F07
  v_or_b32      v5, v1, v5                                  // 00003DAC: 380A0B01
  v_xor_b32     v5, v2, v5                                  // 00003DB0: 3A0A0B02
  v_add_i32     v4, vcc, v13, v4                            // 00003DB4: 4A08090D
  v_add_i32     v4, vcc, v5, v4                             // 00003DB8: 4A080905
  v_add_i32     v4, vcc, 0xbd3af235, v4                     // 00003DBC: 4A0808FF BD3AF235
  v_alignbit_b32  v4, v4, v4, 22                            // 00003DC4: D29C0004 025A0904
  v_add_i32     v4, vcc, v1, v4                             // 00003DCC: 4A080901

After the change (surprisingly, register allocation and offsets look
unchanged):

  v_bfi_b32     v5, v7, v1, -1                              // 00003DA8: D2940005 03060307
  v_xor_b32     v5, v2, v5                                  // 00003DB0: 3A0A0B02
  v_add_i32     v4, vcc, v13, v4                            // 00003DB4: 4A08090D
  v_add_i32     v4, vcc, v5, v4                             // 00003DB8: 4A080905
  v_add_i32     v4, vcc, 0xbd3af235, v4                     // 00003DBC: 4A0808FF BD3AF235
  v_alignbit_b32  v4, v4, v4, 22                            // 00003DC4: D29C0004 025A0904
  v_add_i32     v4, vcc, v1, v4                             // 00003DCC: 4A080901

That's 7 instructions instead of 8.  However, there's no code size
reduction in bytes, since v_bfi_b32 occupies 8 bytes (and it does so
even when no immediate value operand is used).

The effect of this will need to be tested across multiple OpenCL kernels
and pieces of C+intrinsics code across multiple architectures.  Besides
GPUs, also on AMD CPUs with XOP.

Alexander

[ CONTENT OF TYPE application/x-sh SKIPPED ]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))

#define f(x, y, z) ((y) ^ ((x) | ~(z)))
//#define f(x, y, z) ((x & y) | (z & (x | y)))
//#define f(x, y, z) Ch(x, y, z)
//#define f(x, y, z) Maj(x, y, z)

#define sel(x, y, z) ((x) ^ ((z) & ((y) ^ (x))))

static int no_sel, no_xnor, no_orn, no_andn;
static char how[0x100];

static void reorder(unsigned int a[3], unsigned int r)
{
	unsigned int x = a[0], y = a[1], z = a[2];
	switch (r) {
		case 1:
			a[0] = x;
			a[1] = z;
			a[2] = y;
			break;
		case 2:
			a[0] = y;
			a[1] = x;
			a[2] = z;
			break;
		case 3:
			a[0] = y;
			a[1] = z;
			a[2] = x;
			break;
		case 4:
			a[0] = z;
			a[1] = x;
			a[2] = y;
			break;
		case 5:
			a[0] = z;
			a[1] = y;
			a[2] = x;
			break;
	}
}

static unsigned int save_op(unsigned int x, unsigned int y, unsigned int r,
    char *op)
{
	sprintf(how + strlen(how), " %02x %s %02x = %02x;",
	    x & 0xff, op, y & 0xff, r & 0xff);
	return r;
}

static unsigned int save_sel(unsigned int x, unsigned int y, unsigned int z)
{
	unsigned int r = sel(x, y, z);
	sprintf(how + strlen(how), " sel(%02x, %02x, %02x) = %02x;",
	    x & 0xff, y & 0xff, z & 0xff, r & 0xff);
	return r;
}

static unsigned int op(unsigned int x, unsigned int y, unsigned int z,
    unsigned int o)
{
	switch (o) {
		case 0:
			if (no_sel) return 0xdead;
			return save_sel(x, y, z);
		case 1:
			return 0;
		case 2:
			return 0xff;
		case 3:
			return x;
		case 4:
			return ~x;
		case 5:
			return save_op(x, y, x ^ y, "^");
		case 6:
			return save_op(x, y, x | y, "|");
		case 7:
			return save_op(x, y, x & y, "&");
		case 8:
			if (no_xnor) return 0xdead;
			return save_op(x, y, x ^ ~y, "^~");
		case 9:
			if (no_orn) return 0xdead;
			return save_op(x, y, x | ~y, "|~");
		case 10:
			if (no_andn) return 0xdead;
			return save_op(x, y, x & ~y, "&~");
	}

	abort();
}

int main(int argc, char **argv)
{
	unsigned int i, j, k, m, n, o;
	unsigned int diff = 0;
	char seen[0x100] = {0};

	if (argc == 5) {
		no_sel = argv[1][0] == 'n';
		no_xnor = argv[2][0] == 'n';
		no_orn = argv[3][0] == 'n';
		no_andn = argv[4][0] == 'n';
	}

	for (o = 0; o <= 10; o++)
	for (n = 0; n <= 8; n++)
#ifdef DUPES
	for (m = 0; m <= 12; m++)
#else
	for (m = 0; m <= 6; m++)
#endif
	for (k = 0; k <= 75; k++)
	for (j = 0; j < 6; j++)
	for (i = 0; i < 4; i++) {
		unsigned int x = 0x55, y = 0x33, z = 0x0f;
		unsigned int a[4] = {0xdead, x, y, z};

		how[0] = 0;

		reorder(&a[1], j);

#ifdef DUPES
		if (m >= 7 && m <= 9)
			a[m - 6] = 0;
		else if (m >= 10)
			a[m - 9] = 0xff;
#endif

		if (!i) {
			a[0] = op(a[1], a[2], a[3], o);
			if (a[0] == 0xdead) continue;
		}

#ifndef DUPES
		if ((k != 25 && k != 26) || (i != m && i != m - 3)) {
#endif
			if (m >= 1 && m <= 3)
				a[m] = 0;
			else if (m >= 4 && m <= 6)
				a[m - 3] = 0xff;
#ifndef DUPES
		}
#endif

#ifdef DUPES
#define maybe_continue
#else
#define maybe_continue continue
#endif

		if (k == 27 && i) maybe_continue;

		unsigned int *which = &a[i];
		switch (k) {
			case 0:
				*which = op(*which, 0, 0, 4);
				break;
			case 1:
			case 2:
			case 3:
				if (which == &a[k]) maybe_continue;
				*which = op(*which, a[k], 0, 5);
				break;
			case 4:
			case 5:
			case 6:
				if (which == &a[k - 3]) maybe_continue;
				*which = op(*which, a[k - 3], 0, 6);
				break;
			case 7:
			case 8:
			case 9:
				if (which == &a[k - 6]) maybe_continue;
				*which = op(*which, a[k - 6], 0, 7);
				break;
			case 10:
			case 11:
			case 12:
				if (no_xnor) continue;
				if (which == &a[k - 9]) maybe_continue;
				*which = op(*which, a[k - 9], 0, 8);
				break;
			case 13:
			case 14:
			case 15:
				if (no_orn) continue;
				if (which == &a[k - 12]) maybe_continue;
				*which = op(*which, a[k - 12], 0, 9);
				break;
#ifdef DUPES
			case 16:
			case 17:
			case 18:
				if (no_orn) continue;
				if (which == &a[k - 15]) maybe_continue;
				*which = op(*which, a[k - 15], 0, 9);
				break;
#endif
			case 19:
			case 20:
			case 21:
				if (no_andn) continue;
				if (which == &a[k - 18]) maybe_continue;
				*which = op(*which, a[k - 18], 0, 10);
				break;
#ifdef DUPES
			case 22:
			case 23:
			case 24:
				if (no_andn) continue;
				if (which == &a[k - 21]) maybe_continue;
				*which = op(*which, a[k - 21], 0, 11);
				break;
			case 25:
				maybe_continue;
				if (which == &a[0]) maybe_continue;
				*which = 0;
				break;
			case 26:
				maybe_continue;
				if (which == &a[0]) maybe_continue;
				*which = 0xff;
				break;
			case 27:
				/* nop */
				break;
#endif
			default:
				if (k < 28) continue;
				if (no_sel) continue;
				if (which == &a[0]) maybe_continue;
				if (k < 34) {
					maybe_continue;
					unsigned int b[3] = {a[1], a[2], a[3]};
					reorder(b, k - 28);
					*which = save_sel(b[0], b[1], b[2]);
				} else if (k < 40) {
					maybe_continue;
					if (!m) maybe_continue;
					unsigned int b[3] = {x, y, z};
					reorder(b, j);
					reorder(b, k - 34);
					*which = save_sel(b[0], b[1], b[2]);
				} else {
					if (!m) maybe_continue;
					unsigned int b[3] = {a[1], a[2], a[3]};
					reorder(b, (k - 40) % 6);
					a[1] = x; a[2] = y; a[3] = z;
					reorder(&a[1], (k - 40) / 6);
					*which = save_sel(b[0], b[1], b[2]);
				}
		}

		if (n && i) {
			unsigned int *d[2];
			switch (i) {
				case 1:
					d[0] = &a[2];
					d[1] = &a[3];
					break;
				case 2:
					d[0] = &a[1];
					d[1] = &a[3];
					break;
				case 3:
					d[0] = &a[1];
					d[1] = &a[2];
					break;
			}
			switch ((n - 1) >> 1) {
				case 0:
					*d[n & 1] = *which;
					break;
				case 1:
					*d[n & 1] = x;
					break;
				case 2:
					*d[n & 1] = y;
					break;
				case 3:
					*d[n & 1] = z;
					break;
			}
		}

		if (which != &a[0])
			a[0] = op(a[1], a[2], a[3], o);

		if (a[0] == 0xdead) continue;

		a[0] &= 0xff;

		if (!seen[a[0]]) {
			seen[a[0]] = 1;
			diff++;
		}

#ifdef TRUTH
		printf("%02x\n", a[0]);
#endif

		if (a[0] != (f(x, y, z) & 0xff)) continue;

		fprintf(stderr, "%u %u %u %u %u %u%s\n",
		    o, n, m, k, j, i, how);
	}

	printf("%u\n", diff);

	return 0;
}

Powered by blists - more mailing lists

Your e-mail address:

Powered by Openwall GNU/*/Linux - Powered by OpenVZ