--- bf_kernel.cl- 2012-12-11 05:57:24.586695429 +0400 +++ bf_kernel.cl 2012-12-13 05:32:05.693500731 +0400 @@ -19,6 +19,8 @@ #define WAVEFRONT_SIZE 1 +#define S4_2_SIZE 128 + #define CHANNEL_INTERLEAVE NUM_CHANNELS*WAVEFRONT_SIZE #define pos_S(row,col)\ @@ -30,9 +32,47 @@ #define pos_S_local(row,col)\ _index_S_local + (row<<8) + col +#if 0 + tmp1 = (tmp1 < (256-S4_2_SIZE)) ? Sptr4[tmp1] : S4_2[tmp1 - (256-S4_2_SIZE)]; \ + + { \ + uint mask = (uint) -(int)((tmp1 < (256-S4_2_SIZE))); \ + tmp1 = (mask & Sptr4[tmp1]) | (~mask & S4_2[tmp1 - (256-S4_2_SIZE)]); \ + } \ + + { \ + uint mask = - (int) ((L & 0x80) != 0); \ + tmp1 = L & 0x7f; \ + tmp1 = (~mask & Sptr4[tmp1]) | (mask & S4_2[tmp1]); \ + } \ + + uint mask = (L & 0x80) ? 0xffffffffU : 0; \ + + tmp1 = !(L & 0x80) ? Sptr4[tmp1] : S4_2[tmp1]; \ + tmp1 = (L & 0x80) ? S4_2[tmp1] : Sptr4[tmp1]; \ + + tmp1 = L & 0x7f; \ + tmp1 = mask ? S4_2[tmp1] : Sptr4[tmp1]; \ + + tmp1 = mask & S4_2[tmp1]; \ + tmp1 += ~mask & Sptr4[tmp1]; \ + + tmp1 = L & 0x7f; \ + tmp1 = bitselect(Sptr4[tmp1], S4_2[tmp1], (uint) -(int) !!(L & 0x80)); \ + + tmp1 = L & 0x7f; \ + tmp1 = bitselect(Sptr4[tmp1], S4_2[tmp1], (uint) -(int) ((L & 0x80) != 0)); \ + + tmp1 = L & 0x7f; \ + tmp1 = bitselect(Sptr4[tmp1], S4_2[tmp1], (uint)((int)(L & 0x80) << 24 >> 31)); \ + +if ((L & 0xf5f5f500) == 0) printf("%08x %08x\n", tmp1, mask); \ + +#endif + #define BF_ROUND(ctx_S,ctx_P, L, R, N, tmp1, tmp2, tmp3, tmp4) \ - tmp1 = L & 0xff; \ - tmp1 = Sptr4[tmp1];\ + tmp1 = L & 0x7f; \ + tmp1 = bitselect(Sptr4[tmp1], S4_2[tmp1], (uint) -(int) !!(L & 0x80)); \ tmp2 = L >> 8; \ tmp3= (tmp2>>8);\ tmp4= tmp3>>8; \ @@ -98,7 +138,7 @@ BF_current_P[16]= L0;\ BF_current_P[17]= R0;\ \ - for(i = 0; i < 1024; i += 8) \ + for(i = 0; i < 1024-S4_2_SIZE; i += 8) \ { BF_ENCRYPT(Sptr, BF_current_P, L0, R0);\ Sptr[i] = L0;\ Sptr[i + 1] = R0;\ @@ -111,6 +151,20 @@ BF_ENCRYPT(Sptr, BF_current_P, L0, R0);\ Sptr[i + 6] = L0;\ Sptr[i + 7] = R0;\ + } \ + for(i = 0; i < S4_2_SIZE; i += 8) \ + { BF_ENCRYPT(Sptr, BF_current_P, L0, R0);\ + S4_2[i] = L0;\ + S4_2[i + 1] = R0;\ + BF_ENCRYPT(Sptr, BF_current_P, L0, R0);\ + S4_2[i + 2] = L0;\ + S4_2[i + 3] = R0;\ + BF_ENCRYPT(Sptr, BF_current_P, L0, R0);\ + S4_2[i + 4] = L0;\ + S4_2[i + 5] = R0;\ + BF_ENCRYPT(Sptr, BF_current_P, L0, R0);\ + S4_2[i + 6] = L0;\ + S4_2[i + 7] = R0;\ } __kernel void blowfish(constant uint *salt __attribute__((max_constant_size(16))), @@ -126,18 +180,21 @@ int _index_S=(index/(CHANNEL_INTERLEAVE))*(CHANNEL_INTERLEAVE)*1024 + index%(CHANNEL_INTERLEAVE); int _index_P = 18*index; - int _index_S_local=lid*1024; + int _index_S_local=lid*(1024-S4_2_SIZE); +// int _index_S_local=lid*1024; int i,j,tmp0; uint BF_key_exp[18]; uint BF_current_P[18]; - MAYBE_LOCAL uint S_Buffer[WORK_GROUP_SIZE*1024]; + MAYBE_LOCAL uint S_Buffer[WORK_GROUP_SIZE*(1024-S4_2_SIZE)]; +// MAYBE_LOCAL uint S_Buffer[WORK_GROUP_SIZE*1024]; MAYBE_LOCAL uint *Sptr = S_Buffer + _index_S_local; MAYBE_LOCAL uint *Sptr2 = Sptr + 256; MAYBE_LOCAL uint *Sptr3 = Sptr + 512; MAYBE_LOCAL uint *Sptr4 = Sptr + 768; + uint S4_2[S4_2_SIZE]; for(i=0;i<18;i++){ tmp0=BF_current_P_global [pos_P(i)]; @@ -145,10 +202,13 @@ BF_key_exp[i]=tmp0^P_box[i]; } - for(i=0;i<1024;i++){ + for(i=0;i<1024-S4_2_SIZE;i++){ j=i>>8; S_Buffer[pos_S_local(j,(i&0xff))] = S_box[i] ; } + for(i=0;i>8; L0 ^= salt[(BF_ROUNDS + 2) & 3]; R0 ^= salt[(BF_ROUNDS + 3) & 3]; @@ -176,7 +236,19 @@ BF_ENCRYPT(Sptr, BF_current_P, L0, R0); S_Buffer[pos_S_local(j,((i+2)&0xff))] = L0; S_Buffer[pos_S_local(j,((i+3)&0xff))] = R0; - + } + + for(i = 0; i < S4_2_SIZE; i += 4) { + L0 ^= salt[(BF_ROUNDS + 2) & 3]; + R0 ^= salt[(BF_ROUNDS + 3) & 3]; + BF_ENCRYPT(Sptr, BF_current_P, L0, R0); + S4_2[i] = L0; + S4_2[i+1] = R0; + L0 ^= salt[(BF_ROUNDS + 4) & 3]; + R0 ^= salt[(BF_ROUNDS + 5) & 3]; + BF_ENCRYPT(Sptr, BF_current_P, L0, R0); + S4_2[i+2] = L0; + S4_2[i+3] = R0; } @@ -247,9 +319,11 @@ for(i=0;i<18;i++) BF_current_P_global [pos_P(i)]=BF_current_P[i]; - for(i=0;i<1024;i++){ + for(i=0;i<1024-S4_2_SIZE;i++){ j=i>>8; BF_current_S[pos_S(j,(i&0xff))] = S_Buffer[pos_S_local(j,(i&0xff))] ; } - + for(i=0;i