commit 13f2e365b4a831b5ba97c4fd91cf16f1b4004aa1 Author: Myrice Date: Tue Mar 27 23:14:33 2012 +0800 Use shared memory in rawsha256.cu Signed-off-by: Myrice diff --git a/src/cuda/rawsha256.cu b/src/cuda/rawsha256.cu index 2afa44a..1b03e34 100644 --- a/src/cuda/rawsha256.cu +++ b/src/cuda/rawsha256.cu @@ -47,7 +47,7 @@ static void cuda_rawsha256(sha256_password *host_in,void *out) } -__global__ void kernel_sha256(sha256_password *data,SHA_HASH* data_out){ /// todo - use shared memory +__global__ void kernel_sha256(sha256_password *data,SHA_HASH* data_out){ /// todo - avoid bank conflict uint32_t idx = blockIdx.x*blockDim.x + threadIdx.x; const uint32_t k[]={ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, @@ -59,7 +59,6 @@ __global__ void kernel_sha256(sha256_password *data,SHA_HASH* data_out){ /// tod 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2}; uint32_t w[64]={0}; - SHA_HASH* out=&data_out[idx]; #pragma unroll 64 for(uint32_t j=0;j<64;j++){ @@ -75,11 +74,26 @@ __global__ void kernel_sha256(sha256_password *data,SHA_HASH* data_out){ /// tod uint32_t t2=Sigma0(a)+Maj(a,b,c); h=g;g=f;f=e;e=d+t1;d=c;c=b;b=a;a=t1+t2; } - out->v[0]=a+H[0];out->v[1]=b+H[1]; - out->v[2]=c+H[2];out->v[3]=d+H[3]; - out->v[4]=e+H[4];out->v[5]=f+H[5]; - out->v[6]=g+H[6]; + __shared__ uint32_t sdata[1024]; + uint32_t bits_size=7; #ifdef SHA256 - out->v[7]=h+H[7]; + bits_size = 8; #endif -} \ No newline at end of file + sdata[0+threadIdx.x*bits_size] = a+H[0]; + sdata[1+threadIdx.x*bits_size] = b+H[1]; + sdata[2+threadIdx.x*bits_size] = c+H[2]; + sdata[3+threadIdx.x*bits_size] = d+H[3]; + sdata[4+threadIdx.x*bits_size] = e+H[4]; + sdata[5+threadIdx.x*bits_size] = f+H[5]; + sdata[6+threadIdx.x*bits_size] = g+H[6]; + #ifdef SHA256 + sdata[7+threadIdx.x*bits_size] = h+H[7]; + #endif + __syncthreads(); + uint32_t* out = (uint32_t*)data_out; + uint32_t b_off = (blockDim.x*bits_size)*blockIdx.x; + for(uint32_t i = 0; i < bits_size; ++i) { + uint32_t t_off = blockDim.x*i+threadIdx.x; + out[b_off+t_off] = sdata[t_off]; + } +}