diff --git a/run/john.conf b/run/john.conf
index 23cec4c..7ed0ab7 100644
--- a/run/john.conf
+++ b/run/john.conf
@@ -86,8 +86,8 @@ Device = 0
 #rar_KPC = 8192
 
 # For Crypt sha-512.
-cryptsha512_LWS = 64
-cryptsha512_KPC = 8192
+#cryptsha512_LWS = 64
+#cryptsha512_KPC = 5120
 
 
 # A user defined character class is named with a single digit, ie. 0..9. After
diff --git a/src/common-opencl.c b/src/common-opencl.c
index f1b6cfa..f26c189 100644
--- a/src/common-opencl.c
+++ b/src/common-opencl.c
@@ -8,6 +8,8 @@
 static char opencl_log[LOG_SIZE];
 static char *kernel_source;
 static int kernel_loaded;
+static int device_info;
+static int cores_per_MP;
 
 void advance_cursor() {
   static int pos=0;
@@ -85,30 +87,20 @@ static void dev_init(unsigned int dev_id, unsigned int platform_id)
 	HANDLE_CLERROR(ret_code, "Error creating command queue");
 }
 
-cl_device_type get_device_type(int dev_id)
-{
-	cl_device_type type;
-	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_TYPE,
-	                               sizeof(cl_device_type), &type, NULL),
-	               "Error querying CL_DEVICE_TYPE");
-
-	return type;
-}
-
 static char * include_source(char *pathname, int dev_id)
 {
 	static char include[PATH_BUFFER_SIZE];
 
-	sprintf(include, "-I %s %s %s", path_expand(pathname),
+	sprintf(include, "-I %s %s %s%d %s", path_expand(pathname),
 	        get_device_type(dev_id) == CL_DEVICE_TYPE_CPU ?
 	        "-DDEVICE_IS_CPU" : "",
+                "-DDEVICE_INFO=", device_info,
 	        "-cl-strict-aliasing -cl-mad-enable");
 
 	//fprintf(stderr, "Options used: %s\n", include);
 	return include;
 }
 
-
 static void build_kernel(int dev_id)
 {
 	assert(kernel_loaded);
@@ -164,13 +156,54 @@ static void build_kernel(int dev_id)
 #endif
 }
 
+void opencl_get_dev_info(unsigned int dev_id)
+{
+        cl_device_type device;
+        
+        device = get_device_type(dev_id);
+        
+        if (device == CL_DEVICE_TYPE_CPU)
+                device_info = CPU;
+        else if (device == CL_DEVICE_TYPE_GPU)
+                device_info = GPU;
+        else if (device == CL_DEVICE_TYPE_ACCELERATOR)
+                device_info = ACCELERATOR;
+
+        device_info += get_vendor_id(dev_id);
+        device_info += get_processor_family(dev_id);
+}
+
+void opencl_init_dev(unsigned int dev_id, unsigned int platform_id)
+{
+	dev_init(dev_id, platform_id);        
+        opencl_get_dev_info(dev_id);
+}
+
+void opencl_build_kernel(char *kernel_filename, unsigned int dev_id)
+{
+	read_kernel_source(kernel_filename);
+	build_kernel(dev_id);
+}
+
 void opencl_init(char *kernel_filename, unsigned int dev_id,
                  unsigned int platform_id)
 {
-	//if (!kernel_loaded)
-		read_kernel_source(kernel_filename);
-		dev_init(dev_id, platform_id);
-	build_kernel(dev_id);
+        opencl_init_dev(dev_id, platform_id);
+        opencl_build_kernel(kernel_filename, dev_id);
+}
+
+int get_device_info(){
+    return device_info;
+}
+
+cl_device_type get_device_type(int dev_id)
+{
+	cl_device_type type;
+	HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_TYPE,
+	                               sizeof(cl_device_type), &type, NULL),
+	               "Error querying CL_DEVICE_TYPE");
+
+	return type;
 }
 
 cl_ulong get_local_memory_size(int dev_id)
@@ -194,6 +227,17 @@ size_t get_max_work_group_size(int dev_id)
         return max_group_size;
 }
 
+size_t get_current_work_group_size(int dev_id, cl_kernel crypt_kernel) {
+    size_t max_group_size;
+
+    HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[dev_id],
+            CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_group_size),
+            &max_group_size, NULL),
+            "Error querying clGetKernelWorkGroupInfo");
+
+    return max_group_size;
+}
+
 cl_uint get_max_compute_units(int dev_id)
 {
         cl_uint size;
@@ -204,6 +248,95 @@ cl_uint get_max_compute_units(int dev_id)
         return size;
 }
 
+cl_uint get_processors_count(int dev_id)
+{
+        int major = 0, minor = 0;
+        cl_uint core_count = get_max_compute_units(dev_id);
+
+        if (gpu_nvidia(device_info)) {
+                //oclGetDevCap(devices[dev_id], &major, &minor);
+
+                if (major == 1)  
+                        core_count *= (cores_per_MP = 8);
+                else if (major == 2 && minor == 0) 
+                        core_count *= (cores_per_MP = 32);  //2.0
+                else if (major == 2 && minor >= 1) 
+                        core_count *= (cores_per_MP = 48);  //2.1 and up
+                else if (major == 3)
+                        core_count *= (cores_per_MP = 192); //3.0 and up
+                else  
+                        core_count *= (cores_per_MP = 192); //Future use
+
+                if (major == 9999 && minor == 9999)
+                        core_count = 0;
+        }
+        else if (gpu_amd(device_info)) {  
+                core_count *= 16 *   //16 thread processors * 5 SP 
+                        ((amd_gcn(device_info) || amd_vliw4(device_info)) ? 4 : 5); 
+        }
+        else if (gpu(device_info))  //Any other GPU
+                core_count *=8; 
+      
+        return core_count;
+}
+
+cl_uint get_processor_family(int dev_id)
+{       
+        char dname[MAX_OCLINFO_STRING_LEN];
+        
+        HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_NAME,
+                sizeof(dname), dname, NULL),
+                "Error querying CL_DEVICE_NAME");
+
+        if gpu(device_info) {
+            
+                if (gpu_amd(device_info) && (
+                    strstr(dname, "Cedar") ||
+                    strstr(dname, "Redwood") ||
+                    strstr(dname, "Juniper") ||
+                    strstr(dname, "Cypress") ||
+                    strstr(dname, "Hemlock") ||
+                    strstr(dname, "Caicos") ||
+                    strstr(dname, "Turks") ||
+                    strstr(dname, "Barts") ||
+                    strstr(dname, "Cayman") ||
+                    strstr(dname, "Antilles") ||
+                    strstr(dname, "Wrestler") ||
+                    strstr(dname, "Zacate") ||
+                    strstr(dname, "WinterPark") ||
+                    strstr(dname, "BeaverCreek"))) {
+
+                        if (strstr(dname, "Cayman") ||
+                            strstr(dname, "Antilles"))
+                                return AMD_VLIW4;
+                        else
+                                return AMD_VLIW5;
+
+                } else
+                        return AMD_GCN + AMD_VLIW5;    
+        }
+        return UNKNOWN;
+}
+
+int get_vendor_id(int dev_id)
+{
+        char dname[MAX_OCLINFO_STRING_LEN];
+        
+        HANDLE_CLERROR(clGetDeviceInfo(devices[dev_id], CL_DEVICE_VENDOR, 
+                sizeof(dname), dname, NULL),
+                "Error querying CL_DEVICE_VENDOR");
+
+        if (strstr (dname, "NVIDIA") != NULL) 
+            return NVIDIA;
+
+        if (strstr (dname, "Advanced Micro") !=NULL ||
+            strstr (dname, "AMD") !=NULL ||
+            strstr (dname, "ATI") != NULL)  
+            return AMD;
+        
+        return UNKNOWN;
+}
+
 char *get_error_name(cl_int cl_error)
 {
 	static char *err_1[] =
@@ -264,7 +397,6 @@ char *megastring(unsigned long long value)
 	return outbuf;
 }
 
-#define MAX_OCLINFO_STRING_LEN	64
 void listOpenCLdevices(void) {
 	char dname[MAX_OCLINFO_STRING_LEN];
 	cl_uint num_platforms, num_devices, entries;
@@ -334,7 +466,10 @@ void listOpenCLdevices(void) {
 			clGetDeviceInfo(devices[d], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &p_size, NULL);
 			printf("\tMax Work Group Size:\t%d\n", (int)p_size);
 			clGetDeviceInfo(devices[d], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &entries, NULL);
-			printf("\tParallel compute cores:\t%d\n\n", entries);
+			printf("\tParallel compute cores:\t%d\n", entries);
+                        
+                        opencl_get_dev_info(d);
+                        printf("\tStream processors:\t%d\n\n", get_processors_count(d));
 		}
 	}
 	return;
diff --git a/src/common-opencl.h b/src/common-opencl.h
index 81b79b1..3ca7ec8 100644
--- a/src/common-opencl.h
+++ b/src/common-opencl.h
@@ -17,6 +17,7 @@
 #define MAXGPUS	8
 #define MAX_PLATFORMS	8
 #define SUBSECTION_OPENCL	":OpenCL"
+#define MAX_OCLINFO_STRING_LEN	64
 
 /* Comment if you do not want to see OpenCL warnings during kernel compilation */
 #define REPORT_OPENCL_WARNINGS
@@ -33,13 +34,44 @@ cl_kernel crypt_kernel;
 size_t local_work_size;
 size_t max_group_size;
 
+cl_int oclGetDevCap(cl_device_id device, cl_int *iComputeCapMajor, cl_int *iComputeCapMinor);
+
+void opencl_init_dev(unsigned int dev_id, unsigned int platform_id);
 void opencl_init(char *kernel_filename, unsigned int dev_id,
                  unsigned int platform_id);
+void opencl_build_kernel(char *kernel_filename, unsigned int dev_id);
 
+int get_device_info();
+cl_device_type get_device_type(int dev_id);
 cl_ulong get_local_memory_size(int dev_id);
 size_t get_max_work_group_size(int dev_id);
+size_t get_current_work_group_size(int dev_id, cl_kernel crypt_kernel);
 cl_uint get_max_compute_units(int dev_id);
-cl_device_type get_device_type(int dev_id);
+cl_uint get_processors_count(int dev_id);
+cl_uint get_processor_family(int dev_id);
+int get_vendor_id(int dev_id);
+
+#define UNKNOWN                 0
+#define CPU                     1
+#define GPU                     2
+#define ACCELERATOR             4
+#define AMD                     64
+#define NVIDIA                  128
+#define INTEL                   256
+#define AMD_GCN                 1024
+#define AMD_VLIW4               2048
+#define AMD_VLIW5               4096 
+        
+#define cpu(n)                  ((n & CPU) == (CPU))
+#define gpu(n)                  ((n & GPU) == (GPU))
+#define gpu_amd(n)              ((n & AMD) && gpu(n))
+#define gpu_amd_64(n)           (0)
+#define gpu_nvidia(n)           ((n & NVIDIA) && gpu(n))
+#define gpu_intel(n)            ((n & INTEL) && gpu(n))
+#define cpu_amd(n)              ((n & AMD) && cpu(n))
+#define amd_gcn(n)              ((n & AMD_GCN) && gpu_amd(n))
+#define amd_vliw4(n)            ((n & AMD_VLIW4) && gpu_amd(n))
+#define amd_vliw5(n)            ((n & AMD_VLIW5) && gpu_amd(n))
 
 char *get_error_name(cl_int cl_error);
 
diff --git a/src/opencl/cryptsha512_kernel.cl b/src/opencl/cryptsha512_kernel.cl
index be0ddcd..012b43e 100644
--- a/src/opencl/cryptsha512_kernel.cl
+++ b/src/opencl/cryptsha512_kernel.cl
@@ -2,6 +2,8 @@
  * Developed by Claudio André <claudio.andre at correios.net.br> in 2012   
  * Based on source code provided by Lukas Odzioba
  *
+ * More information at http://openwall.info/wiki/john/OpenCL-SHA-512
+ *
  * This software is:
  * Copyright (c) 2011 Lukas Odzioba <lukas dot odzioba at gmail dot com> 
  * Copyright (c) 2012 Claudio André <claudio.andre at correios.net.br>
@@ -125,16 +127,10 @@ void sha512_block(__local sha512_ctx * ctx) {
     uint64_t t1, t2;
     uint64_t w[16];
 
-#ifdef DEVICE_IS_CPU
-    #pragma unroll 16
-    for (int i = 0; i < 16; i++)
-        w[i] = SWAP64(ctx->buffer->mem_64[i]);
-#else
     ulong16  w_vector;
     w_vector = vload16(0, ctx->buffer->mem_64); 
     w_vector = SWAP64(w_vector);
     vstore16(w_vector, 0, w);
-#endif
 
     #pragma unroll 80
     for (int i = 0; i < 80; i++) {
@@ -165,13 +161,24 @@ void sha512_block(__local sha512_ctx * ctx) {
 }
 
 void ctx_append_1(__local sha512_ctx * ctx) {
-    int i = 127 - ctx->buflen;
-    __local uint8_t * d = ctx->buffer->mem_08 + ctx->buflen;
+
+    int length = ctx->buflen;
+    int i = 127 - length;
+    __local uint8_t * d = ctx->buffer->mem_08 + length;
+    __local uint32_t * l;
 
     *d++ = 0x80;
 
-    while (i--) {
-        d[i] = 0;
+    while((++length % 4) != 0)
+    {
+	*d++ = 0;
+	i--;
+    }
+    l = (__local uint32_t*) d;
+
+    while (i > 0) {
+        i-= 4;
+        *l++ = 0;
     }
 }
 
@@ -204,10 +211,10 @@ void ctx_update(__local sha512_ctx * ctx,
 
 void clear_ctx_buffer(__local sha512_ctx * ctx) {
 
-    __local uint32_t *w = ctx->buffer->mem_32;
+    __local uint64_t *w = ctx->buffer->mem_64;
 
-    #pragma unroll 32
-    for (int i = 0; i < 32; i++)
+    //#pragma unroll 16
+    for (int i = 0; i < 16; i++)
         w[i] = 0;
 
     ctx->buflen = 0;
@@ -302,7 +309,7 @@ void sha512crypt(__local  working_memory    * fast_tmp_memory,
             ctx_update(&ctx, p_sequence->mem_08, passlen);
 
         ctx_update(&ctx, ((i & 1) != 0 ? alt_result->mem_08 : p_sequence->mem_08),
-                          ((i & 1) != 0 ? 64 :                 passlen));
+                         ((i & 1) != 0 ? 64 :                 passlen));
         sha512_digest(&ctx, alt_result->mem_64);
     }
     //Send results to the host.
@@ -315,11 +322,14 @@ void sha512crypt(__local  working_memory    * fast_tmp_memory,
 #undef rounds   
 #undef pass
 
-__kernel void kernel_crypt(__constant crypt_sha512_salt     * informed_salt,
-                           __global   crypt_sha512_password * pass_data,
-                           __global   crypt_sha512_hash   * out_buffer,
-                           __local    crypt_sha512_salt   * salt_data,
-                           __local    working_memory      * fast_tmp_memory) {
+__kernel
+// __attribute__((vec_type_hint(ulong2)))		Not recognized.
+// __attribute__((reqd_work_group_size(32, 1, 1)))	No gain.
+void kernel_crypt(__constant crypt_sha512_salt     * informed_salt,
+                  __global   crypt_sha512_password * pass_data,
+                  __global   crypt_sha512_hash   * out_buffer,
+                  __local    crypt_sha512_salt   * salt_data,
+                  __local    working_memory      * fast_tmp_memory) {
 
     //Get the task to be done
     size_t gid = get_global_id(0);
@@ -365,6 +375,7 @@ __kernel void kernel_crypt(__constant crypt_sha512_salt     * informed_salt,
 *   5%   Remove some unecessary code.
 *  ###   Move almost everything to global and local memory. BAD.
 *   1%   Use vector types in SHA_Block in some variables. 
+*   5%   Use bitselect in SHA_Block. 
 *
 * Conclusions
 * - Compare on GPU: CPU is more efficient for now.
@@ -376,4 +387,4 @@ __kernel void kernel_crypt(__constant crypt_sha512_salt     * informed_salt,
 *   so should not expect big gains anyway.
 *   If i have a lot of memory, i might solve more than one hash at once 
 *   (and use more vectors). But it is not possible (at least for a while).
-***/
\ No newline at end of file
+***/
diff --git a/src/opencl_cryptsha512.h b/src/opencl_cryptsha512.h
index 7799c0e..2409bbf 100644
--- a/src/opencl_cryptsha512.h
+++ b/src/opencl_cryptsha512.h
@@ -2,6 +2,8 @@
  * Developed by Claudio André <claudio.andre at correios.net.br> in 2012   
  * Based on source code provided by Lukas Odzioba
  *
+ * More information at http://openwall.info/wiki/john/OpenCL-SHA-512
+ * 
  * This software is:
  * Copyright (c) 2011 Lukas Odzioba <lukas dot odzioba at gmail dot com> 
  * Copyright (c) 2012 Claudio André <claudio.andre at correios.net.br>
@@ -14,6 +16,29 @@
 #ifndef _CRYPTSHA512_H 
 #define _CRYPTSHA512_H
 
+//Copied from common-opencl.h
+#define UNKNOWN                 0
+#define CPU                     1
+#define GPU                     2
+#define ACCELERATOR             4
+#define AMD                     64
+#define NVIDIA                  128
+#define INTEL                   256
+#define AMD_GCN                 1024
+#define AMD_VLIW4               2048
+#define AMD_VLIW5               4096
+        
+#define cpu(n)                  ((n & CPU) == (CPU))
+#define gpu(n)                  ((n & GPU) == (GPU))
+#define gpu_amd(n)              ((n & AMD) && gpu(n))
+#define gpu_amd_64(n)           (0)
+#define gpu_nvidia(n)           ((n & NVIDIA) && gpu(n))
+#define gpu_intel(n)            ((n & INTEL) && gpu(n))
+#define cpu_amd(n)              ((n & AMD) && cpu(n))
+#define amd_gcn(n)              ((n & AMD_GCN) && gpu_amd(n))
+#define amd_vliw4(n)            ((n & AMD_VLIW4) && gpu_amd(n))
+#define amd_vliw5(n)            ((n & AMD_VLIW5) && gpu_amd(n))
+
 //Type names definition. 
 #define uint8_t  unsigned char
 #define uint16_t unsigned short
@@ -24,6 +49,7 @@
 #define MAX(x,y)                ((x) > (y) ? (x) : (y))
 #define MIN(x,y)                ((x) < (y) ? (x) : (y))
 
+//Constants.
 #define ROUNDS_DEFAULT          5000
 #define ROUNDS_MIN              1000
 #define ROUNDS_MAX              999999999
@@ -31,16 +57,31 @@
 #define SALT_SIZE               16
 #define PLAINTEXT_LENGTH        16
 #define BINARY_SIZE             (3+16+86)       ///TODO: Magic number?
+#define STEP	                512
 
-#define KEYS_PER_CORE_CPU       512
-#define KEYS_PER_CORE_GPU       1024
+#define KEYS_PER_CORE_CPU       128
+#define KEYS_PER_CORE_GPU       512
 #define MIN_KEYS_PER_CRYPT	128
-#define MAX_KEYS_PER_CRYPT	2048*2048*128
+#define MAX_KEYS_PER_CRYPT	2048*1024
+
+//Macros.
+#if gpu_amd_64(DEVICE_INFO)
+	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+	#define ror(x, n) 	amd_bitalign(x, x, (uint64_t) n)
+	#define Ch(x, y, z) 	amd_bytealign(x, y, z)
+	#define Maj(x, y, z) 	amd_bytealign(z ^ x, y, x )
+#elif gpu_amd(DEVICE_INFO)
+	#define Ch(x,y,z)	bitselect(z, y, x)
+	#define Maj(x,y,z)      bitselect(x, y, z ^ x)
+	#define ror(x, n) 	rotate(x, (uint64_t) 64-n)
+#elif gpu_nvidia(DEVICE_INFO)
+        #pragma OPENCL EXTENSION cl_nv_pragma_unroll : enable
+#else
+	#define Ch(x,y,z)	((x & y) ^ ( (~x) & z))
+	#define Maj(x,y,z)      ((x & y) ^ (x & z) ^ (y & z))
+        #define ror(x, n)       ((x >> n) | (x << (64-n)))
+#endif
 
-#define rol(x,n)                rotate(x, n) 
-#define ror(x,n)                rotate(x, (uint64_t) 64-n)
-#define Ch(x,y,z)               ((x & y) ^ ( (~x) & z))
-#define Maj(x,y,z)              ((x & y) ^ (x & z) ^ (y & z))
 #define Sigma0(x)               ((ror(x,28)) ^ (ror(x,34)) ^ (ror(x,39)))
 #define Sigma1(x)               ((ror(x,14)) ^ (ror(x,18)) ^ (ror(x,41)))
 #define sigma0(x)               ((ror(x,1))  ^ (ror(x,8))  ^ (x>>7))
@@ -93,4 +134,4 @@ typedef struct {
         buffer_64               temp_result[8];
         buffer_64               p_sequence[8];
 } working_memory;
-#endif
\ No newline at end of file
+#endif
diff --git a/src/opencl_cryptsha512_fmt.c b/src/opencl_cryptsha512_fmt.c
index e08e08d..f10e431 100644
--- a/src/opencl_cryptsha512_fmt.c
+++ b/src/opencl_cryptsha512_fmt.c
@@ -2,6 +2,8 @@
  * Developed by Claudio André <claudio.andre at correios.net.br> in 2012   
  * Based on source code provided by Samuele Giovanni Tonon
  *
+ * More information at http://openwall.info/wiki/john/OpenCL-SHA-512
+ * 
  * Copyright (c) 2011 Samuele Giovanni Tonon <samu at linuxasylum dot net>
  * Copyright (c) 2012 Claudio André <claudio.andre at correios.net.br>
  * This program comes with ABSOLUTELY NO WARRANTY; express or implied .
@@ -11,7 +13,7 @@
  */
 
 #include <string.h>
-#include "common-opencl.h"  
+#include "common-opencl.h"
 #include "config.h"
 #include "opencl_cryptsha512.h"
 
@@ -50,26 +52,14 @@ static struct fmt_tests tests[] = {
     {NULL}
 }; 
 
-size_t get_current_work_group_size(int dev_id)
-{
-        size_t max_group_size;
-
-        HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[dev_id],
-                       CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_group_size), 
-                       &max_group_size, NULL),
-                       "Error querying clGetKernelWorkGroupInfo");
-
-        return max_group_size;
-}
-
 /* ------- Helper functions ------- */
 unsigned int get_task_max_work_group_size(){
     unsigned int max_available;
     max_available = get_local_memory_size(gpu_id) /
             (sizeof(working_memory) + sizeof(crypt_sha512_salt));
 
-    if (max_available > get_current_work_group_size(gpu_id))
-        return get_current_work_group_size(gpu_id);
+    if (max_available > get_current_work_group_size(gpu_id, crypt_kernel))
+        return get_current_work_group_size(gpu_id, crypt_kernel);
     
     return max_available;
 }
@@ -77,8 +67,8 @@ unsigned int get_task_max_work_group_size(){
 unsigned int get_task_max_size(){ 
     unsigned int max_available;
     max_available = get_max_compute_units(gpu_id);
-
-    if (get_device_type(gpu_id) == CL_DEVICE_TYPE_CPU)
+            
+    if (cpu(get_device_info()))
         return max_available * KEYS_PER_CORE_CPU;
     
     return max_available * KEYS_PER_CORE_GPU;
@@ -165,6 +155,50 @@ static void release_clobj(void) {
     HANDLE_CLERROR(ret_code, "Error Releasing pinned_partial_hashes");
 }
 
+/* ------- Salt functions ------- */
+static void *get_salt(char *ciphertext) {
+    int end = 0, i, len = strlen(ciphertext);
+    for (i = len - 1; i >= 0; i--)
+        if (ciphertext[i] == '$') {
+            end = i;
+            break;
+        }
+
+    static unsigned char ret[50];
+    for (i = 0; i < end; i++)
+        ret[i] = ciphertext[i];
+    ret[end] = 0;
+    return (void *) ret;
+}
+
+static void set_salt(void *salt_info) {    
+    int len = strlen(salt_info);
+    unsigned char offset = 0;
+    static char currentsalt[64];
+    
+    memcpy(currentsalt, (char *) salt_info, len + 1);
+    salt.rounds = ROUNDS_DEFAULT;
+
+    if (strncmp((char *) "$6$", (char *) currentsalt, 3) == 0)
+        offset += 3;
+
+    if (strncmp((char *) currentsalt + offset, (char *) "rounds=", 7) == 0) {
+        const char *num = currentsalt + offset + 7;
+        char *endp;
+        unsigned long int srounds = strtoul(num, &endp, 10);
+
+        if (*endp == '$') {
+            endp += 1;
+            salt.rounds =
+                    MAX(ROUNDS_MIN, MIN(srounds, ROUNDS_MAX));
+        }
+        offset = endp - currentsalt;
+    }
+    memcpy(salt.salt, currentsalt + offset, SALT_SIZE);
+    salt.length = strlen((char *) salt.salt);
+    salt.length = (salt.length > SALT_SIZE ? SALT_SIZE : salt.length);
+}
+
 /* ------- Key functions ------- */
 static void set_key(char *key, int index) {
     int len = strlen(key);
@@ -192,7 +226,7 @@ static char *get_key(int index) {
 -- */
 static void find_best_workgroup(void) {
     cl_event myEvent;
-    cl_ulong startTime, endTime, kernelExecTimeNs = CL_ULONG_MAX;
+    cl_ulong startTime, endTime, min_time = CL_ULONG_MAX;
     size_t my_work_group = 1;
     cl_int ret_code;
     int i;
@@ -204,41 +238,54 @@ static void find_best_workgroup(void) {
     HANDLE_CLERROR(ret_code, "Failed in clCreateCommandQueue");    
     printf("Max Group Work Size %d ", (int) max_group_size);
     local_work_size = 1;
-
+    max_group_size = get_task_max_work_group_size();
+    
+    // Set salt.
+    set_salt("$6$saltstring$");
+    
     // Set keys
-    for (i = 0; i < get_task_max_size(); i++) {
+    for (i = 0; i < max_keys_per_crypt; i++) {
         set_key("aaabaabaaa", i);
     }
-    HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_TRUE, 0,
+    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, salt_buffer, CL_TRUE, 0,
             sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL),
             "Failed in clEnqueueWriteBuffer I");
     HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_TRUE, 0, 
-            sizeof (crypt_sha512_password) * get_task_max_size(), 
+            sizeof (crypt_sha512_password) * max_keys_per_crypt, 
             plaintext, 0, NULL, NULL),
             "Failed in clEnqueueWriteBuffer II");
-
+    
+    if (cpu(get_device_info()))
+        my_work_group = 1;
+    
+    else
+        my_work_group = 16;
+                
     // Find minimum time
-    for (my_work_group = 1; (int) my_work_group <= (int) get_task_max_work_group_size(); 
+    for (; (int) my_work_group <= (int) max_group_size; 
          my_work_group *= 2) {
+        advance_cursor();
         ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 
                 1, NULL, &max_keys_per_crypt, &my_work_group, 0, NULL, &myEvent);
         HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
 
         if (ret_code != CL_SUCCESS) {
-            printf("Error %d\n", ret_code); ///Better commented by default.
-            break;
+
+            if (ret_code != CL_INVALID_WORK_GROUP_SIZE)
+                printf("Error %d\n", ret_code); 
+            continue;
         }
         //Get profile information
-        HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, 
+        HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START, 
                 sizeof (cl_ulong), &startTime, NULL),
                 "Failed in clGetEventProfilingInfo I");
         HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, 
                 sizeof (cl_ulong), &endTime, NULL),
                 "Failed in clGetEventProfilingInfo II");
         HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
-        
-        if ((endTime - startTime) < kernelExecTimeNs) {
-            kernelExecTimeNs = endTime - startTime;
+
+        if ((endTime - startTime) * 1.01 < min_time) {
+            min_time = endTime - startTime;
             local_work_size = my_work_group;
         }
     }
@@ -257,19 +304,28 @@ static void find_best_workgroup(void) {
 static void find_best_kpc(void) {
     size_t num;
     cl_event myEvent;
-    cl_ulong startTime, endTime, tmpTime;
-    cl_ulong kernelExecTimeNs = CL_ULONG_MAX;
+    cl_ulong startTime, endTime, run_time, min_time = CL_ULONG_MAX; 
     cl_int ret_code;
-    int optimal_kpc = MIN_KEYS_PER_CRYPT;
-    int i;
     cl_uint *tmpbuffer;
+    int optimal_kpc = MIN_KEYS_PER_CRYPT, i, step = STEP;
+    int do_benchmark = 0;
+    unsigned int SHAspeed, bestSHAspeed = 0; 
+    char *tmp_value; 
 
     printf("Calculating best keys per crypt, this will take a while ");
+
+    if ((tmp_value = getenv("STEP"))){
+	step = atoi(tmp_value);
+        do_benchmark = 1;
+    }
     
-    for (num = get_task_max_size(); (int) num > MIN_KEYS_PER_CRYPT; num -= 4096) {
+    for (num = step; num < MAX_KEYS_PER_CRYPT; num += step) {
         release_clobj();
         create_clobj(num);
-        advance_cursor();
+        
+        if (! do_benchmark)
+            advance_cursor();
+        
         tmpbuffer = malloc(sizeof (crypt_sha512_hash) * num);
         
         if (tmpbuffer == NULL) {
@@ -280,12 +336,15 @@ static void find_best_kpc(void) {
         queue_prof = clCreateCommandQueue(context[gpu_id], devices[gpu_id], 
                 CL_QUEUE_PROFILING_ENABLE, &ret_code);
         HANDLE_CLERROR(ret_code, "Failed in clCreateCommandQueue");
-
+        
+        // Set salt.
+        set_salt("$6$saltstring$");
+                
         // Set keys
         for (i = 0; i < num; i++) {
             set_key("aaabaabaaa", i);
         }
-        HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], salt_buffer, CL_FALSE, 0,
+        HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, salt_buffer, CL_FALSE, 0,
                 sizeof (crypt_sha512_salt), &salt, 0, NULL, NULL),
                 "Failed in clEnqueueWriteBuffer I");
         HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_FALSE, 0, 
@@ -309,19 +368,41 @@ static void find_best_kpc(void) {
                 sizeof (cl_ulong), &endTime, NULL),
                 "Failed in clGetEventProfilingInfo II");
         
+        free(tmpbuffer);
+        HANDLE_CLERROR(clReleaseCommandQueue(queue_prof), 
+            "Failed in clReleaseCommandQueue");
         HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
-        tmpTime = endTime - startTime;
 
-        if (((int) (((float) (tmpTime) / num) * 10)) <= kernelExecTimeNs) {
-            kernelExecTimeNs = ((int) (((float) (tmpTime) / num) * 10));
+        run_time = endTime - startTime;
+	SHAspeed = 5000 * num / (run_time / 1000000000.);
+
+        if (run_time < min_time)
+            min_time = run_time;
+
+        if (do_benchmark) {
+            fprintf(stderr, "kpc: %6zu\t%4lu c/s%14u rounds/s%8.3f sec per crypt_all()",
+                    num, (long) (num / (run_time / 1000000000.)), SHAspeed, 
+                    (float) run_time / 1000000000.);
+
+            if (run_time > 10000000000) {
+                fprintf(stderr, " - too slow\n");
+                break;
+            }
+        } else {
+            if (run_time > min_time * 10)
+                break;
+        }
+        if (SHAspeed > (1.01 * bestSHAspeed)) {
+            if (do_benchmark)
+                fprintf(stderr, "+");
+            bestSHAspeed = SHAspeed;
             optimal_kpc = num;
         }
-        free(tmpbuffer);
-        HANDLE_CLERROR(clReleaseCommandQueue(queue_prof), 
-            "Failed in clReleaseCommandQueue");
+        if (do_benchmark)
+            fprintf(stderr, "\n");
     }
     printf("Optimal keys per crypt %d\n", optimal_kpc);
-    printf("to avoid this test on next run, put \""
+    printf("(to avoid this test on next run, put \""
         KPC_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS
         SUBSECTION_OPENCL "])\n", optimal_kpc);
     max_keys_per_crypt = optimal_kpc;
@@ -332,10 +413,17 @@ static void find_best_kpc(void) {
 /* ------- Initialization  ------- */
 static void init(struct fmt_main *pFmt) {
     char *tmp_value;
-    opencl_init("$JOHN/cryptsha512_kernel.cl", gpu_id, platform_id);
+    opencl_init_dev(gpu_id, platform_id);
+      
+    if (cpu(get_device_info()))
+        opencl_build_kernel("$JOHN/cryptsha512_CPU_kernel.cl", gpu_id);
+    
+    else
+        opencl_build_kernel("$JOHN/cryptsha512_kernel.cl", gpu_id);
+ 
     max_keys_per_crypt = get_task_max_size();
-    local_work_size = 0;
-
+    local_work_size = 32; //Default safe value.
+      
     // create kernel to execute
     crypt_kernel = clCreateKernel(program[gpu_id], "kernel_crypt", &ret_code);
     HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
@@ -379,76 +467,32 @@ static void init(struct fmt_main *pFmt) {
     }
     printf("Local work size (LWS) %d, Keys per crypt (KPC) %Zd\n",
            (int) local_work_size, max_keys_per_crypt);   
-    pFmt->params.max_keys_per_crypt = max_keys_per_crypt;     
+    pFmt->params.max_keys_per_crypt = max_keys_per_crypt;
 }
 
 /* ------- Check if the ciphertext if a valid SHA-512 crypt ------- */
 static int valid(char *ciphertext, struct fmt_main *pFmt) {
-	uint32_t i, j;
-	int len = strlen(ciphertext);
-
-	if (strncmp(ciphertext, "$6$", 3) != 0)
-		return 0;
-	char *p = strrchr(ciphertext, '$');
-	if (p == NULL)
-		return 0;
-	for (i = p - ciphertext + 1; i < len; i++) {
-		int found = 0;
-		for (j = 0; j < 64; j++)
-			if (itoa64[j] == ARCH_INDEX(ciphertext[i]))
-				found = 1;
-		if (found == 0) {
-			puts("not found");
-			return 0;
-		}
-	}
-	if (len - (p - ciphertext + 1) != 86)
-		return 0;
-	return 1;
-}
-
-/* ------- Salt functions ------- */
-static void *get_salt(char *ciphertext) {
-    int end = 0, i, len = strlen(ciphertext);
-    for (i = len - 1; i >= 0; i--)
-        if (ciphertext[i] == '$') {
-            end = i;
-            break;
-        }
-
-    static unsigned char ret[50];
-    for (i = 0; i < end; i++)
-        ret[i] = ciphertext[i];
-    ret[end] = 0;
-    return (void *) ret;
-}
+    uint32_t i, j;
+    int len = strlen(ciphertext);
 
-static void set_salt(void *salt_info) {    
-    int len = strlen(salt_info);
-    unsigned char offset = 0;
-    static char currentsalt[64];
-    
-    memcpy(currentsalt, (char *) salt_info, len + 1);
-    salt.rounds = ROUNDS_DEFAULT;
-
-    if (strncmp((char *) "$6$", (char *) currentsalt, 3) == 0)
-        offset += 3;
-
-    if (strncmp((char *) currentsalt + offset, (char *) "rounds=", 7) == 0) {
-        const char *num = currentsalt + offset + 7;
-        char *endp;
-        unsigned long int srounds = strtoul(num, &endp, 10);
-
-        if (*endp == '$') {
-            endp += 1;
-            salt.rounds =
-                    MAX(ROUNDS_MIN, MIN(srounds, ROUNDS_MAX));
-        }
-        offset = endp - currentsalt;
+    if (strncmp(ciphertext, "$6$", 3) != 0)
+            return 0;
+    char *p = strrchr(ciphertext, '$');
+    if (p == NULL)
+            return 0;
+    for (i = p - ciphertext + 1; i < len; i++) {
+            int found = 0;
+            for (j = 0; j < 64; j++)
+                    if (itoa64[j] == ARCH_INDEX(ciphertext[i]))
+                            found = 1;
+            if (found == 0) {
+                    puts("not found");
+                    return 0;
+            }
     }
-    memcpy(salt.salt, currentsalt + offset, SALT_SIZE);
-    salt.length = strlen((char *) salt.salt);
-    salt.length = (salt.length > SALT_SIZE ? SALT_SIZE : salt.length);
+    if (len - (p - ciphertext + 1) != 86)
+            return 0;
+    return 1;
 }
 
 /* ------- To binary functions ------- */
@@ -459,19 +503,19 @@ static int findb64(char c) {
 
 static void magic(char *crypt, unsigned char *alt) {
 #define _24bit_from_b64(I,B2,B1,B0) \
-  {\
-      unsigned char c1=findb64(crypt[I+0]);\
-      unsigned char c2=findb64(crypt[I+1]);\
-      unsigned char c3=findb64(crypt[I+2]);\
-      unsigned char c4=findb64(crypt[I+3]);\
-      unsigned int w=c4<<18|c3<<12|c2<<6|c1;\
-      unsigned char b2=w&0xff;w>>=8;\
-      unsigned char b1=w&0xff;w>>=8;\
-      unsigned char b0=w&0xff;w>>=8;\
-      alt[B2]=b0;\
-      alt[B1]=b1;\
-      alt[B0]=b2;\
-  }
+    {\
+        unsigned char c1=findb64(crypt[I+0]);\
+        unsigned char c2=findb64(crypt[I+1]);\
+        unsigned char c3=findb64(crypt[I+2]);\
+        unsigned char c4=findb64(crypt[I+3]);\
+        unsigned int w=c4<<18|c3<<12|c2<<6|c1;\
+        unsigned char b2=w&0xff;w>>=8;\
+        unsigned char b1=w&0xff;w>>=8;\
+        unsigned char b0=w&0xff;w>>=8;\
+        alt[B2]=b0;\
+        alt[B1]=b1;\
+        alt[B0]=b2;\
+    }
     _24bit_from_b64(0, 0, 21, 42);
     _24bit_from_b64(4, 22, 43, 1);
     _24bit_from_b64(8, 44, 2, 23);
@@ -557,7 +601,7 @@ static void crypt_all(int count) {
  
     //Do the work
     HANDLE_CLERROR(clFinish(queue[gpu_id]), "failed in clFinish");
-    new_keys = 0;
+    new_keys = 0;   
 }
 
 /* ------- Binary Hash functions group ------- */