diff --git a/Makefile.am b/Makefile.am index e82b72d2a5..0bbf65f42d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -72,7 +72,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_shavite512_alexis.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu x11/cuda_x11_echo_alexis.cu \ x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \ - x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu x13/cuda_x13_fugue512_alexis.cu \ + x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_hamsi512_alexis.cu x13/cuda_x13_fugue512.cu x13/cuda_x13_fugue512_alexis.cu \ x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x14_shabal512_alexis.cu x15/cuda_x15_whirlpool.cu \ x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \ @@ -199,4 +199,3 @@ scrypt/titan_kernel.o: scrypt/titan_kernel.cu skein.o: skein.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< - diff --git a/cuda_helper_alexis.h b/cuda_helper_alexis.h new file mode 100644 index 0000000000..841bd232b5 --- /dev/null +++ b/cuda_helper_alexis.h @@ -0,0 +1,729 @@ +#ifndef CUDA_HELPER_H +#define CUDA_HELPER_H + +#include <cuda.h> +#include <cuda_runtime.h> + +#ifdef __INTELLISENSE__ +/* reduce vstudio warnings (__byte_perm, blockIdx...) */ +#include <device_functions.h> +#include <device_launch_parameters.h> +#define __launch_bounds__(max_tpb, min_blocks) +#endif + +#include <stdbool.h> +#include <stdint.h> + +#ifndef UINT32_MAX +/* slackware needs that */ +#define UINT32_MAX UINT_MAX +#endif + +#ifndef MAX_GPUS +#define MAX_GPUS 16 +#endif + +extern "C" short device_map[MAX_GPUS]; +extern "C" long device_sm[MAX_GPUS]; + +extern int cuda_arch[MAX_GPUS]; + +// common functions +extern int cuda_get_arch(int thr_id); +extern void cuda_check_cpu_init(int thr_id, uint32_t threads); +extern void cuda_check_cpu_free(int thr_id); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); +extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce); +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); +extern __device__ __device_builtin__ void __syncthreads(void); +extern __device__ __device_builtin__ void __threadfence(void); + +#ifndef __CUDA_ARCH__ +// define blockDim and threadIdx for host +extern const dim3 blockDim; +extern const uint3 threadIdx; +#endif + +#ifndef SPH_C32 +#define SPH_C32(x) (x) +// #define SPH_C32(x) ((uint32_t)(x ## U)) +#endif + +#ifndef SPH_C64 +#define SPH_C64(x) (x) +// #define SPH_C64(x) ((uint64_t)(x ## ULL)) +#endif + +#ifndef SPH_T32 +#define SPH_T32(x) (x) +// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#endif + +#ifndef SPH_T64 +#define SPH_T64(x) (x) +// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#endif + +#if __CUDA_ARCH__ < 320 +// Host and Compute 3.0 +#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#define __ldg(x) (*(x)) +#else +// Compute 3.2+ +#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) +#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) +#endif + +#define AS_U32(addr) *((uint32_t*)(addr)) +#define AS_U64(addr) *((uint64_t*)(addr)) +#define AS_UINT2(addr) *((uint2*)(addr)) +#define AS_UINT4(addr) *((uint4*)(addr)) +#define AS_UL2(addr) *((ulonglong2*)(addr))
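+ +// Editor's sketch (an illustration added in review, not part of the original +// patch): the AS_* macros are raw reinterpret-casts, so the pointed-to data +// must meet the alignment of the wider type (16 bytes for uint4). A +// hypothetical helper that copies a 64-byte hash state with vector moves: +__device__ __forceinline__ +void copy_hash_64bytes(uint32_t *dst, const uint32_t *src){ + // four 16-byte moves instead of sixteen 4-byte ones; both pointers + // must be 16-byte aligned, otherwise the accesses are undefined + AS_UINT4(&dst[ 0]) = AS_UINT4(&src[ 0]); + AS_UINT4(&dst[ 4]) = AS_UINT4(&src[ 4]); + AS_UINT4(&dst[ 8]) = AS_UINT4(&src[ 8]); + AS_UINT4(&dst[12]) = AS_UINT4(&src[12]); +}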
+ +/*********************************************************************/ +// Macros to catch CUDA errors in CUDA runtime calls + +#define CUDA_SAFE_CALL(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \ + __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET(call) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return; \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET_X(call, ret) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return ret; \ + } \ +} while (0) + +/*********************************************************************/ + +__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI){ + return __double_as_longlong(__hiloint2double(HI, LO)); +// return (uint64_t)LO | (((uint64_t)HI) << 32); +} + +// replace the high word of a 64-bit value +__device__ __forceinline__ uint64_t REPLACE_HIDWORD(const uint64_t &x, const uint32_t &y) { + return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32U); +} + +// replace the low word of a 64-bit value +__device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uint32_t &y) { + return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); +} + +// endian swap for 32-bit values +#if defined(__CUDA_ARCH__) +__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x) +{ + /* device */ + return __byte_perm(x, x, 0x0123); +} +#else + /* host */ + #define cuda_swab32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#endif + +// extract the low word of a 64-bit value +__device__ __forceinline__ uint32_t _LODWORD(const uint64_t &x) { + return (uint32_t)__double2loint(__longlong_as_double(x)); +// return (uint32_t)(x & 0xFFFFFFFFULL); +} + +// extract the high word of a 64-bit value +__device__ __forceinline__ uint32_t _HIDWORD(const uint64_t &x) { + return (uint32_t)__double2hiint(__longlong_as_double(x)); +// return (uint32_t)(x >> 32); +} + + +__device__ __forceinline__ uint2 cuda_swab64_U2(uint2 a) +{ + // Input: 77665544 33221100 + // Output: 00112233 44556677 + uint2 result; + result.y = __byte_perm(a.x, 0, 0x0123); + result.x = __byte_perm(a.y, 0, 0x0123); + return result; +} + +#if defined(__CUDA_ARCH__) +__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) +{ + // Input: 77665544 33221100 + // Output: 00112233 44556677 + uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123); + return (result << 32) | __byte_perm(_HIDWORD(x), 0, 0x0123); +} +#else +/* host */ +#define cuda_swab64(x) \ + ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ + (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ + (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ + (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) +#endif + +// swap two uint32_t without extra registers +__device__ __host__ __forceinline__ void xchg(uint32_t &x, uint32_t &y) { + x ^= y; y = x ^ y; x ^= y; +}
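+ +// Editor's note (added in review, not part of the original patch): the XOR +// trick swaps without a temporary register, but only for two distinct +// lvalues; if x and y alias the same variable, the first x ^= y zeroes it: +// uint32_t a = 1, b = 2; +// xchg(a, b); // ok: a == 2, b == 1 +// xchg(a, a); // wrong: a becomes 0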
+// for other types... +#define XCHG(x, y) { x ^= y; y = x ^ y; x ^= y; } + +static __host__ __device__ __forceinline__ uint2 vectorize(uint64_t v) { + uint2 result; +#if defined(__CUDA_ARCH__) + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.x), "=r"(result.y) : "l"(v)); +#else + result.x = (uint32_t)(v); + result.y = (uint32_t)(v >> 32); +#endif + return result; +} + +static __host__ __device__ __forceinline__ uint64_t devectorize(uint2 v) { +#if defined(__CUDA_ARCH__) + return MAKE_ULONGLONG(v.x, v.y); +#else + return (((uint64_t)v.y) << 32) + v.x; +#endif +} + +#if defined(__CUDA_ARCH__) + // Compute 3.2+ + #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) + #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) +#else + // Host and Compute 3.0 + #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) + #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) + #define __ldg(x) (*(x)) +#endif + +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a){ + return __byte_perm(a, 0, 0x1032); +} +__device__ __forceinline__ +uint32_t ROL8(const uint32_t a){ + return __byte_perm(a, 0, 0x2103); +} +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a){ + return __byte_perm(a, 0, 0x0321); +} + +// device asm for whirlpool +__device__ __forceinline__ +uint64_t xor1(uint64_t a, uint64_t b) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b)); + return result; +} + +// device asm for whirlpool +__device__ __forceinline__ +uint64_t xor3(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t result; + asm("xor.b64 %0, %2, %3;\n\t" + "xor.b64 %0, %0, %1;\n\t" + /* output : input registers */ + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} + +// device asm for whirlpool +__device__ __forceinline__ +uint64_t xor5(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(d) ,"l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} + +__device__ __forceinline__ +uint64_t xor9(const uint64_t a, const uint64_t b, const uint64_t c, const uint64_t d, const uint64_t e, const uint64_t f, const uint64_t g, const uint64_t h,const uint64_t i) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(h) ,"l"(i)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(g)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} + +__device__ __forceinline__ +uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} + +static __device__ __forceinline__ uint2 xorswap32(uint2 u, uint2 v) +{ + uint2 result; + result.y = u.x ^ v.x; + result.x = u.y ^ v.y; + return result; +}
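+ +// Editor's sketch (added in review, not part of the original patch): the +// xorN helpers are plain multi-input XOR trees; the inline PTX only pins the +// evaluation order the compiler emits. A functionally identical plain-C +// reference for checking them against: +__device__ __forceinline__ +uint64_t xor5_ref(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e){ + return a ^ b ^ c ^ d ^ e; // same value xor5() computes +}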
+// device asm for x17 +__device__ __forceinline__ +uint64_t andor(const uint64_t a,const uint64_t b,const uint64_t c) +{ + uint64_t result; + asm("{\n\t" + ".reg .u64 m,n;\n\t" + "and.b64 m, %1, %2;\n\t" + " or.b64 n, %1, %2;\n\t" + "and.b64 %0, n, %3;\n\t" + " or.b64 %0, %0, m ;\n\t" + "}\n" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +// return ((a | b) & c) | (a & b); +} + +// device asm for x17 +__device__ __forceinline__ +uint64_t shr_u64(const uint64_t x, uint32_t n){ + uint64_t result; + asm ("shr.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); + return result; +// return x >> n; +} + +__device__ __forceinline__ +uint64_t shl_u64(const uint64_t x, uint32_t n){ + uint64_t result; + asm("shl.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); + return result; +// return x << n; +} + +__device__ __forceinline__ +uint32_t shr_u32(const uint32_t x,uint32_t n) { + uint32_t result; + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +// return x >> n; +} + +__device__ __forceinline__ +uint32_t shl_u32(const uint32_t x,uint32_t n) { + uint32_t result; + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +// return x << n; +} + +// 64-bit ROTATE RIGHT +#if defined(__CUDA_ARCH__) +/* the more involved sm >= 3.5 variant (accelerated by the funnel shifter), still to be benchmarked */ +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t value, const int offset) { + uint2 result; + const uint2 tmp = vectorize(value); + + if(offset == 8) { + result.x = __byte_perm(tmp.x, tmp.y, 0x4321); + result.y = __byte_perm(tmp.y, tmp.x, 0x4321); + } + else if(offset == 16) { + result.x = __byte_perm(tmp.x, tmp.y, 0x5432); + result.y = __byte_perm(tmp.y, tmp.x, 0x5432); + } + else if(offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.x), "r"(tmp.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.y), "r"(tmp.x), "r"(offset)); + } else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.y), "r"(tmp.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.x), "r"(tmp.y), "r"(offset)); + } + return devectorize(result); +} +#else +/* host */ +#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif + +// 64-bit ROTATE LEFT +#if defined(__CUDA_ARCH__) +__device__ __forceinline__ +uint64_t ROTL64(const uint64_t value, const int offset) { + uint2 result; + const uint2 tmp = vectorize(value); + if(offset == 8){ + result.x = __byte_perm(tmp.x, tmp.y, 0x2107); + result.y = __byte_perm(tmp.y, tmp.x, 0x2107); + } + else if(offset == 16) { + result.x = __byte_perm(tmp.x, tmp.y, 0x1076); + result.y = __byte_perm(tmp.y, tmp.x, 0x1076); + } + else if(offset == 24) { + result.x = __byte_perm(tmp.x, tmp.y, 0x0765); + result.y = __byte_perm(tmp.y, tmp.x, 0x0765); + } + else if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.x), "r"(tmp.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.y), "r"(tmp.x), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.y), "r"(tmp.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.x), "r"(tmp.y), "r"(offset)); + } + return devectorize(result); +} +#else +/* host */ +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif
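+ +// Editor's note (added in review, not part of the original patch): the +// byte-aligned offsets (8, 16, 24) are special-cased above because they can +// be expressed as one PRMT byte-permute per 32-bit half, which the author +// presumably measured as cheaper than the generic funnel-shift path. A +// portable reference both paths must agree with, for offsets 1..63: +static __host__ __device__ uint64_t rotr64_ref(uint64_t x, int n){ + return (x >> n) | (x << (64 - n)); +} + +__device__ __forceinline__ +uint64_t SWAPDWORDS(uint64_t value){ + uint2 temp; + asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value)); + asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : 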
"r"(temp.y), "r"(temp.x)); + return value; +} + +__device__ __forceinline__ +uint2 SWAPDWORDS2(uint2 value){ + return make_uint2(value.y, value.x); +} + +/* lyra2/bmw - uint2 vector's operators */ + +__device__ __forceinline__ +uint2 SHL8(const uint2 a){ + uint2 result; + result.y = __byte_perm(a.y, a.x, 0x2107); + result.x = __byte_perm(a.x, 0, 0x2107); + + return result; +} + +__device__ __forceinline__ +void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) { +#if defined(__CUDA_ARCH__) + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(lo), "=r"(hi) : "l"(x)); +#else + lo = (uint32_t)(x); + hi = (uint32_t)(x >> 32); +#endif +} + +/** + * uint2 direct ops by c++ operator definitions + */ +static __device__ __forceinline__ uint2 operator^ (const uint2 a,const uint32_t b) { return make_uint2(a.x^ b, a.y); } +static __device__ __forceinline__ uint2 operator^ (const uint2 a,const uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } +static __device__ __forceinline__ uint2 operator& (const uint2 a,const uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); } +static __device__ __forceinline__ uint2 operator| (const uint2 a,const uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } +static __device__ __forceinline__ uint2 operator~ (const uint2 a) { return make_uint2(~a.x, ~a.y); } +static __device__ __forceinline__ void operator^= (uint2 &a,const uint2 b) { a = a ^ b; } + +static __device__ __forceinline__ uint2 operator+ (const uint2 a,const uint2 b) { +#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000 + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +#else + return vectorize(devectorize(a) + devectorize(b)); +#endif +} + +static __device__ __forceinline__ uint2 operator+ (const uint2 a,const uint64_t b) { + return vectorize(devectorize(a) + b); +} + +static __device__ __forceinline__ void operator+= (uint2 &a,const uint2 b) { a = a + b; } + +static __device__ __forceinline__ uint2 operator- (const uint2 a,const uint2 b) { +#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000 + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +#else + return vectorize(devectorize(a) - devectorize(b)); +#endif +} +static __device__ __forceinline__ void operator-= (uint2 &a,const uint2 b) { a = a - b; } + +static __device__ __forceinline__ uint2 operator+ (const uint2 a,const uint32_t b) +{ +#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000 + uint2 result; + asm("add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +#else + return vectorize(devectorize(a) + b); +#endif +} + +static __device__ __forceinline__ uint2 operator- (const uint2 a,const uint64_t b) { + return vectorize(devectorize(a) - b); +} +static __device__ __forceinline__ uint2 operator- (const uint2 a,const uint32_t b) +{ +#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000 + uint2 result; + asm("sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +#else + return vectorize(devectorize(a) - b); +#endif +} + +/** + * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b)) + * (what does uint64 "*" operator) + */ +static __device__ __forceinline__ uint2 operator* (const uint2 
a,const uint2 b){ + uint2 result; + asm("{\n\t" + "mul.lo.u32 %0,%2,%4; \n\t" + "mul.hi.u32 %1,%2,%4; \n\t" + "mad.lo.cc.u32 %1,%3,%4,%1; \n\t" + "madc.lo.u32 %1,%3,%5,%1; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + +// uint2 ROR/ROL methods +__device__ __forceinline__ +uint2 ROR2(const uint2 a, const uint32_t offset){ + uint2 result; +#if __CUDA_ARCH__ > 300 + if (offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } else /* if (offset < 64) */ { + /* offset SHOULD BE < 64 ! */ + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } +#else + if (!offset) + result = a; + else if (offset < 32) { + result.y = ((a.y >> offset) | (a.x << (32 - offset))); + result.x = ((a.x >> offset) | (a.y << (32 - offset))); + } else if (offset == 32) { + result.y = a.x; + result.x = a.y; + } else { + result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); + result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset))); + } +#endif + return result; +} + +__device__ __forceinline__ +uint2 ROL2(const uint2 a, const uint32_t offset) +{ + uint2 result; +#if __CUDA_ARCH__ > 300 + if (offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } +#else + if (!offset) + result = a; + else + result = ROR2(a, 64 - offset); +#endif + return result; +} + +__device__ __forceinline__ +uint2 SWAPUINT2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +/* Byte aligned Rotations (lyra2) */ +__device__ __forceinline__ +uint2 ROL8(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x2107); + return result; +} +__device__ __forceinline__ +uint2 ROR8(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x4321); + return result; +} +__device__ __forceinline__ +uint2 ROR16(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x5432); + result.y = __byte_perm(a.y, a.x, 0x5432); + return result; +} +__device__ __forceinline__ +uint2 ROL16(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x1076); + + return result; +} + +__device__ __forceinline__ +uint2 ROR24(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x6543); + return result; +} +__device__ __forceinline__ +uint2 ROL24(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x0765); + return result; +} +/* uint2 for bmw512 - to double check later */ + +__device__ __forceinline__ +static uint2 SHL2(const uint2 a,const uint32_t n) { + uint64_t result; + const uint64_t x = devectorize(a); + asm ("shl.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); + return vectorize(result); +} + +__device__ __forceinline__ +static uint2 SHR2(const uint2 a,const uint32_t n){ + + uint64_t 
result; + const uint64_t x = devectorize(a); + asm ("shr.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); + return vectorize(result); +} + +__device__ __forceinline__ +uint32_t xor3x(uint32_t a,uint32_t b,uint32_t c){ + uint32_t result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA + #else + result = a^b^c; + #endif + return result; +} + +__device__ __forceinline__ +uint2 xor3x(const uint2 a,const uint2 b,const uint2 c){ + uint2 result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA + #else + result = a^b^c; + #endif + return result; +} + +__device__ __forceinline__ +uint2 chi(const uint2 a,const uint2 b,const uint2 c){ //keccak - chi + uint2 result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) + asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) + #else + result = a ^ (~b) & c; + #endif + return result; +} +__device__ __forceinline__ +uint32_t chi(const uint32_t a,const uint32_t b,const uint32_t c){ //keccak - chi + uint32_t result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) + #else + result = a ^ (~b) & c; + #endif + return result; +} +__device__ __forceinline__ +uint32_t bfe(uint32_t x, uint32_t bit, uint32_t numBits) { + uint32_t ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(bit), "r"(numBits)); + return ret; + +} + +__device__ __forceinline__ +uint32_t bfi(uint32_t x, uint32_t a, uint32_t bit, uint32_t numBits) { + uint32_t ret; + asm("bfi.b32 %0, %1, %2, %3,%4;" : "=r"(ret) : "r"(x), "r"(a), "r"(bit), "r"(numBits)); + return ret; +} +#endif // #ifndef CUDA_HELPER_H + diff --git a/cuda_vectors_alexis.h b/cuda_vectors_alexis.h index 9ff5d6efc4..a799994449 100644 --- a/cuda_vectors_alexis.h +++ b/cuda_vectors_alexis.h @@ -10,7 +10,7 @@ #define __LDG_PTR "r" #endif -#include "cuda_helper.h" +#include "cuda_helper_alexis.h" #if __CUDA_ARCH__ < 320 && !defined(__ldg4) #define __ldg4(x) (*(x)) @@ -415,13 +415,13 @@ static __forceinline__ __device__ uint2x4 rotate2x4(const uint2x4 &vec4, uint32_ uint2x4 ret; asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x.x) : "r"(vec4.x.x), "r"(vec4.x.x), "r"(shift)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x.y) : "r"(vec4.x.y), "r"(vec4.x.y), "r"(shift)); - + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y.x) : "r"(vec4.y.x), "r"(vec4.y.x), "r"(shift)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y.y) : "r"(vec4.y.y), "r"(vec4.y.y), "r"(shift)); - + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z.x) : "r"(vec4.z.x), "r"(vec4.z.x), "r"(shift)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z.y) : "r"(vec4.z.y), "r"(vec4.z.y), "r"(shift)); - + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w.x) : "r"(vec4.w.x), "r"(vec4.w.x), "r"(shift)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w.y) : "r"(vec4.w.y), "r"(vec4.w.y), "r"(shift)); return ret; @@ -630,25 +630,5 @@ static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane) #endif } 
-#ifdef __CUDA_ARCH__ -__device__ __forceinline__ -uint32_t ROL8(const uint32_t a){ - return __byte_perm(a, 0, 0x2103); -} - -__device__ __forceinline__ -uint32_t ROR8(const uint32_t a){ - return __byte_perm(a, 0, 0x0321); -} - -__device__ __forceinline__ -uint32_t ROL16(const uint32_t a){ - return __byte_perm(a, 0, 0x1032); -} -#else - #define ROL8(u) ROTL32(u, 8) - #define ROR8(u) ROTR32(u, 8) - #define ROL16(u) ROTL32(u,16) -#endif #endif // #ifndef CUDA_LYRA_VECTOR_H diff --git a/x11/cuda_x11_aes_alexis.cuh b/x11/cuda_x11_aes_alexis.cuh index b3bc9a0333..b8a684d7d1 100644 --- a/x11/cuda_x11_aes_alexis.cuh +++ b/x11/cuda_x11_aes_alexis.cuh @@ -68,7 +68,7 @@ void aes_gpu_init128(uint32_t sharedMemory[4][256]) { /* each thread startup will fill 2 uint32 */ uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); - + sharedMemory[0][(threadIdx.x<<1) + 0] = temp.x; sharedMemory[0][(threadIdx.x<<1) + 1] = temp.y; sharedMemory[1][(threadIdx.x<<1) + 0] = ROL8(temp.x); @@ -85,7 +85,7 @@ void aes_gpu_init_lt_256(uint32_t sharedMemory[4][256]) if(threadIdx.x<128){ /* each thread startup will fill 2 uint32 */ uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); - + sharedMemory[0][(threadIdx.x<<1) + 0] = temp.x; sharedMemory[0][(threadIdx.x<<1) + 1] = temp.y; sharedMemory[1][(threadIdx.x<<1) + 0] = ROL8(temp.x); @@ -105,17 +105,17 @@ static void aes_round(const uint32_t sharedMemory[4][256],const uint32_t x0,cons y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - + y1^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; y0^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; y3^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; #ifdef INTENSIVE_GMF - y2^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); + y2^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); #else y2^= sharedMemory[3][__byte_perm(x1, 0, 0x4443)]; #endif - y0^= k0; + y0^= k0; y2^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); y1^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; @@ -135,13 +135,13 @@ static void aes_round_LDG(const uint32_t sharedMemory[4][256],const uint32_t x0, y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - + y1^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; y0^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; y3^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; - y2^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); + y2^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); - y0^= k0; + y0^= k0; y2^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); y1^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; @@ -161,7 +161,7 @@ static void aes_round(const uint32_t sharedMemory[4][256],const uint32_t x0,cons y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - + #ifdef INTENSIVE_GMF y1^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); #else @@ -170,7 +170,7 @@ static void aes_round(const uint32_t sharedMemory[4][256],const uint32_t x0,cons y0^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; y3^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; y2^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); - + y2^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)]; y1^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; y0^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; @@ -189,12 +189,12 @@ static void aes_round_LDG(const uint32_t sharedMemory[4][256],const uint32_t x0, y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; y2 
= sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); - + y1^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); y0^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; y3^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; y2^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); - + y2^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)]; y1^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; y0^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; @@ -206,7 +206,7 @@ static void aes_round_LDG(const uint32_t sharedMemory[4][256],const uint32_t x0, y0^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); } -__device__ __forceinline__ +__device__ __forceinline__ static void AES_2ROUND(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0){ uint32_t y0, y1, y2, y3; @@ -219,7 +219,7 @@ static void AES_2ROUND(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32 k0++; } -__device__ __forceinline__ +__device__ __forceinline__ static void AES_2ROUND_LDG(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0){ uint32_t y0, y1, y2, y3; diff --git a/x11/cuda_x11_echo_alexis.cu b/x11/cuda_x11_echo_alexis.cu index bbd2f1cc2f..8176d9a114 100644 --- a/x11/cuda_x11_echo_alexis.cu +++ b/x11/cuda_x11_echo_alexis.cu @@ -3,7 +3,7 @@ Provos Alexis - 2016 */ -#include "cuda_helper.h" +#include "cuda_helper_alexis.h" #include "cuda_vectors_alexis.h" #define INTENSIVE_GMF @@ -34,7 +34,7 @@ static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W, W[i +40] = t[1]; W[i +56] = t[2]; W[i +44] = W[i +28]; - + W[i +28] = W[i +12]; W[i +12] = t[3]; W[i +36] = W[i +52]; @@ -94,7 +94,7 @@ static void x11_echo512_gpu_hash_64_final_alexis(uint32_t threads, uint64_t *g_h uint32_t h[16]; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - + if (thread < threads){ const uint32_t *hash = (uint32_t*)&g_hash[thread<<3]; @@ -207,7 +207,7 @@ static void x11_echo512_gpu_hash_64_final_alexis(uint32_t threads, uint64_t *g_h W[48 + i + 8] = cdx ^ ab ^ d; W[48 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c; } - + for (int k = 1; k < 9; k++) echo_round_alexis(sharedMemory,W,k0); // Big Sub Words @@ -221,27 +221,27 @@ static void x11_echo512_gpu_hash_64_final_alexis(uint32_t threads, uint64_t *g_h // AES_2ROUND(sharedMemory,W[ 8], W[ 9], W[10], W[11], k0); aes_round(sharedMemory, W[ 8], W[ 9], W[10], W[11], k0, y0, y1, y2, y3); aes_round(sharedMemory, y0, y1, y2, y3, W[ 8], W[ 9], W[10], W[11]); - + // AES_2ROUND(sharedMemory,W[20], W[21], W[22], W[23], k0); aes_round(sharedMemory, W[20], W[21], W[22], W[23], k0, y0, y1, y2, y3); aes_round(sharedMemory, y0, y1, y2, y3, W[20], W[21], W[22], W[23]); // AES_2ROUND(sharedMemory,W[28], W[29], W[30], W[31], k0); aes_round(sharedMemory, W[28], W[29], W[30], W[31], k0, y0, y1, y2, y3); aes_round(sharedMemory, y0, y1, y2, y3, W[28], W[29], W[30], W[31]); - + // AES_2ROUND(sharedMemory,W[32], W[33], W[34], W[35], k0); aes_round(sharedMemory, W[32], W[33], W[34], W[35], k0, y0, y1, y2, y3); aes_round(sharedMemory, y0, y1, y2, y3, W[32], W[33], W[34], W[35]); // AES_2ROUND(sharedMemory,W[40], W[41], W[42], W[43], k0); aes_round(sharedMemory, W[40], W[41], W[42], W[43], k0, y0, y1, y2, y3); aes_round(sharedMemory, y0, y1, y2, y3, W[40], W[41], W[42], W[43]); - + aes_round(sharedMemory, W[52], W[53], W[54], W[55], k0, y0, y1, y2, y3); aes_round(sharedMemory, y0, y1, y2, y3, W[52], W[53], W[54], W[55]); // AES_2ROUND(sharedMemory,W[60], W[61], W[62], W[63], k0); 
aes_round(sharedMemory, W[60], W[61], W[62], W[63], k0, y0, y1, y2, y3); aes_round(sharedMemory, y0, y1, y2, y3, W[60], W[61], W[62], W[63]); - + uint32_t bc = W[22] ^ W[42]; uint32_t t2 = (bc & 0x80808080); W[ 6] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); @@ -295,10 +295,10 @@ static void x11_echo512_gpu_hash_64_alexis(uint32_t threads, uint32_t *g_hash) *(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]); *(uint2x4*)&h[ 8] = __ldg4((uint2x4*)&Hash[ 8]); - + *(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0]; *(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8]; - + __syncthreads(); const uint32_t P[48] = { diff --git a/x11/cuda_x11_shavite512_alexis.cu b/x11/cuda_x11_shavite512_alexis.cu index 38ad27fa4f..db931e5a83 100644 --- a/x11/cuda_x11_shavite512_alexis.cu +++ b/x11/cuda_x11_shavite512_alexis.cu @@ -2,7 +2,7 @@ Based on Tanguy Pruvot's repo Provos Alexis - 2016 */ -#include "cuda_helper.h" +#include "cuda_helper_alexis.h" #include "cuda_vectors_alexis.h" #define INTENSIVE_GMF @@ -131,7 +131,7 @@ void x11_shavite512_gpu_hash_64_alexis(const uint32_t threads, uint64_t *g_hash) *(uint2x4*)&r[ 0] = __ldg4((uint2x4*)&Hash[ 0]); *(uint2x4*)&r[ 8] = __ldg4((uint2x4*)&Hash[ 4]); __syncthreads(); - + *(uint2x4*)&p[ 0] = *(uint2x4*)&state[ 0]; *(uint2x4*)&p[ 2] = *(uint2x4*)&state[ 8]; r[16] = 0x80; r[17] = 0; r[18] = 0; r[19] = 0; @@ -231,8 +231,8 @@ void x11_shavite512_gpu_hash_64_alexis(const uint32_t threads, uint64_t *g_hash) /* round 3, 7, 11 */ round_3_7_11(sharedMemory,r,p,x); - - + + /* round 4, 8, 12 */ round_4_8_12(sharedMemory,r,p,x); @@ -272,7 +272,7 @@ void x11_shavite512_gpu_hash_64_alexis(const uint32_t threads, uint64_t *g_hash) x ^= *(uint4*)&r[28]; AES_ROUND_NOKEY(sharedMemory, &x); p[ 1] ^= x; - + *(uint4*)&r[ 0] ^= *(uint4*)&r[25]; x = p[ 3] ^ *(uint4*)&r[ 0]; AES_ROUND_NOKEY(sharedMemory, &x); @@ -379,7 +379,7 @@ void x11_shavite512_gpu_hash_64_alexis(const uint32_t threads, uint64_t *g_hash) /* round 3, 7, 11 */ round_3_7_11(sharedMemory,r,p,x); - + /* round 4, 8, 12 */ round_4_8_12(sharedMemory,r,p,x); diff --git a/x13/cuda_x13_fugue512_alexis.cu b/x13/cuda_x13_fugue512_alexis.cu index fad4f7883f..cccec877eb 100644 --- a/x13/cuda_x13_fugue512_alexis.cu +++ b/x13/cuda_x13_fugue512_alexis.cu @@ -1,12 +1,12 @@ /* * Quick and dirty addition of Fugue-512 for X13 - * + * * Built on cbuchner1's implementation, actual hashing code * heavily based on phm's sgminer * - * + * */ -#include "cuda_helper.h" +#include "cuda_helper_alexis.h" #include "miner.h" #include "cuda_vectors_alexis.h" /* @@ -15,7 +15,7 @@ * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2014-2016 phm, Provos Alexis - * + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software", to deal in the Software without restriction, including @@ -23,10 +23,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
@@ -253,11 +253,16 @@ void x13_fugue512_gpu_hash_64_alexis(uint32_t threads, uint64_t *g_hash) uint32_t B[ 9]; uint32_t Hash[16]; - + *(uint2x4*)&Hash[0] = __ldg4((uint2x4*)&hash[0]); *(uint2x4*)&Hash[8] = __ldg4((uint2x4*)&hash[8]); + + #pragma unroll 16 + for(int i = 0; i < 16; i++) + Hash[i] = cuda_swab32(Hash[i]); + __syncthreads(); - + S[ 0] = S[ 1] = S[ 2] = S[ 3] = S[ 4] = S[ 5] = S[ 6] = S[ 7] = S[ 8] = S[ 9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0; *(uint2x4*)&S[20] = *(uint2x4*)&c_S[ 0]; *(uint2x4*)&S[28] = *(uint2x4*)&c_S[ 8]; @@ -298,7 +303,7 @@ void x13_fugue512_gpu_hash_64_alexis(uint32_t threads, uint64_t *g_hash) S[ 4] = cuda_swab32(S[ 9]); S[ 5] = cuda_swab32(S[10]); S[ 6] = cuda_swab32(S[11]); S[ 7] = cuda_swab32(S[12]); S[ 8] = cuda_swab32(S[18]); S[ 9] = cuda_swab32(S[19]); S[10] = cuda_swab32(S[20]); S[11] = cuda_swab32(S[21]); S[12] = cuda_swab32(S[27]); S[13] = cuda_swab32(S[28]); S[14] = cuda_swab32(S[29]); S[15] = cuda_swab32(S[30]); - + *(uint2x4*)&hash[ 0] = *(uint2x4*)&S[ 0]; *(uint2x4*)&hash[ 8] = *(uint2x4*)&S[ 8]; } @@ -327,10 +332,10 @@ void x13_fugue512_gpu_hash_64_final_alexis(uint32_t threads,const uint32_t* __re uint32_t S[36]; uint32_t B[ 9]; uint32_t Hash[16]; - + *(uint2x4*)&Hash[0] = __ldg4((uint2x4*)&hash[0]); *(uint2x4*)&Hash[8] = __ldg4((uint2x4*)&hash[8]); - __syncthreads(); + __syncthreads(); S[ 0] = S[ 1] = S[ 2] = S[ 3] = S[ 4] = S[ 5] = S[ 6] = S[ 7] = S[ 8] = S[ 9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0; *(uint2x4*)&S[20] = *(uint2x4*)&c_S[ 0]; *(uint2x4*)&S[28] = *(uint2x4*)&c_S[ 8]; @@ -345,7 +350,7 @@ void x13_fugue512_gpu_hash_64_final_alexis(uint32_t threads,const uint32_t* __re for (int i = 0; i < 32; i++){ mROR3; CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); + SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); } #pragma unroll for (int i = 0; i < 12; i++) { @@ -373,12 +378,12 @@ void x13_fugue512_gpu_hash_64_final_alexis(uint32_t threads,const uint32_t* __re SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); S[ 3] = cuda_swab32(S[3]); S[ 4] = cuda_swab32(S[4]^S[ 0]); - + const uint64_t check = *(uint64_t*)&S[ 3]; if(check <= target){ uint32_t tmp = atomicExch(&resNonce[0], thread); if (tmp != UINT32_MAX) - resNonce[1] = tmp; + resNonce[1] = tmp; } } } diff --git a/x13/cuda_x13_hamsi512_alexis.cu b/x13/cuda_x13_hamsi512_alexis.cu new file mode 100644 index 0000000000..dc3b32be27 --- /dev/null +++ b/x13/cuda_x13_hamsi512_alexis.cu @@ -0,0 +1,299 @@ +/* + * Quick Hamsi-512 for X13 + * by tsiv - 2014 + * + * Provos Alexis - 2016 + */ + +#include "miner.h" +#include "cuda_helper_alexis.h" +#include "cuda_vectors_alexis.h" + +static __constant__ const uint32_t d_alpha_n[] = { + 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00, 0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa, + 0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0 +}; + +static __constant__ const uint32_t d_alpha_f[] = { + 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9, 0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0, + 0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 
0xf9c00ff0, 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c +}; + +static __constant__ const uint32_t c_c[] = { + 0x73746565, 0x6c706172, 0x6b204172, 0x656e6265, 0x72672031, 0x302c2062, 0x75732032, 0x3434362c, + 0x20422d33, 0x30303120, 0x4c657576, 0x656e2d48, 0x65766572, 0x6c65652c, 0x2042656c, 0x6769756d +}; + +static __constant__ const uint32_t d_T512[1024] = { + 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68, + 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68, 0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262, + 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5, 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f, + 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f, 0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da, + 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782, 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29, + 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29, 0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab, + 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4, 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, + 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, 0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526, + 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31, 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88, + 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88, 0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9, + 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a, 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320, + 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320, 0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a, + 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb, 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574, + 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574, 0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf, + 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758, + 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758, 0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9, + 0x0c720000, 0x49e50f00, 0x42790000, 
0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090, 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f, + 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f, 0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f, + 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df, 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e, + 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e, 0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1, + 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e, 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, + 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, 0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55, + 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b, 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254, + 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254, 0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f, + 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9, 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834, + 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834, 0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d, + 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80, 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7, + 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7, 0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257, + 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e, + 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e, 0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59, + 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce, 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173, + 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173, 0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd, + 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe, 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa, + 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa, 0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004, + 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab, 
0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, + 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, 0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0, + 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6, 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e, + 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e, 0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198, + 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b, 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e, + 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e, 0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55, + 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1, 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6, + 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6, 0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17, + 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce, + 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce, 0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888, + 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494, 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463, + 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463, 0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7, + 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f, 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691, + 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691, 0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e, + 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897, 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, + 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, 0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908, + 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de, 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539, + 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539, 0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7, + 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e, 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 
0x361ac33a, 0xea5a8d14, 0x2a2c18f0, + 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0, 0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e, + 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb, 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f, + 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f, 0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4, + 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000, + 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000, 0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe, + 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b, 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7, + 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7, 0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac, + 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69, 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e, + 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e, 0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7, + 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64, 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0, + 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0, 0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 +}; + +#define SBOX(a, b, c, d) { \ + uint32_t t; \ + t =(a); \ + a =(a & c) ^ d; \ + c =(c ^ b) ^ a; \ + d =(d | t) ^ b; \ + b = d; \ + d =((d | (t ^ c)) ^ a); \ + a&= b; \ + t^=(c ^ a); \ + b = b ^ d ^ t; \ + (a) = (c); \ + (c) = (b); \ + (b) = (d); \ + (d) = (~t); \ + } + +#define HAMSI_L(a, b, c, d) { \ + (a) = ROTL32(a, 13); \ + (c) = ROTL32(c, 3); \ + (b) ^= (a) ^ (c); \ + (d) ^= (c) ^ ((a) << 3); \ + (b) = ROTL32(b, 1); \ + (d) = ROTL32(d, 7); \ + (a) = ROTL32(a ^ b ^ d, 5); \ + (c) = ROTL32(c ^ d ^ (b<<7), 22); \ + } + +#define ROUND_BIG(rc, alpha) { \ + m[ 0] ^= alpha[ 0]; \ + c[ 4] ^= alpha[ 8]; \ + m[ 8] ^= alpha[16]; \ + c[12] ^= alpha[24]; \ + m[ 1] ^= alpha[ 1] ^ (rc); \ + c[ 5] ^= alpha[ 9]; \ + m[ 9] ^= alpha[17]; \ + c[13] ^= alpha[25]; \ + c[ 0] ^= alpha[ 2]; \ + m[ 4] ^= alpha[10]; \ + c[ 8] ^= alpha[18]; \ + m[12] ^= alpha[26]; \ + c[ 1] ^= alpha[ 3]; \ + m[ 5] ^= alpha[11]; \ + c[ 9] ^= alpha[19]; \ + m[13] ^= alpha[27]; \ + m[ 2] ^= alpha[ 4]; \ + c[ 6] ^= alpha[12]; \ + m[10] ^= alpha[20]; \ + c[14] ^= alpha[28]; \ + m[ 3] ^= alpha[ 5]; \ + c[ 7] ^= alpha[13]; \ + m[11] ^= alpha[21]; \ + c[15] ^= alpha[29]; \ + c[ 2] ^= alpha[ 6]; \ + m[ 6] ^= alpha[14]; \ + c[10] ^= alpha[22]; \ + m[14] ^= alpha[30]; \ + c[ 3] ^= alpha[ 7]; \ + m[ 7] ^= alpha[15]; \ + c[11] ^= alpha[23]; \ + m[15] ^= alpha[31]; \ + SBOX(m[ 0], c[ 4], m[ 8], c[12]); \ + 
SBOX(m[ 1], c[ 5], m[ 9], c[13]); \ + SBOX(c[ 0], m[ 4], c[ 8], m[12]); \ + SBOX(c[ 1], m[ 5], c[ 9], m[13]); \ + HAMSI_L(m[ 0], c[ 5], c[ 8], m[13]); \ + SBOX(m[ 2], c[ 6], m[10], c[14]); \ + HAMSI_L(m[ 1], m[ 4], c[ 9], c[14]); \ + SBOX(m[ 3], c[ 7], m[11], c[15]); \ + HAMSI_L(c[ 0], m[ 5], m[10], c[15]); \ + SBOX(c[ 2], m[ 6], c[10], m[14]); \ + HAMSI_L(c[ 1], c[ 6], m[11], m[14]); \ + SBOX(c[ 3], m[ 7], c[11], m[15]); \ + HAMSI_L(m[ 2], c[ 7], c[10], m[15]); \ + HAMSI_L(m[ 3], m[ 6], c[11], c[12]); \ + HAMSI_L(c[ 2], m[ 7], m[ 8], c[13]); \ + HAMSI_L(c[ 3], c[ 4], m[ 9], m[12]); \ + HAMSI_L(m[ 0], c[ 0], m[ 3], c[ 3]); \ + HAMSI_L(m[ 8], c[ 9], m[11], c[10]); \ + HAMSI_L(c[ 5], m[ 5], c[ 6], m[ 6]); \ + HAMSI_L(c[13], m[12], c[14], m[15]); \ + } + +__global__ __launch_bounds__(384,2) +void x13_hamsi512_gpu_hash_64_alexis(uint32_t threads, uint32_t *g_hash){ + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *Hash = &g_hash[thread<<4]; + uint8_t h1[64]; + *(uint2x4*)&h1[ 0] = *(uint2x4*)&Hash[0]; + *(uint2x4*)&h1[32] = *(uint2x4*)&Hash[8]; + + uint32_t c[16], h[16], m[16]; + *(uint16*)&c[ 0] = *(uint16*)&c_c[ 0]; + *(uint16*)&h[ 0] = *(uint16*)&c_c[ 0]; + + const uint32_t *tp; + uint32_t dm; + + for(int i = 0; i < 64; i += 8) { + tp = &d_T512[0]; + + dm = -(h1[i] & 1); + m[ 0] = dm & tp[ 0]; m[ 1] = dm & tp[ 1]; + m[ 2] = dm & tp[ 2]; m[ 3] = dm & tp[ 3]; + m[ 4] = dm & tp[ 4]; m[ 5] = dm & tp[ 5]; + m[ 6] = dm & tp[ 6]; m[ 7] = dm & tp[ 7]; + m[ 8] = dm & tp[ 8]; m[ 9] = dm & tp[ 9]; + m[10] = dm & tp[10]; m[11] = dm & tp[11]; + m[12] = dm & tp[12]; m[13] = dm & tp[13]; + m[14] = dm & tp[14]; m[15] = dm & tp[15]; + tp += 16; + #pragma unroll 7 + for (int v = 1; v < 8; v ++) { + dm = -((h1[i]>>v) & 1); + m[ 0] ^= dm & tp[ 0]; m[ 1] ^= dm & tp[ 1]; + m[ 2] ^= dm & tp[ 2]; m[ 3] ^= dm & tp[ 3]; + m[ 4] ^= dm & tp[ 4]; m[ 5] ^= dm & tp[ 5]; + m[ 6] ^= dm & tp[ 6]; m[ 7] ^= dm & tp[ 7]; + m[ 8] ^= dm & tp[ 8]; m[ 9] ^= dm & tp[ 9]; + m[10] ^= dm & tp[10]; m[11] ^= dm & tp[11]; + m[12] ^= dm & tp[12]; m[13] ^= dm & tp[13]; + m[14] ^= dm & tp[14]; m[15] ^= dm & tp[15]; + tp += 16; + } + #pragma unroll + for (int u = 1; u < 8; u ++) { + #pragma unroll 8 + for (int v = 0; v < 8; v ++) { + dm = -((h1[i+u]>>v) & 1); + m[ 0] ^= dm & tp[ 0]; m[ 1] ^= dm & tp[ 1]; + m[ 2] ^= dm & tp[ 2]; m[ 3] ^= dm & tp[ 3]; + m[ 4] ^= dm & tp[ 4]; m[ 5] ^= dm & tp[ 5]; + m[ 6] ^= dm & tp[ 6]; m[ 7] ^= dm & tp[ 7]; + m[ 8] ^= dm & tp[ 8]; m[ 9] ^= dm & tp[ 9]; + m[10] ^= dm & tp[10]; m[11] ^= dm & tp[11]; + m[12] ^= dm & tp[12]; m[13] ^= dm & tp[13]; + m[14] ^= dm & tp[14]; m[15] ^= dm & tp[15]; + tp += 16; + } + } + + #pragma unroll 6 + for( int r = 0; r < 6; r++ ) { + ROUND_BIG(r, d_alpha_n); + } + /* order is (no more) important */ + h[ 0] ^= m[ 0]; h[ 1] ^= m[ 1]; h[ 2] ^= c[ 0]; h[ 3] ^= c[ 1]; + h[ 4] ^= m[ 2]; h[ 5] ^= m[ 3]; h[ 6] ^= c[ 2]; h[ 7] ^= c[ 3]; + h[ 8] ^= m[ 8]; h[ 9] ^= m[ 9]; h[10] ^= c[ 8]; h[11] ^= c[ 9]; + h[12] ^= m[10]; h[13] ^= m[11]; h[14] ^= c[10]; h[15] ^= c[11]; + + *(uint16*)&c[ 0] = *(uint16*)&h[ 0]; + } + + *(uint2x4*)&m[ 0] = *(uint2x4*)&d_T512[112]; + *(uint2x4*)&m[ 8] = *(uint2x4*)&d_T512[120]; + + #pragma unroll 6 + for( int r = 0; r < 6; r++ ) { + ROUND_BIG(r, d_alpha_n); + } + + /* order is (no more) important */ + h[ 0] ^= m[ 0]; h[ 1] ^= m[ 1]; h[ 2] ^= c[ 0]; h[ 3] ^= c[ 1]; + h[ 4] ^= m[ 2]; h[ 5] ^= m[ 3]; h[ 6] ^= c[ 2]; h[ 7] ^= c[ 3]; + h[ 8] ^= m[ 8]; h[ 9] ^= m[ 9]; h[10] ^= c[ 8]; h[11] ^= c[ 9]; + h[12] ^= 
m[10]; h[13] ^= m[11]; h[14] ^= c[10]; h[15] ^= c[11]; + + *(uint16*)&c[ 0] = *(uint16*)&h[ 0]; + + *(uint2x4*)&m[ 0] = *(uint2x4*)&d_T512[784]; + *(uint2x4*)&m[ 8] = *(uint2x4*)&d_T512[792]; + + #pragma unroll 12 + for( int r = 0; r < 12; r++ ) + ROUND_BIG(r, d_alpha_f); + + /* order is (no more) important */ + h[ 0] ^= m[ 0]; h[ 1] ^= m[ 1]; h[ 2] ^= c[ 0]; h[ 3] ^= c[ 1]; + h[ 4] ^= m[ 2]; h[ 5] ^= m[ 3]; h[ 6] ^= c[ 2]; h[ 7] ^= c[ 3]; + h[ 8] ^= m[ 8]; h[ 9] ^= m[ 9]; h[10] ^= c[ 8]; h[11] ^= c[ 9]; + h[12] ^= m[10]; h[13] ^= m[11]; h[14] ^= c[10]; h[15] ^= c[11]; + + *(uint2x4*)&Hash[ 0] = *(uint2x4*)&h[ 0]; + *(uint2x4*)&Hash[ 8] = *(uint2x4*)&h[ 8]; + + #pragma unroll 16 + for(int i = 0; i < 16; i++) + Hash[i] = cuda_swab32(Hash[i]); + } +} + +__host__ +void x13_hamsi512_cpu_hash_64_alexis(int thr_id, uint32_t threads, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 384; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x13_hamsi512_gpu_hash_64_alexis<<<grid, block>>>(threads, d_hash); + +} diff --git a/x15/cuda_x14_shabal512_alexis.cu b/x15/cuda_x14_shabal512_alexis.cu index 75eaaaff92..a8a7226d0a 100644 --- a/x15/cuda_x14_shabal512_alexis.cu +++ b/x15/cuda_x14_shabal512_alexis.cu @@ -2,7 +2,7 @@ * Shabal-512 for X14/X15 * Provos Alexis - 2016 */ -#include "cuda_helper.h" +#include "cuda_helper_alexis.h" #include "cuda_vectors_alexis.h" /* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ @@ -45,7 +45,7 @@ __device__ __forceinline__ void PERM_ELT(uint32_t &xa0,const uint32_t xa1,uint32 #else tmp = (xb2 & ~xb3) ^ xm; #endif - + xa0 = ((xa0 ^ xc ^ (ROTL32(xa1, 15) * 5U)) * 3U) ^ xb1 ^ tmp; xb0 = xor3x(0xFFFFFFFF, xa0, ROTL32(xb0, 1)); } @@ -120,7 +120,7 @@ void x14_shabal512_gpu_hash_64_alexis(uint32_t threads, uint32_t *g_hash){ if (thread < threads){ uint32_t *Hash = &g_hash[thread<<4]; - + *(uint2x4*)&M[ 0] = __ldg4((uint2x4*)&Hash[ 0]); *(uint2x4*)&M[ 8] = __ldg4((uint2x4*)&Hash[ 8]); @@ -133,7 +133,7 @@ void x14_shabal512_gpu_hash_64_alexis(uint32_t threads, uint32_t *g_hash){ ADD_BLOCK(A,C); *(uint16*)&C[ 0]-= *(uint16*)&M[ 0]; // SWAP_BC; - + M[ 0] = 0x80; M[ 1] = M[ 2] = M[ 3] = M[ 4] = M[ 5] = M[ 6] = M[ 7] = M[ 8] = M[ 9] = M[10] = M[11] = M[12] = M[13] = M[14] = M[15] = 0; C[ 0]+= M[ 0]; @@ -196,7 +196,7 @@ void x14_shabal512_gpu_hash_64_final_alexis(uint32_t threads,const uint32_t* __r if (thread < threads){ const uint32_t *Hash = &g_hash[thread<<4]; - + *(uint2x4*)&M[ 0] = __ldg4((uint2x4*)&Hash[ 0]); *(uint2x4*)&M[ 8] = __ldg4((uint2x4*)&Hash[ 8]); @@ -209,7 +209,7 @@ void x14_shabal512_gpu_hash_64_final_alexis(uint32_t threads,const uint32_t* __r ADD_BLOCK(A,C); *(uint16*)&C[ 0]-= *(uint16*)&M[ 0]; // SWAP_BC; - + M[ 0] = 0x80; M[ 1] = M[ 2] = M[ 3] = M[ 4] = M[ 5] = M[ 6] = M[ 7] = M[ 8] = M[ 9] = M[10] = M[11] = M[12] = M[13] = M[14] = M[15] = 0; C[ 0]+= M[ 0]; @@ -240,11 +240,11 @@ void x14_shabal512_gpu_hash_64_final_alexis(uint32_t threads,const uint32_t* __r PERM_ELT(A[10], A[ 9], B[ 2], B[15], B[11], B[ 8], C[ 6], M[ 2]); PERM_ELT(A[11], A[10], B[ 3], B[ 0], B[12], B[ 9], C[ 5], M[ 3]); PERM_ELT(A[ 0], A[11], B[ 4], B[ 1], B[13], B[10], C[ 4], M[ 4]); PERM_ELT(A[ 1], A[ 0], B[ 5], B[ 2], B[14], B[11], C[ 3], M[ 5]); PERM_ELT(A[ 2], A[ 1], B[ 6], B[ 3], B[15], B[12], C[ 2], M[ 6]); PERM_ELT(A[ 3], A[ 2], B[ 7], B[ 4], B[ 0], B[13], C[ 1], M[ 7]); - + if(*(uint64_t*)&B[ 6] <= target){ uint32_t tmp = atomicExch(&resNonce[0], thread); if (tmp != UINT32_MAX) - resNonce[1] = tmp; + resNonce[1] = tmp; } } } diff --git 
a/x16r/cuda_x16r.h b/x16r/cuda_x16r.h index 13eb743288..8150ba5e5d 100644 --- a/x16r/cuda_x16r.h +++ b/x16r/cuda_x16r.h @@ -3,6 +3,7 @@ extern void x11_echo512_cpu_hash_64_alexis(int thr_id, uint32_t threads, uint32_t *d_hash); extern void x11_luffa512_cpu_hash_64_alexis(int thr_id, uint32_t threads,uint32_t *d_hash); extern void x11_shavite512_cpu_hash_64_alexis(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void x13_hamsi512_cpu_hash_64_alexis(int thr_id, uint32_t threads, uint32_t *d_hash); extern void x13_fugue512_cpu_hash_64_alexis(int thr_id, uint32_t threads, uint32_t *d_hash); extern void x14_shabal512_cpu_hash_64_alexis(int thr_id, uint32_t threads, uint32_t *d_hash); @@ -78,4 +79,3 @@ void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t void x16_sha512_setBlock_80(void *pdata); void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); - diff --git a/x16r/x16r.cu b/x16r/x16r.cu index dc44bfded9..b295426130 100644 --- a/x16r/x16r.cu +++ b/x16r/x16r.cu @@ -267,10 +267,7 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, x11_shavite512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput); // 64 x16_echo512_cuda_init(thr_id, throughput); - x13_hamsi512_cpu_init(thr_id, throughput); - x13_fugue512_cpu_init(thr_id, throughput); x16_fugue512_cpu_init(thr_id, throughput); - // x14_shabal512_cpu_init(thr_id, throughput); x15_whirlpool_cpu_init(thr_id, throughput, 0); x16_whirlpool512_init(thr_id, throughput); x17_sha512_cpu_init(thr_id, throughput); @@ -286,11 +283,6 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, ((uint32_t*)ptarget)[7] = 0x003f; ((uint32_t*)pdata)[1] = 0xEFCDAB89; ((uint32_t*)pdata)[2] = 0x67452301; - //((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64 - //((uint8_t*)pdata)[8] = 0xA0; // hashOrder[0] = 'A'; for echo 80 + blake512 64 - //((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64 - //((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64 - //((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64 } uint32_t _ALIGN(64) endiandata[20]; @@ -498,17 +490,15 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, TRACE("echo :"); break; case HAMSI: - x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_hamsi512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; TRACE("hamsi :"); break; case FUGUE: - //x13_fugue512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; - x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; TRACE("fugue :"); break; case SHABAL: - // x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x14_shabal512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; + x14_shabal512_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); order++; TRACE("shabal :"); break; case WHIRLPOOL: