Jay D Dee
2019-05-30 16:59:49 -04:00
parent eb3f57bfc7
commit 77c5ae80ab
82 changed files with 6906 additions and 3706 deletions

View File

@@ -17,7 +17,7 @@
#if !defined(ARGON2_NO_THREADS)
#include "thread.h"
#include "argon2d_thread.h"
#if defined(_WIN32)
#include <windows.h>
#endif

View File

@@ -30,7 +30,7 @@
#include <string.h>
#include "core.h"
#include "thread.h"
#include "argon2d_thread.h"
#include "../blake2/blake2.h"
#include "../blake2/blake2-impl.h"

View File

@@ -37,7 +37,7 @@
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY__ 1
#ifdef __SSE4_2__
//#ifdef __SSE4_2__
#ifdef __cplusplus
extern "C"{
@@ -57,19 +57,22 @@ extern "C"{
// Blake-256 4 way
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i H[8];
__m128i S[4];
unsigned char buf[64<<2];
uint32_t H[8<<2];
uint32_t S[4<<2];
// __m128i buf[16] __attribute__ ((aligned (64)));
// __m128i H[8];
// __m128i S[4];
size_t ptr;
sph_u32 T0, T1;
uint32_t T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blake_4way_small_context;
} blake_4way_small_context __attribute__ ((aligned (64)));
// Default 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *cc);
void blake256_4way(void *cc, const void *data, size_t len);
void blake256_4way_close(void *cc, void *dst);
void blake256_4way_init(void *ctx);
void blake256_4way(void *ctx, const void *data, size_t len);
void blake256_4way_close(void *ctx, void *dst);
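// Example usage (an illustrative sketch, not part of this commit). It
// assumes the caller has already interleaved four 64 byte inputs into
// 4 way vector form; vdata and vhash below are hypothetical buffers.
//
//   blake256_4way_context ctx;
//   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
//   blake256_4way_init( &ctx );
//   blake256_4way( &ctx, vdata, 64 );   // len is bytes per lane
//   blake256_4way_close( &ctx, vhash );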
// 14 rounds, blake, decred
typedef blake_4way_small_context blake256r14_4way_context;
@@ -132,12 +135,10 @@ void blake512_4way_close(void *cc, void *dst);
void blake512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#endif // AVX2
#ifdef __cplusplus
}
#endif
#endif
#endif
#endif // __BLAKE_HASH_4WAY__

View File

@@ -30,9 +30,10 @@
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined (__SSE4_2__)
//#if defined (__SSE4_2__)
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
@@ -60,26 +61,12 @@ extern "C"{
// Blake-256
static const sph_u32 IV256[8] = {
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
static const uint32_t IV256[8] =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
#if defined (__AVX2__)
// Blake-512
static const sph_u64 IV512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
#endif
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
// Blake-256 4 & 8 way, Blake-512 4 way
@@ -317,47 +304,6 @@ static const sph_u32 CS[16] = {
#endif
#if defined(__AVX2__)
// Blake-512 4 way
#define CBx(r, i) CBx_(Z ## r ## i)
#define CBx_(n) CBx__(n)
#define CBx__(n) CB ## n
#define CB0 SPH_C64(0x243F6A8885A308D3)
#define CB1 SPH_C64(0x13198A2E03707344)
#define CB2 SPH_C64(0xA4093822299F31D0)
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
#define CB4 SPH_C64(0x452821E638D01377)
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
#define CB7 SPH_C64(0x3F84D5B5B5470917)
#define CB8 SPH_C64(0x9216D5D98979FB1B)
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
#define CBB SPH_C64(0xB8E1AFED6A267E96)
#define CBC SPH_C64(0xBA7C9045F12C7F99)
#define CBD SPH_C64(0x24A19947B3916CF7)
#define CBE SPH_C64(0x0801F2E2858EFC16)
#define CBF SPH_C64(0x636920D871574E69)
#if SPH_COMPACT_BLAKE_64
// not used
static const sph_u64 CB[16] = {
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
};
#endif
#endif
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
@@ -411,125 +357,41 @@ do { \
#endif
#if defined (__AVX2__)
// Blake-256 8 way
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set1_epi32( c1 ), m0 ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set1_epi32( c0 ), m1 ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while (0)
#define ROUND_S_8WAY(r) do { \
GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
GS_8WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
GS_8WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
} while (0)
// Blake-512 4 way
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
} while (0)
#if SPH_COMPACT_BLAKE_64
// not used
#define ROUND_B_4WAY(r) do { \
GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
} while (0)
#else
//current_impl
#define ROUND_B_4WAY(r) do { \
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
GB_4WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
GB_4WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
#endif
#endif
// Blake-256 4 way
#define DECL_STATE32_4WAY \
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
__m128i S0, S1, S2, S3; \
sph_u32 T0, T1;
uint32_t T0, T1;
#define READ_STATE32_4WAY(state) do { \
H0 = (state)->H[0]; \
H1 = (state)->H[1]; \
H2 = (state)->H[2]; \
H3 = (state)->H[3]; \
H4 = (state)->H[4]; \
H5 = (state)->H[5]; \
H6 = (state)->H[6]; \
H7 = (state)->H[7]; \
S0 = (state)->S[0]; \
S1 = (state)->S[1]; \
S2 = (state)->S[2]; \
S3 = (state)->S[3]; \
H0 = casti_m128i( state->H, 0 ); \
H1 = casti_m128i( state->H, 1 ); \
H2 = casti_m128i( state->H, 2 ); \
H3 = casti_m128i( state->H, 3 ); \
H4 = casti_m128i( state->H, 4 ); \
H5 = casti_m128i( state->H, 5 ); \
H6 = casti_m128i( state->H, 6 ); \
H7 = casti_m128i( state->H, 7 ); \
S0 = casti_m128i( state->S, 0 ); \
S1 = casti_m128i( state->S, 1 ); \
S2 = casti_m128i( state->S, 2 ); \
S3 = casti_m128i( state->S, 3 ); \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE32_4WAY(state) do { \
(state)->H[0] = H0; \
(state)->H[1] = H1; \
(state)->H[2] = H2; \
(state)->H[3] = H3; \
(state)->H[4] = H4; \
(state)->H[5] = H5; \
(state)->H[6] = H6; \
(state)->H[7] = H7; \
(state)->S[0] = S0; \
(state)->S[1] = S1; \
(state)->S[2] = S2; \
(state)->S[3] = S3; \
casti_m128i( state->H, 0 ) = H0; \
casti_m128i( state->H, 1 ) = H1; \
casti_m128i( state->H, 2 ) = H2; \
casti_m128i( state->H, 3 ) = H3; \
casti_m128i( state->H, 4 ) = H4; \
casti_m128i( state->H, 5 ) = H5; \
casti_m128i( state->H, 6 ) = H6; \
casti_m128i( state->H, 7 ) = H7; \
casti_m128i( state->S, 0 ) = S0; \
casti_m128i( state->S, 1 ) = S1; \
casti_m128i( state->S, 2 ) = S2; \
casti_m128i( state->S, 3 ) = S3; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
@@ -616,30 +478,30 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm128_bswap_32( * buf ); \
M1 = mm128_bswap_32( *(buf+1) ); \
M2 = mm128_bswap_32( *(buf+2) ); \
M3 = mm128_bswap_32( *(buf+3) ); \
M4 = mm128_bswap_32( *(buf+4) ); \
M5 = mm128_bswap_32( *(buf+5) ); \
M6 = mm128_bswap_32( *(buf+6) ); \
M7 = mm128_bswap_32( *(buf+7) ); \
M8 = mm128_bswap_32( *(buf+8) ); \
M9 = mm128_bswap_32( *(buf+9) ); \
MA = mm128_bswap_32( *(buf+10) ); \
MB = mm128_bswap_32( *(buf+11) ); \
MC = mm128_bswap_32( *(buf+12) ); \
MD = mm128_bswap_32( *(buf+13) ); \
ME = mm128_bswap_32( *(buf+14) ); \
MF = mm128_bswap_32( *(buf+15) ); \
M0 = mm128_bswap_32( buf[ 0] ); \
M1 = mm128_bswap_32( buf[ 1] ); \
M2 = mm128_bswap_32( buf[ 2] ); \
M3 = mm128_bswap_32( buf[ 3] ); \
M4 = mm128_bswap_32( buf[ 4] ); \
M5 = mm128_bswap_32( buf[ 5] ); \
M6 = mm128_bswap_32( buf[ 6] ); \
M7 = mm128_bswap_32( buf[ 7] ); \
M8 = mm128_bswap_32( buf[ 8] ); \
M9 = mm128_bswap_32( buf[ 9] ); \
MA = mm128_bswap_32( buf[10] ); \
MB = mm128_bswap_32( buf[11] ); \
MC = mm128_bswap_32( buf[12] ); \
MD = mm128_bswap_32( buf[13] ); \
ME = mm128_bswap_32( buf[14] ); \
MF = mm128_bswap_32( buf[15] ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
@@ -673,6 +535,31 @@ do { \
// Blake-256 8 way
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set1_epi32( c1 ), m0 ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set1_epi32( c0 ), m1 ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while (0)
#define ROUND_S_8WAY(r) do { \
GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
GS_8WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
GS_8WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
} while (0)
#define DECL_STATE32_8WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
@@ -787,312 +674,136 @@ do { \
S3 ), H7 ); \
} while (0)
// Blake-512 4 way
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
sph_u64 T0, T1;
#define READ_STATE64_4WAY(state) do { \
H0 = (state)->H[0]; \
H1 = (state)->H[1]; \
H2 = (state)->H[2]; \
H3 = (state)->H[3]; \
H4 = (state)->H[4]; \
H5 = (state)->H[5]; \
H6 = (state)->H[6]; \
H7 = (state)->H[7]; \
S0 = (state)->S[0]; \
S1 = (state)->S[1]; \
S2 = (state)->S[2]; \
S3 = (state)->S[3]; \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE64_4WAY(state) do { \
(state)->H[0] = H0; \
(state)->H[1] = H1; \
(state)->H[2] = H2; \
(state)->H[3] = H3; \
(state)->H[4] = H4; \
(state)->H[5] = H5; \
(state)->H[6] = H6; \
(state)->H[7] = H7; \
(state)->S[0] = S0; \
(state)->S[1] = S1; \
(state)->S[2] = S2; \
(state)->S[3] = S3; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
#if SPH_COMPACT_BLAKE_64
// not used
#define COMPRESS64_4WAY do { \
__m256i M[16]; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
unsigned r; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_bswap_64( *(buf+2) ); \
M[0x3] = mm256_bswap_64( *(buf+3) ); \
M[0x4] = mm256_bswap_64( *(buf+4) ); \
M[0x5] = mm256_bswap_64( *(buf+5) ); \
M[0x6] = mm256_bswap_64( *(buf+6) ); \
M[0x7] = mm256_bswap_64( *(buf+7) ); \
M[0x8] = mm256_bswap_64( *(buf+8) ); \
M[0x9] = mm256_bswap_64( *(buf+9) ); \
M[0xA] = mm256_bswap_64( *(buf+10) ); \
M[0xB] = mm256_bswap_64( *(buf+11) ); \
M[0xC] = mm256_bswap_64( *(buf+12) ); \
M[0xD] = mm256_bswap_64( *(buf+13) ); \
M[0xE] = mm256_bswap_64( *(buf+14) ); \
M[0xF] = mm256_bswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
#else
//current impl
#define COMPRESS64_4WAY do { \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_bswap_64( *(buf + 0) ); \
M1 = mm256_bswap_64( *(buf + 1) ); \
M2 = mm256_bswap_64( *(buf + 2) ); \
M3 = mm256_bswap_64( *(buf + 3) ); \
M4 = mm256_bswap_64( *(buf + 4) ); \
M5 = mm256_bswap_64( *(buf + 5) ); \
M6 = mm256_bswap_64( *(buf + 6) ); \
M7 = mm256_bswap_64( *(buf + 7) ); \
M8 = mm256_bswap_64( *(buf + 8) ); \
M9 = mm256_bswap_64( *(buf + 9) ); \
MA = mm256_bswap_64( *(buf + 10) ); \
MB = mm256_bswap_64( *(buf + 11) ); \
MC = mm256_bswap_64( *(buf + 12) ); \
MD = mm256_bswap_64( *(buf + 13) ); \
ME = mm256_bswap_64( *(buf + 14) ); \
MF = mm256_bswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
ROUND_B_4WAY(6); \
ROUND_B_4WAY(7); \
ROUND_B_4WAY(8); \
ROUND_B_4WAY(9); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
#endif
#endif
// Blake-256 4 way
static const sph_u32 salt_zero_4way_small[4] = { 0, 0, 0, 0 };
static const uint32_t salt_zero_4way_small[4] = { 0, 0, 0, 0 };
static void
blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
casti_m128i( ctx->S, 0 ) = m128_zero;
casti_m128i( ctx->S, 1 ) = m128_zero;
casti_m128i( ctx->S, 2 ) = m128_zero;
casti_m128i( ctx->S, 3 ) = m128_zero;
/*
sc->S[0] = _mm_set1_epi32( salt[0] );
sc->S[1] = _mm_set1_epi32( salt[1] );
sc->S[2] = _mm_set1_epi32( salt[2] );
sc->S[3] = _mm_set1_epi32( salt[3] );
*/
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
}
static void
blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
__m128i *buf;
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
__m128i *buf = (__m128i*)ctx->buf;
size_t bptr = ctx->ptr<<2;
size_t vptr = ctx->ptr >> 2;
size_t blen = len << 2;
DECL_STATE32_4WAY
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
if ( blen < (sizeof ctx->buf) - bptr )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
memcpy( buf + vptr, data, blen );
bptr += blen;
ctx->ptr = bptr>>2;
return;
}
READ_STATE32_4WAY(sc);
while ( len > 0 )
READ_STATE32_4WAY( ctx );
while ( blen > 0 )
{
size_t clen;
size_t clen = ( sizeof ctx->buf ) - bptr;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
if ( clen > blen )
clen = blen;
memcpy( buf + vptr, data, clen );
bptr += clen;
data = (const unsigned char *)data + clen;
blen -= clen;
if ( bptr == ( sizeof ctx->buf ) )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_4WAY( sc->rounds );
ptr = 0;
if ( ( T0 = T0 + 512 ) < 512 )
T1 = T1 + 1;
COMPRESS32_4WAY( ctx->rounds );
bptr = 0;
}
}
WRITE_STATE32_4WAY(sc);
sc->ptr = ptr;
WRITE_STATE32_4WAY( ctx );
ctx->ptr = bptr>>2;
}
static void
blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
// union {
__m128i buf[16];
// sph_u32 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
sph_u32 th, tl;
__m128i *out;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = _mm_set1_epi32( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
__m128i buf[16] __attribute__ ((aligned (64)));
size_t ptr = ctx->ptr;
size_t vptr = ctx->ptr>>2;
unsigned bit_len = ( (unsigned)ptr << 3 );
uint32_t tl = ctx->T0 + bit_len;
uint32_t th = ctx->T1;
if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
}
else if ( sc->T0 == 0 )
else if ( ctx->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
ctx->T0 = 0xFFFFFE00UL + bit_len;
ctx->T1 = ctx->T1 - 1;
}
else
sc->T0 -= 512 - bit_len;
ctx->T0 -= 512 - bit_len;
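// Note: 0xFFFFFE00 is -512 modulo 2^32. Pre-loading T0 with it makes the
// +512 added when the final padding-only block is compressed wrap to 0,
// the counter value BLAKE specifies for blocks with no message bits.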
if ( ptr <= 52 )
buf[vptr] = _mm_set1_epi32( 0x80 );
if ( vptr <= 13 )
{
memset_zero_128( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if (out_size_w32 == 8)
buf[52>>2] = _mm_or_si128( buf[52>>2],
_mm_set1_epi32( 0x01000000UL ) );
*(buf+(56>>2)) = mm128_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
memset_zero_128( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( ctx, buf + vptr, 64 - ptr );
}
else
{
memset_zero_128( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_128( buf, 56>>2 );
if (out_size_w32 == 8)
buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(buf+(56>>2)) = mm128_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, buf, 64 );
memset_zero_128( buf + vptr + 1, (60-ptr) >> 2 );
blake32_4way( ctx, buf + vptr, 64 - ptr );
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
memset_zero_128( buf, 56>>2 );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( ctx, buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm128_bswap_32( sc->H[k] );
casti_m128i( dst, 0 ) = mm128_bswap_32( casti_m128i( ctx->H, 0 ) );
casti_m128i( dst, 1 ) = mm128_bswap_32( casti_m128i( ctx->H, 1 ) );
casti_m128i( dst, 2 ) = mm128_bswap_32( casti_m128i( ctx->H, 2 ) );
casti_m128i( dst, 3 ) = mm128_bswap_32( casti_m128i( ctx->H, 3 ) );
casti_m128i( dst, 4 ) = mm128_bswap_32( casti_m128i( ctx->H, 4 ) );
casti_m128i( dst, 5 ) = mm128_bswap_32( casti_m128i( ctx->H, 5 ) );
casti_m128i( dst, 6 ) = mm128_bswap_32( casti_m128i( ctx->H, 6 ) );
casti_m128i( dst, 7 ) = mm128_bswap_32( casti_m128i( ctx->H, 7 ) );
}
#if defined (__AVX2__)
@@ -1217,163 +928,32 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
out[k] = mm256_bswap_32( sc->H[k] );
}
// Blake-512 4 way
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
static void
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
const sph_u64 *salt )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set1_epi64x( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set1_epi64x( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
static void
blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
size_t ptr;
DECL_STATE64_4WAY
const int buf_size = 128; // sizeof/8
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE64_4WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if (ptr == buf_size )
{
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
T1 = SPH_T64(T1 + 1);
COMPRESS64_4WAY;
ptr = 0;
}
}
WRITE_STATE64_4WAY(sc);
sc->ptr = ptr;
}
static void
blake64_4way_close( blake_4way_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
{
// union {
__m256i buf[16];
// sph_u64 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
uint64_t z, zz;
sph_u64 th, tl;
__m256i *out;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
}
else
{
sc->T0 -= 1024 - bit_len;
}
if ( ptr <= 104 )
{
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( buf, 112>>3 );
if ( out_size_w64 == 8 )
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_bswap_64( sc->H[k] );
}
#endif
// Blake-256 4 way
// default 14 rounds, backward compatibility
void
blake256_4way_init(void *cc)
blake256_4way_init(void *ctx)
{
blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 );
blake32_4way_init( ctx, IV256, salt_zero_4way_small, 14 );
}
void
blake256_4way(void *cc, const void *data, size_t len)
blake256_4way(void *ctx, const void *data, size_t len)
{
blake32_4way(cc, data, len);
blake32_4way(ctx, data, len);
}
void
blake256_4way_close(void *cc, void *dst)
blake256_4way_close(void *ctx, void *dst)
{
blake32_4way_close(cc, 0, 0, dst, 8);
blake32_4way_close(ctx, 0, 0, dst, 8);
}
#if defined(__AVX2__)
// Blake-256 8way
// Blake-256 8 way
void
blake256_8way_init(void *cc)
@@ -1473,38 +1053,8 @@ blake256r8_8way_close(void *cc, void *dst)
#endif
// Blake-512 4 way
#if defined (__AVX2__)
void
blake512_4way_init(void *cc)
{
blake64_4way_init(cc, IV512, salt_zero_big);
}
void
blake512_4way(void *cc, const void *data, size_t len)
{
blake64_4way(cc, data, len);
}
void
blake512_4way_close(void *cc, void *dst)
{
blake512_4way_addbits_and_close(cc, 0, 0, dst);
}
void
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
blake64_4way_close(cc, ub, n, dst, 8);
}
#endif
#ifdef __cplusplus
}
#endif
#endif
//#endif

View File

@@ -0,0 +1,322 @@
// convert blake256 32 bit to use 64 bit with serial vectoring
//
// cut calls to GS in half
//
// combine V
// v0 = {V0,V1}
// v1 = {V2,V3}
// v2 = {V4,V5}
// v3 = {V6,V7}
// v4 = {V8,V9}
// v5 = {VA,VB}
// v6 = {VC,VD}
// v7 = {VE,VF}
//
// v6x = {VD,VC} swap(VC,VD) swap(v6)
// v7x = {VF,VE} swap(VE,VF) swap(v7)
//
// V0 = v1v0
// V1 = v3v2
// V2 = v5v4
// V3 = v7v6
// V4 = v9v8
// V5 = vbva
// V6 = vdvc
// V7 = vfve
//
// The rotate in ROUND is used to straddle and unstraddle the vectors for
// the third and fourth iterations of GS.
// It concatenates 2 contiguous 256 bit vectors and extracts the middle
// 256 bits. After the transform they must be restored, with only the
// chosen bits modified, in the original 2 vectors.
// ror1x128 achieves this by putting the chosen bits in arg1, the "low"
// 256 bit vector, and saving the untouched bits temporarily in arg0, the
// "high" 256 bit vector. Simply reverse the process to restore the data
// to its original positions.
// Use standard 4 way when AVX2 is not available; use x2 mode with AVX2.
//
// Data is organised the same as 32 bit 4 way, in effect serial vectoring
// on top of parallel vectoring. Same data in the same place, just taken
// two chunks at a time.
//
// Transparent to the user: x2 mode is used when AVX2 is detected.
// Use the existing 4 way context but revert to scalar types.
// Same interleave function (128 bit) or x2 with 256 bit?
// User transparency would have to apply to interleave as well.
//
// Use common 4way update and close
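//
// A possible implementation of the straddle rotate (an illustrative
// assumption, not taken from this commit): treat arg0:arg1 as one 512 bit
// value and rotate it right by 128 bits with two cross lane permutes.
/*
#define mm256_ror1x128_512( hi, lo ) \
do { \
   __m256i t = _mm256_permute2x128_si256( lo, hi, 0x21 ); \
   hi = _mm256_permute2x128_si256( hi, lo, 0x21 ); \
   lo = t; \
} while (0)
*/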
/*
typedef struct {
unsigned char buf[64<<2];
uint32_t H[8<<2];
uint32_t S[4<<2];
size_t ptr;
uint32_t T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blakex2_4way_small_context __attribute__ ((aligned (64)));
*/
static void
blake32x2_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
casti_m128i( ctx->S, 0 ) = m128_zero;
casti_m128i( ctx->S, 1 ) = m128_zero;
casti_m128i( ctx->S, 2 ) = m128_zero;
casti_m128i( ctx->S, 3 ) = m128_zero;
/*
sc->S[0] = _mm_set1_epi32( salt[0] );
sc->S[1] = _mm_set1_epi32( salt[1] );
sc->S[2] = _mm_set1_epi32( salt[2] );
sc->S[3] = _mm_set1_epi32( salt[3] );
*/
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
}
static void
blake32x2_4way( blake_4way_small_context *ctx, const void *data, size_t len )
{
__m256i *buf = (__m256i*)ctx->buf;
size_t bptr = ctx->ptr << 2;
size_t vptr = ctx->ptr >> 3;
size_t blen = len << 2;
// unsigned char *buf = ctx->buf;
// size_t ptr = ctx->ptr<<4; // repurposed
DECL_STATE32x2_4WAY
// buf = sc->buf;
// ptr = sc->ptr;
// adjust len for use with ptr, clen, all absolute bytes.
// int blen = len<<2;
if ( blen < (sizeof ctx->buf) - bptr )
{
memcpy( buf + vptr, data, blen );
bptr += blen;
ctx->ptr = bptr >> 2;
return;
}
READ_STATE32x2_4WAY( ctx );
while ( blen > 0 )
{
size_t clen;
clen = ( sizeof ctx->buf ) - bptr;
if ( clen > blen )
clen = blen;
memcpy( buf + vptr, data, clen );
bptr += clen;
vptr = bptr >> 5;
data = (const unsigned char *)data + clen;
blen -= clen;
if ( bptr == sizeof ctx->buf )
{
if ( ( T0 = T0 + 512 ) < 512 ) // not needed, will never rollover
T1 += 1;
COMPRESS32x2_4WAY( ctx->rounds );
bptr = 0;
}
}
WRITE_STATE32x2_4WAY( ctx );
ctx->ptr = bptr >> 2;
}
static void
blake32x2_4way_close( blake_4way_small_context *ctx, void *dst )
{
__m256i buf[8] __attribute__ ((aligned (64)));
size_t ptr = ctx->ptr;
size_t vptr = ctx->ptr>>3;
unsigned bit_len = ( (unsigned)ptr << 3 ); // one lane
uint32_t th = ctx->T1;
uint32_t tl = ctx->T0 + bit_len;
if ( ptr == 0 )
{
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
}
else if ( ctx->T0 == 0 )
{
ctx->T0 = 0xFFFFFE00UL + bit_len;
ctx->T1 -= 1;
}
else
ctx->T0 -= 512 - bit_len;
// memset doesn't do ints
buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 );
if ( vptr <= 6 )
{
memset_zero_256( buf + vptr + 1, 6 - vptr );
buf[ 6 ] = _mm256_or_si256( buf[ 6 ], _mm256_set_epi32(
0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL, 0,0,0,0 ) );
buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl,tl,tl,tl,
th,th,th,th ) );
blake32x2_4way( ctx, buf + vptr, 64 - ptr );
}
else
{
memset_zero_256( buf + vptr + 1, 7 - vptr );
blake32x2_4way( ctx, buf + vptr, 64 - ptr );
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
memset_zero_256( buf, 6 );
buf[ 6 ] = _mm256_set_epi32( 0x01000000UL, 0x01000000UL,
                             0x01000000UL, 0x01000000UL, 0, 0, 0, 0 );
buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl,
                                             th, th, th, th ) );
blake32x2_4way( ctx, buf, 64 );
}
casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) );
casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) );
casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) );
casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) );
}
#define DECL_STATE32x2_4WAY \
__m256i H0, H1, H2, H3; \
__m256i S0, S1; \
uint32_t T0, T1;
#define READ_STATE32x2_4WAY(state) do \
{ \
H0 = casti_m256i( state->H, 0 ); \
H1 = casti_m256i( state->H, 1 ); \
H2 = casti_m256i( state->H, 2 ); \
H3 = casti_m256i( state->H, 3 ); \
S0 = casti_m256i( state->S, 0 ); \
S1 = casti_m256i( state->S, 1 ); \
T0 = state->T0; \
T1 = state->T1; \
} while (0)
#define WRITE_STATE32x2_4WAY(state) do { \
casti_m256i( state->H, 0 ) = H0; \
casti_m256i( state->H, 1 ) = H1; \
casti_m256i( state->H, 2 ) = H2; \
casti_m256i( state->H, 3 ) = H3; \
casti_m256i( state->S, 0 ) = S0; \
casti_m256i( state->S, 1 ) = S1; \
state->T0 = T0; \
state->T1 = T1; \
} while (0)
#define GSx2_4WAY( m0, m1, m2, m3, c0, c1, c2, c3, a, b, c, d ) do \
{ \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set_epi32( c1,c3, c1,c3, c1,c3, c1,c3 ), \
_mm256_set_epi32( m0,m2, m0,m2, m0,m2, m0,m2 ) ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set_epi32( c0,c2, c0,c2, c0,c2, c0,c2 ), \
_mm256_set_epi32( m1,m3, m1,m3, m1,m3, m1,m3 ) ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while (0)
#define ROUND_Sx2_4WAY(r) do \
{ \
GSx2_4WAY( Mx(r, 0), Mx(r, 1), Mx(r, 2), Mx(r, 3), \
           CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \
GSx2_4WAY( Mx(r, 4), Mx(r, 5), Mx(r, 6), Mx(r, 7), \
           CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \
mm256_ror1x128_512( V3, V2 ); \
mm256_ror1x128_512( V6, V7 ); \
GSx2_4WAY( Mx(r, 8), Mx(r, 9), Mx(r, A), Mx(r, B), \
           CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \
GSx2_4WAY( Mx(r, C), Mx(r, D), Mx(r, E), Mx(r, F), \
           CSx(r, C), CSx(r, D), CSx(r, E), CSx(r, F), V1, V3, V4, V6 ); \
mm256_rol1x128_512( V2, V3 ); \
mm256_rol1x128_512( V7, V6 ); \
} while (0)
#define COMPRESS32x2_4WAY( rounds ) do \
{ \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
unsigned r; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \
CS0, CS0, CS0, CS0 ) ); \
V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \
CS2, CS2, CS2, CS2 ) ); \
V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \
_mm256_set_epi32( CS5, CS5, CS5, CS5, \
CS4, CS4, CS4, CS4 ) ); \
V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
_mm256_set_epi32( CS7, CS7, CS7, CS7, \
CS6, CS6, CS6, CS6 ) ); \
M0 = mm256_bswap_32( buf[ 0] ); \
M1 = mm256_bswap_32( buf[ 1] ); \
M2 = mm256_bswap_32( buf[ 2] ); \
M3 = mm256_bswap_32( buf[ 3] ); \
M4 = mm256_bswap_32( buf[ 4] ); \
M5 = mm256_bswap_32( buf[ 5] ); \
M6 = mm256_bswap_32( buf[ 6] ); \
M7 = mm256_bswap_32( buf[ 7] ); \
ROUND_Sx2_4WAY(0); \
ROUND_Sx2_4WAY(1); \
ROUND_Sx2_4WAY(2); \
ROUND_Sx2_4WAY(3); \
ROUND_Sx2_4WAY(4); \
ROUND_Sx2_4WAY(5); \
ROUND_Sx2_4WAY(6); \
ROUND_Sx2_4WAY(7); \
if (rounds == 14) \
{ \
ROUND_Sx2_4WAY(8); \
ROUND_Sx2_4WAY(9); \
ROUND_Sx2_4WAY(0); \
ROUND_Sx2_4WAY(1); \
ROUND_Sx2_4WAY(2); \
ROUND_Sx2_4WAY(3); \
} \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
                  _mm256_xor_si256( V4, V0 ), S0 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
                  _mm256_xor_si256( V5, V1 ), S1 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
                  _mm256_xor_si256( V6, V2 ), S0 ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
                  _mm256_xor_si256( V7, V3 ), S1 ), H3 ); \
} while (0)

View File

@@ -0,0 +1,701 @@
/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
/*
* BLAKE implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined (__AVX2__)
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include "blake-hash-4way.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
#define SPH_SMALL_FOOTPRINT_BLAKE 1
#endif
#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
#define SPH_COMPACT_BLAKE_64 1
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
// Blake-512
static const sph_u64 IV512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
// Blake-256 4 & 8 way, Blake-512 4 way
static const unsigned sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
#endif
#define Z00 0
#define Z01 1
#define Z02 2
#define Z03 3
#define Z04 4
#define Z05 5
#define Z06 6
#define Z07 7
#define Z08 8
#define Z09 9
#define Z0A A
#define Z0B B
#define Z0C C
#define Z0D D
#define Z0E E
#define Z0F F
#define Z10 E
#define Z11 A
#define Z12 4
#define Z13 8
#define Z14 9
#define Z15 F
#define Z16 D
#define Z17 6
#define Z18 1
#define Z19 C
#define Z1A 0
#define Z1B 2
#define Z1C B
#define Z1D 7
#define Z1E 5
#define Z1F 3
#define Z20 B
#define Z21 8
#define Z22 C
#define Z23 0
#define Z24 5
#define Z25 2
#define Z26 F
#define Z27 D
#define Z28 A
#define Z29 E
#define Z2A 3
#define Z2B 6
#define Z2C 7
#define Z2D 1
#define Z2E 9
#define Z2F 4
#define Z30 7
#define Z31 9
#define Z32 3
#define Z33 1
#define Z34 D
#define Z35 C
#define Z36 B
#define Z37 E
#define Z38 2
#define Z39 6
#define Z3A 5
#define Z3B A
#define Z3C 4
#define Z3D 0
#define Z3E F
#define Z3F 8
#define Z40 9
#define Z41 0
#define Z42 5
#define Z43 7
#define Z44 2
#define Z45 4
#define Z46 A
#define Z47 F
#define Z48 E
#define Z49 1
#define Z4A B
#define Z4B C
#define Z4C 6
#define Z4D 8
#define Z4E 3
#define Z4F D
#define Z50 2
#define Z51 C
#define Z52 6
#define Z53 A
#define Z54 0
#define Z55 B
#define Z56 8
#define Z57 3
#define Z58 4
#define Z59 D
#define Z5A 7
#define Z5B 5
#define Z5C F
#define Z5D E
#define Z5E 1
#define Z5F 9
#define Z60 C
#define Z61 5
#define Z62 1
#define Z63 F
#define Z64 E
#define Z65 D
#define Z66 4
#define Z67 A
#define Z68 0
#define Z69 7
#define Z6A 6
#define Z6B 3
#define Z6C 9
#define Z6D 2
#define Z6E 8
#define Z6F B
#define Z70 D
#define Z71 B
#define Z72 7
#define Z73 E
#define Z74 C
#define Z75 1
#define Z76 3
#define Z77 9
#define Z78 5
#define Z79 0
#define Z7A F
#define Z7B 4
#define Z7C 8
#define Z7D 6
#define Z7E 2
#define Z7F A
#define Z80 6
#define Z81 F
#define Z82 E
#define Z83 9
#define Z84 B
#define Z85 3
#define Z86 0
#define Z87 8
#define Z88 C
#define Z89 2
#define Z8A D
#define Z8B 7
#define Z8C 1
#define Z8D 4
#define Z8E A
#define Z8F 5
#define Z90 A
#define Z91 2
#define Z92 8
#define Z93 4
#define Z94 7
#define Z95 6
#define Z96 1
#define Z97 5
#define Z98 F
#define Z99 B
#define Z9A 9
#define Z9B E
#define Z9C 3
#define Z9D C
#define Z9E D
#define Z9F 0
#define Mx(r, i) Mx_(Z ## r ## i)
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
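// Example expansion (for illustration): Mx(1, 2) -> Mx_(Z12) -> Mx__(4)
// -> M4, i.e. message word sigma[1][2] = 4 is selected at preprocessing
// time with no runtime indexing.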
// Blake-512 4 way
#define CBx(r, i) CBx_(Z ## r ## i)
#define CBx_(n) CBx__(n)
#define CBx__(n) CB ## n
#define CB0 SPH_C64(0x243F6A8885A308D3)
#define CB1 SPH_C64(0x13198A2E03707344)
#define CB2 SPH_C64(0xA4093822299F31D0)
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
#define CB4 SPH_C64(0x452821E638D01377)
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
#define CB7 SPH_C64(0x3F84D5B5B5470917)
#define CB8 SPH_C64(0x9216D5D98979FB1B)
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
#define CBB SPH_C64(0xB8E1AFED6A267E96)
#define CBC SPH_C64(0xBA7C9045F12C7F99)
#define CBD SPH_C64(0x24A19947B3916CF7)
#define CBE SPH_C64(0x0801F2E2858EFC16)
#define CBF SPH_C64(0x636920D871574E69)
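// The CB constants are the leading hexadecimal digits of the fractional
// part of pi, as specified for BLAKE-512.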
#if SPH_COMPACT_BLAKE_64
// not used
static const sph_u64 CB[16] = {
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
};
#endif
// Blake-512 4 way
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
} while (0)
#if SPH_COMPACT_BLAKE_64
// not used
#define ROUND_B_4WAY(r) do { \
GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
} while (0)
#else
//current_impl
#define ROUND_B_4WAY(r) do { \
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
GB_4WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
GB_4WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
#endif
// Blake-512 4 way
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
sph_u64 T0, T1;
#define READ_STATE64_4WAY(state) do { \
H0 = (state)->H[0]; \
H1 = (state)->H[1]; \
H2 = (state)->H[2]; \
H3 = (state)->H[3]; \
H4 = (state)->H[4]; \
H5 = (state)->H[5]; \
H6 = (state)->H[6]; \
H7 = (state)->H[7]; \
S0 = (state)->S[0]; \
S1 = (state)->S[1]; \
S2 = (state)->S[2]; \
S3 = (state)->S[3]; \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE64_4WAY(state) do { \
(state)->H[0] = H0; \
(state)->H[1] = H1; \
(state)->H[2] = H2; \
(state)->H[3] = H3; \
(state)->H[4] = H4; \
(state)->H[5] = H5; \
(state)->H[6] = H6; \
(state)->H[7] = H7; \
(state)->S[0] = S0; \
(state)->S[1] = S1; \
(state)->S[2] = S2; \
(state)->S[3] = S3; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
#if SPH_COMPACT_BLAKE_64
// not used
#define COMPRESS64_4WAY do { \
__m256i M[16]; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
unsigned r; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_bswap_64( *(buf+2) ); \
M[0x3] = mm256_bswap_64( *(buf+3) ); \
M[0x4] = mm256_bswap_64( *(buf+4) ); \
M[0x5] = mm256_bswap_64( *(buf+5) ); \
M[0x6] = mm256_bswap_64( *(buf+6) ); \
M[0x7] = mm256_bswap_64( *(buf+7) ); \
M[0x8] = mm256_bswap_64( *(buf+8) ); \
M[0x9] = mm256_bswap_64( *(buf+9) ); \
M[0xA] = mm256_bswap_64( *(buf+10) ); \
M[0xB] = mm256_bswap_64( *(buf+11) ); \
M[0xC] = mm256_bswap_64( *(buf+12) ); \
M[0xD] = mm256_bswap_64( *(buf+13) ); \
M[0xE] = mm256_bswap_64( *(buf+14) ); \
M[0xF] = mm256_bswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
#else
//current impl
#define COMPRESS64_4WAY do { \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_bswap_64( *(buf + 0) ); \
M1 = mm256_bswap_64( *(buf + 1) ); \
M2 = mm256_bswap_64( *(buf + 2) ); \
M3 = mm256_bswap_64( *(buf + 3) ); \
M4 = mm256_bswap_64( *(buf + 4) ); \
M5 = mm256_bswap_64( *(buf + 5) ); \
M6 = mm256_bswap_64( *(buf + 6) ); \
M7 = mm256_bswap_64( *(buf + 7) ); \
M8 = mm256_bswap_64( *(buf + 8) ); \
M9 = mm256_bswap_64( *(buf + 9) ); \
MA = mm256_bswap_64( *(buf + 10) ); \
MB = mm256_bswap_64( *(buf + 11) ); \
MC = mm256_bswap_64( *(buf + 12) ); \
MD = mm256_bswap_64( *(buf + 13) ); \
ME = mm256_bswap_64( *(buf + 14) ); \
MF = mm256_bswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
ROUND_B_4WAY(6); \
ROUND_B_4WAY(7); \
ROUND_B_4WAY(8); \
ROUND_B_4WAY(9); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
#endif
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
static void
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
const sph_u64 *salt )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set1_epi64x( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set1_epi64x( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
static void
blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
size_t ptr;
DECL_STATE64_4WAY
const int buf_size = 128; // sizeof/8
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE64_4WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if (ptr == buf_size )
{
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
T1 = SPH_T64(T1 + 1);
COMPRESS64_4WAY;
ptr = 0;
}
}
WRITE_STATE64_4WAY(sc);
sc->ptr = ptr;
}
static void
blake64_4way_close( blake_4way_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
{
// union {
__m256i buf[16];
// sph_u64 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
uint64_t z, zz;
sph_u64 th, tl;
__m256i *out;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
}
else
{
sc->T0 -= 1024 - bit_len;
}
if ( ptr <= 104 )
{
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( buf, 112>>3 );
if ( out_size_w64 == 8 )
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_bswap_64( sc->H[k] );
}
void
blake512_4way_init(void *cc)
{
blake64_4way_init(cc, IV512, salt_zero_big);
}
void
blake512_4way(void *cc, const void *data, size_t len)
{
blake64_4way(cc, data, len);
}
void
blake512_4way_close(void *cc, void *dst)
{
blake512_4way_addbits_and_close(cc, 0, 0, dst);
}
void
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
blake64_4way_close(cc, ub, n, dst, 8);
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -7,6 +7,24 @@
// 2x128
// The result of running 10 transform rounds over the initial state, which
// consists of the params zero padded.
static const uint64_t IV256[] =
{
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
};
static const uint64_t IV512[] =
{
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
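// Each 128 bit lane of the 2 way state holds the same IV: cube_2way_init
// below loads every iv word pair into both lanes of an __m256i.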
static void transform_2way( cube_2way_context *sp )
{
int r;
@@ -45,10 +63,10 @@ static void transform_2way( cube_2way_context *sp )
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap128_64( x4 );
x5 = mm256_swap128_64( x5 );
x6 = mm256_swap128_64( x6 );
x7 = mm256_swap128_64( x7 );
x4 = mm256_swap64_128( x4 );
x5 = mm256_swap64_128( x5 );
x6 = mm256_swap64_128( x6 );
x7 = mm256_swap64_128( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
@@ -69,10 +87,10 @@ static void transform_2way( cube_2way_context *sp )
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_32( x4 );
x5 = mm256_swap64_32( x5 );
x6 = mm256_swap64_32( x6 );
x7 = mm256_swap64_32( x7 );
x4 = mm256_swap32_64( x4 );
x5 = mm256_swap32_64( x5 );
x6 = mm256_swap32_64( x6 );
x7 = mm256_swap32_64( x7 );
}
_mm256_store_si256( (__m256i*)sp->h, x0 );
@@ -86,36 +104,26 @@ static void transform_2way( cube_2way_context *sp )
}
cube_2way_context cube_2way_ctx_cache __attribute__ ((aligned (64)));
int cube_2way_reinit( cube_2way_context *sp )
{
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
return 0;
}
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
int blockbytes )
int blockbytes )
{
int i;
const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
sp->pos = 0;
// sizes are in units of __m128i
cube_2way_ctx_cache.hashlen = hashbitlen/128;
cube_2way_ctx_cache.blocksize = blockbytes/16;
cube_2way_ctx_cache.rounds = rounds;
cube_2way_ctx_cache.pos = 0;
__m256i* h = (__m256i*)sp->h;
for ( i = 0; i < 8; ++i )
cube_2way_ctx_cache.h[i] = m256_zero;
h[0] = _mm256_set_epi64x( iv[ 1], iv[ 0], iv[ 1], iv[ 0] );
h[1] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 3], iv[ 2] );
h[2] = _mm256_set_epi64x( iv[ 5], iv[ 4], iv[ 5], iv[ 4] );
h[3] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 7], iv[ 6] );
h[4] = _mm256_set_epi64x( iv[ 9], iv[ 8], iv[ 9], iv[ 8] );
h[5] = _mm256_set_epi64x( iv[11], iv[10], iv[11], iv[10] );
h[6] = _mm256_set_epi64x( iv[13], iv[12], iv[13], iv[12] );
h[7] = _mm256_set_epi64x( iv[15], iv[14], iv[15], iv[14] );
cube_2way_ctx_cache.h[0] = _mm256_set_epi32(
0, rounds, blockbytes, hashbitlen / 8,
0, rounds, blockbytes, hashbitlen / 8 );
for ( i = 0; i < 10; ++i )
transform_2way( &cube_2way_ctx_cache );
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
return 0;
}

View File

@@ -14,6 +14,25 @@
#include <unistd.h>
#include <memory.h>
#include "avxdefs.h"
#include <stdio.h>
// The result of hashing 10 rounds of initial data which is params and
// mostly zeros.
static const uint64_t IV256[] =
{
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
};
static const uint64_t IV512[] =
{
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
static void transform( cubehashParam *sp )
{
@@ -128,48 +147,37 @@ static void transform( cubehashParam *sp )
#endif
} // transform
// Cubehash context initializing is very expensive.
// Cache the initial value for faster reinitializing.
cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));
int cubehashReinit( cubehashParam *sp )
{
memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
return SUCCESS;
}
// Initialize the cache then copy to sp.
int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
{
int i;
const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
sp->pos = 0;
#if defined(__AVX2__)
if ( hashbitlen < 8 ) return BAD_HASHBITLEN;
if ( hashbitlen > 512 ) return BAD_HASHBITLEN;
if ( hashbitlen != 8 * (hashbitlen / 8) ) return BAD_HASHBITLEN;
__m256i* x = (__m256i*)sp->x;
/* Sanity checks */
if ( rounds <= 0 || rounds > 32 )
rounds = CUBEHASH_ROUNDS;
if ( blockbytes <= 0 || blockbytes >= 256)
blockbytes = CUBEHASH_BLOCKBYTES;
x[0] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 1], iv[ 0] );
x[1] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 5], iv[ 4] );
x[2] = _mm256_set_epi64x( iv[11], iv[10], iv[ 9], iv[ 8] );
x[3] = _mm256_set_epi64x( iv[15], iv[14], iv[13], iv[12] );
// sizes are in units of __m128i
cube_ctx_cache.hashlen = hashbitlen/128;
cube_ctx_cache.blocksize = blockbytes/16;
cube_ctx_cache.rounds = rounds;
cube_ctx_cache.pos = 0;
#else
for ( i = 0; i < 8; ++i )
cube_ctx_cache.x[i] = _mm_setzero_si128();;
__m128i* x = (__m128i*)sp->x;
cube_ctx_cache.x[0] = _mm_set_epi32( 0, rounds, blockbytes,
hashbitlen / 8 );
x[0] = _mm_set_epi64x( iv[ 1], iv[ 0] );
x[1] = _mm_set_epi64x( iv[ 3], iv[ 2] );
x[2] = _mm_set_epi64x( iv[ 5], iv[ 4] );
x[3] = _mm_set_epi64x( iv[ 7], iv[ 6] );
x[4] = _mm_set_epi64x( iv[ 9], iv[ 8] );
x[5] = _mm_set_epi64x( iv[11], iv[10] );
x[6] = _mm_set_epi64x( iv[13], iv[12] );
x[7] = _mm_set_epi64x( iv[15], iv[14] );
for ( i = 0; i < 10; ++i )
transform( &cube_ctx_cache );
memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
#endif
return SUCCESS;
}

View File

@@ -3,7 +3,7 @@
#include "wolf-aes.h"
#include "miner.h"
#ifndef NO_AES_NI
#if defined(__AES__)
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
@@ -151,7 +151,7 @@ void AES256CBC(__m128i** data, const __m128i** next, __m128i ExpandedKey[][16],
}
}
#else // NO AVX
#else // NO SSE4.2
static inline __m128i AES256Core(__m128i State, const __m128i *ExpandedKey)
{

View File

@@ -101,39 +101,6 @@ void hodl_build_block_header( struct work* g_work, uint32_t version,
g_work->data[31] = 0x00000280;
}
// hodl build_extra_header is redundant; hodl can use std_build_extra_header
// and call hodl_build_block_header.
#if 0
void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx )
{
uchar merkle_tree[64] = { 0 };
size_t t;
// int i;
algo_gate.gen_merkle_root( merkle_tree, sctx );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
(uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
le32dec( sctx->job.ntime ), le32dec( sctx->job.nbits ) );
/*
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
g_work->data[ algo_gate.ntime_index ] = le32dec( sctx->job.ntime );
g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits );
g_work->data[22] = 0x80000000;
g_work->data[31] = 0x00000280;
*/
}
#endif
// called only by thread 0, saves a backup of g_work
void hodl_get_new_work( struct work* work, struct work* g_work)
{
@@ -179,7 +146,7 @@ bool hodl_do_this_thread( int thr_id )
int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
#ifndef NO_AES_NI
#if defined(__AES__)
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
@@ -189,7 +156,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
bool register_hodl_algo( algo_gate_t* gate )
{
#ifdef NO_AES_NI
#if !defined(__AES__)
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif
@@ -207,7 +174,6 @@ bool register_hodl_algo( algo_gate_t* gate )
gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
gate->malloc_txs_request = (void*)&hodl_malloc_txs_request;
gate->build_block_header = (void*)&hodl_build_block_header;
// gate->build_extraheader = (void*)&hodl_build_extraheader;
gate->resync_threads = (void*)&hodl_resync_threads;
gate->do_this_thread = (void*)&hodl_do_this_thread;
gate->work_cmp_size = 76;

View File

@@ -8,7 +8,7 @@
#include "hodl-wolf.h"
#include "miner.h"
#ifndef NO_AES_NI
#if defined(__AES__)
void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
void *MidHash )
@@ -139,7 +139,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
return(0);
#else // no AVX
#else // no SSE4.2
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -160,7 +160,6 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
{
// copy data to first l2 cache
memcpy(Cache.dwords, Garbage + k, GARBAGE_SLICE_SIZE);
#ifndef NO_AES_NI
for(int j = 0; j < AES_ITERATIONS; j++)
{
CacheEntry TmpXOR;
@@ -184,7 +183,6 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
AES256CBC( Cache.dqwords, TmpXOR.dqwords, ExpKey,
TmpXOR.dqwords[ (GARBAGE_SLICE_SIZE / sizeof(__m128i))
- 1 ], 256 ); }
#endif
// use last X bits as solution
if( ( Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 1 ]
& (COMPARE_SIZE - 1) ) < 1000 )
@@ -206,7 +204,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
*hashes_done = CollisionCount;
return(0);
#endif
#endif // SSE4.2 else
}
@@ -218,5 +216,5 @@ void GenRandomGarbage(CacheEntry *Garbage, uint32_t *pdata, int thr_id)
GenerateGarbageCore(Garbage, thr_id, opt_n_threads, MidHash);
}
#endif
#endif // AES

View File

@@ -22,16 +22,20 @@ typedef struct
#ifdef __AVX2__
__m256i h[8];
__m256i w[80];
#else // AVX
#elif defined(__SSE4_2__)
__m128i h[8];
__m128i w[80];
#else
int dummy;
#endif
} Sha512Context;
#ifdef __AVX2__
#define SHA512_PARALLEL_N 8
#else // AVX
#elif defined(__SSE4_2__)
#define SHA512_PARALLEL_N 4
#else
#define SHA512_PARALLEL_N 1 // dummy value
#endif
//SHA-512 related functions

View File

@@ -1,4 +1,5 @@
#ifndef __AVX2__
#ifdef __SSE4_2__
//#ifdef __AVX__

View File

@@ -30,6 +30,19 @@
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \
} while(0)
/*
static inline __m256i mult2_avx2( __m256i a )
{
__m128i a0, a1, b;
a0 = mm128_extractlo_256( a );
a1 = mm128_extracthi_256( a );
b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) );
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) );
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );
return mm256_concat_128( a1, a0 );
}
*/
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\

View File

@@ -55,11 +55,11 @@ void allium_4way_hash( void *state, const void *input )
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );

View File

@@ -27,7 +27,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2rev3;
gate->hash = (void*)&lyra2rev3_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
gate->set_target = (void*)&alt_set_target;
return true;

View File

@@ -17,14 +17,14 @@ bool register_lyra2rev3_algo( algo_gate_t* gate );
void lyra2rev3_4way_hash( void *state, const void *input );
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev3_4way_ctx();
#else
void lyra2rev3_hash( void *state, const void *input );
int scanhash_lyra2rev3( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev3_ctx();
#endif

View File

@@ -7,8 +7,7 @@
#include "lyra2.h"
#include "algo-gate-api.h"
#include "avxdefs.h"
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#endif
@@ -18,10 +17,10 @@ typedef struct {
sph_blake256_context blake;
sph_keccak256_context keccak;
sph_skein256_context skein;
#ifdef NO_AES_NI
sph_groestl256_context groestl;
#else
#if defined(__AES__)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
#endif
} lyra2re_ctx_holder;
@@ -33,10 +32,10 @@ void init_lyra2re_ctx()
sph_blake256_init(&lyra2re_ctx.blake);
sph_keccak256_init(&lyra2re_ctx.keccak);
sph_skein256_init(&lyra2re_ctx.skein);
#ifdef NO_AES_NI
sph_groestl256_init(&lyra2re_ctx.groestl);
#else
#if defined(__AES__)
init_groestl256( &lyra2re_ctx.groestl, 32 );
#else
sph_groestl256_init(&lyra2re_ctx.groestl);
#endif
}
@@ -72,11 +71,11 @@ void lyra2re_hash(void *state, const void *input)
sph_skein256(&ctx.skein, hashA, 32);
sph_skein256_close(&ctx.skein, hashB);
#ifdef NO_AES_NI
#if defined(__AES__)
update_and_final_groestl256( &ctx.groestl, hashA, hashB, 256 );
#else
sph_groestl256( &ctx.groestl, hashB, 32 );
sph_groestl256_close( &ctx.groestl, hashA );
#else
update_and_final_groestl256( &ctx.groestl, hashA, hashB, 256 );
#endif
memcpy(state, hashA, 32);

View File

@@ -48,11 +48,11 @@ void lyra2rev2_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -65,13 +65,13 @@ void lyra2rev2_4way_hash( void *state, const void *input )
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );

View File

@@ -43,11 +43,11 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -57,54 +57,67 @@ void lyra2rev3_4way_hash( void *state, const void *input )
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
bmw256_4way_close( &ctx.bmw, state );
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
// Need big endian data
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
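mm128_bswap_32 is assumed here to be the usual PSHUFB byte reversal per 32-bit lane; a plausible definition (requires SSSE3; the name is suffixed to mark it as illustrative):

#include <immintrin.h>

// Reverse the bytes within each 32-bit lane: turns the little-endian
// header words into the big endian form the hash expects.
static inline __m128i mm128_bswap_32_sketch( __m128i v )
{
   return _mm_shuffle_epi8( v, _mm_set_epi8( 12,13,14,15,  8, 9,10,11,
                                              4, 5, 6, 7,  0, 1, 2, 3 ) );
}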
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev3_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
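The hash7/lane indexing above relies on the 4x32 interleaved layout: word w of lane l sits at index w*4 + l, so the four lanes' hash[7] words are contiguous at offset 28. A minimal sketch of that layout (illustrative, not the library routine):

#include <stdint.h>

// Interleave four 8-word hashes lane by lane: v[ w*4 + l ].
static void interleave_4x32_sketch( uint32_t *v, const uint32_t *h0,
                  const uint32_t *h1, const uint32_t *h2,
                  const uint32_t *h3, int nwords )
{
   for ( int w = 0; w < nwords; w++ )
   {
      v[ w*4 + 0 ] = h0[ w ];
      v[ w*4 + 1 ] = h1[ w ];
      v[ w*4 + 2 ] = h2[ w ];
      v[ w*4 + 3 ] = h3[ w ];
   }
}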
#endif

View File

@@ -8,7 +8,6 @@
typedef struct {
cubehashParam cube;
// cubehashParam cube2;
sph_blake256_context blake;
sph_bmw256_context bmw;
@@ -20,7 +19,6 @@ static __thread sph_blake256_context l2v3_blake_mid;
bool init_lyra2rev3_ctx()
{
cubehashInit( &lyra2v3_ctx.cube, 256, 16, 32 );
// cubehashInit( &lyra2v3_ctx.cube2, 256, 16, 32 );
sph_blake256_init( &lyra2v3_ctx.blake );
sph_bmw256_init( &lyra2v3_ctx.bmw );
return true;
@@ -59,44 +57,51 @@ void lyra2rev3_hash( void *state, const void *input )
memcpy( state, hash, 32 );
}
int scanhash_lyra2rev3(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_lyra2rev3( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t hash[8] __attribute__((aligned(64)));
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t hash[8] __attribute__((aligned(64)));
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
swab32_array( endiandata, pdata, 20 );
// need big endian data
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
l2v3_blake256_midstate( endiandata );
l2v3_blake256_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
lyra2rev3_hash(hash, endiandata);
do
{
be32enc(&endiandata[19], nonce);
lyra2rev3_hash(hash, endiandata);
if (hash[7] <= Htarg )
{
if( fulltest(hash, ptarget) )
{
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
}
nonce++;
if (hash[7] <= Htarg )
{
if( fulltest(hash, ptarget) )
{
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -91,7 +91,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
#elif defined(__SSE4_2__)
#elif defined(__SSE2__)
// process 2 columns in parallel
// returns void, all args updated
@@ -108,14 +108,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_ror256_1x64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_rol256_1x64( s6, s7 ); \
mm128_rol1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_rol1x64_256( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rol256_1x64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_ror256_1x64( s6, s7 );
mm128_rol1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_ror1x64_256( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -7,7 +7,6 @@
#include <string.h>
#include <float.h>
#include <math.h>
#include "algo/sha/sph_sha2.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/haval/sph-haval.h"
#include "algo/tiger/sph_tiger.h"
@@ -117,13 +116,8 @@ uint32_t sw2_(int nnounce)
}
typedef struct {
#ifndef USE_SPH_SHA
SHA256_CTX sha256;
SHA512_CTX sha512;
#else
sph_sha256_context sha256;
sph_sha512_context sha512;
#endif
sph_keccak512_context keccak;
sph_whirlpool_context whirlpool;
sph_haval256_5_context haval;
@@ -135,13 +129,8 @@ m7m_ctx_holder m7m_ctx;
void init_m7m_ctx()
{
#ifndef USE_SPH_SHA
SHA256_Init( &m7m_ctx.sha256 );
SHA512_Init( &m7m_ctx.sha512 );
#else
sph_sha256_init( &m7m_ctx.sha256 );
sph_sha512_init( &m7m_ctx.sha512 );
#endif
sph_keccak512_init( &m7m_ctx.keccak );
sph_whirlpool_init( &m7m_ctx.whirlpool );
sph_haval256_5_init( &m7m_ctx.haval );
@@ -176,28 +165,18 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64)));
memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) );
#ifndef USE_SPH_SHA
SHA256_CTX ctxf_sha256;
#else
sph_sha256_context ctxf_sha256;
#endif
memcpy(data, pdata, 80);
#ifndef USE_SPH_SHA
SHA256_Update( &ctx1.sha256, data, M7_MIDSTATE_LEN );
SHA512_Update( &ctx1.sha512, data, M7_MIDSTATE_LEN );
#else
sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
#endif
sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
sph_haval256_5( &ctx1.haval, data, M7_MIDSTATE_LEN );
sph_tiger( &ctx1.tiger, data, M7_MIDSTATE_LEN );
sph_ripemd160( &ctx1.ripemd, data, M7_MIDSTATE_LEN );
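The midstate trick above absorbs the constant header prefix once into ctx1 and clones it per nonce; a condensed sketch of the pattern with OpenSSL's SHA-256, assuming M7_MIDSTATE_LEN is 76 as the 80-byte header minus the 4-byte nonce suggests (all names illustrative):

#include <stdint.h>
#include <openssl/sha.h>

// Hash an 80-byte header for a range of nonces, re-hashing only the
// 4-byte tail each iteration instead of the whole header.
static void scan_sketch( const uint8_t header[80], uint32_t first,
                         uint32_t last, uint8_t hash[32] )
{
   SHA256_CTX mid;
   SHA256_Init( &mid );
   SHA256_Update( &mid, header, 76 );     // constant prefix, done once
   for ( uint32_t n = first; n < last; n++ )
   {
      SHA256_CTX c = mid;                 // cheap struct copy per nonce
      SHA256_Update( &c, &n, 4 );         // only the nonce varies
      SHA256_Final( hash, &c );
      // ...test hash against the target here...
   }
}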
// the following calculations can be performed once and the results shared
mpz_t magipi, magisw, product, bns0, bns1;
mpf_t magifpi, magifpi0, mpt1, mpt2, mptmp, mpten;
@@ -222,22 +201,11 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );
// with 4 way, can a single midstate be shared among lanes?
// do a single round of midstate and interleave for final
#ifndef USE_SPH_SHA
SHA256_Update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
SHA256_Final( (unsigned char*) (bhash[0]), &ctx2.sha256 );
SHA512_Update( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 );
#else
sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
sph_sha256_close( &ctx2.sha256, (void*)(bhash[0]) );
sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
sph_sha512_close( &ctx2.sha512, (void*)(bhash[1]) );
#endif
sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN );
sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) );
@@ -253,7 +221,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );
// 4 way serial
mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
mpz_set(bns1, bns0);
mpz_set(product, bns0);
@@ -269,17 +236,10 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
bytes = mpz_sizeinbase(product, 256);
mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);
#ifndef USE_SPH_SHA
SHA256_Init( &ctxf_sha256 );
SHA256_Update( &ctxf_sha256, bdata, bytes );
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
#else
sph_sha256_init( &ctxf_sha256 );
sph_sha256( &ctxf_sha256, bdata, bytes );
sph_sha256_close( &ctxf_sha256, (void*)(hash) );
#endif
// do once and share
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
mpf_set_prec_raw(magifpi, prec);
@@ -302,7 +262,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
mpz_set_f(magipi, magifpi);
mpz_add(magipi,magipi,magisw);
mpz_add(product,product,magipi);
// share magipi, product and do serial
mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
mpz_add(bns1, bns1, bns0);
mpz_mul(product,product,bns1);
@@ -312,18 +271,11 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
mpzscale=bytes;
mpz_export(bdata, NULL, -1, 1, 0, 0, product);
#ifndef USE_SPH_SHA
SHA256_Init( &ctxf_sha256 );
SHA256_Update( &ctxf_sha256, bdata, bytes );
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
#else
sph_sha256_init( &ctxf_sha256 );
sph_sha256( &ctxf_sha256, bdata, bytes );
sph_sha256_close( &ctxf_sha256, (void*)(hash) );
#endif
}
// this is the scanhash part
const unsigned char *hash_ = (const unsigned char *)hash;
const unsigned char *target_ = (const unsigned char *)ptarget;
for ( i = 31; i >= 0; i-- )
@@ -354,7 +306,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
pdata[19] = n;
// do this in hashm7m
out:
mpf_set_prec_raw(magifpi, prec0);
mpf_set_prec_raw(magifpi0, prec0);

334
algo/panama/sph_panama.c Normal file
View File

@@ -0,0 +1,334 @@
/* $Id: panama.c 216 2010-06-08 09:46:57Z tp $ */
/*
* PANAMA implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_panama.h"
#define LVAR17(b) sph_u32 \
b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \
b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \
b ## 12, b ## 13, b ## 14, b ## 15, b ## 16;
#define LVARS \
LVAR17(a) \
LVAR17(g) \
LVAR17(p) \
LVAR17(t)
#define M17(macro) do { \
macro( 0, 1, 2, 4); \
macro( 1, 2, 3, 5); \
macro( 2, 3, 4, 6); \
macro( 3, 4, 5, 7); \
macro( 4, 5, 6, 8); \
macro( 5, 6, 7, 9); \
macro( 6, 7, 8, 10); \
macro( 7, 8, 9, 11); \
macro( 8, 9, 10, 12); \
macro( 9, 10, 11, 13); \
macro(10, 11, 12, 14); \
macro(11, 12, 13, 15); \
macro(12, 13, 14, 16); \
macro(13, 14, 15, 0); \
macro(14, 15, 16, 1); \
macro(15, 16, 0, 2); \
macro(16, 0, 1, 3); \
} while (0)
#define BUPDATE1(n0, n2) do { \
sc->buffer[ptr24][n0] ^= sc->buffer[ptr31][n2]; \
sc->buffer[ptr31][n2] ^= INW1(n2); \
} while (0)
#define BUPDATE do { \
BUPDATE1(0, 2); \
BUPDATE1(1, 3); \
BUPDATE1(2, 4); \
BUPDATE1(3, 5); \
BUPDATE1(4, 6); \
BUPDATE1(5, 7); \
BUPDATE1(6, 0); \
BUPDATE1(7, 1); \
} while (0)
#define RSTATE(n0, n1, n2, n4) (a ## n0 = sc->state[n0])
#define WSTATE(n0, n1, n2, n4) (sc->state[n0] = a ## n0)
#define GAMMA(n0, n1, n2, n4) \
(g ## n0 = a ## n0 ^ (a ## n1 | SPH_T32(~a ## n2)))
#define PI_ALL do { \
p0 = g0; \
p1 = SPH_ROTL32( g7, 1); \
p2 = SPH_ROTL32(g14, 3); \
p3 = SPH_ROTL32( g4, 6); \
p4 = SPH_ROTL32(g11, 10); \
p5 = SPH_ROTL32( g1, 15); \
p6 = SPH_ROTL32( g8, 21); \
p7 = SPH_ROTL32(g15, 28); \
p8 = SPH_ROTL32( g5, 4); \
p9 = SPH_ROTL32(g12, 13); \
p10 = SPH_ROTL32( g2, 23); \
p11 = SPH_ROTL32( g9, 2); \
p12 = SPH_ROTL32(g16, 14); \
p13 = SPH_ROTL32( g6, 27); \
p14 = SPH_ROTL32(g13, 9); \
p15 = SPH_ROTL32( g3, 24); \
p16 = SPH_ROTL32(g10, 8); \
} while (0)
#define THETA(n0, n1, n2, n4) \
(t ## n0 = p ## n0 ^ p ## n1 ^ p ## n4)
#define SIGMA_ALL do { \
a0 = t0 ^ 1; \
a1 = t1 ^ INW2(0); \
a2 = t2 ^ INW2(1); \
a3 = t3 ^ INW2(2); \
a4 = t4 ^ INW2(3); \
a5 = t5 ^ INW2(4); \
a6 = t6 ^ INW2(5); \
a7 = t7 ^ INW2(6); \
a8 = t8 ^ INW2(7); \
a9 = t9 ^ sc->buffer[ptr16][0]; \
a10 = t10 ^ sc->buffer[ptr16][1]; \
a11 = t11 ^ sc->buffer[ptr16][2]; \
a12 = t12 ^ sc->buffer[ptr16][3]; \
a13 = t13 ^ sc->buffer[ptr16][4]; \
a14 = t14 ^ sc->buffer[ptr16][5]; \
a15 = t15 ^ sc->buffer[ptr16][6]; \
a16 = t16 ^ sc->buffer[ptr16][7]; \
} while (0)
#define PANAMA_STEP do { \
unsigned ptr16, ptr24, ptr31; \
\
ptr24 = (ptr0 - 8) & 31; \
ptr31 = (ptr0 - 1) & 31; \
BUPDATE; \
M17(GAMMA); \
PI_ALL; \
M17(THETA); \
ptr16 = ptr0 ^ 16; \
SIGMA_ALL; \
ptr0 = ptr31; \
} while (0)
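/*
 * The belt is a 32-stage circular buffer: instead of physically moving
 * the stages, each step walks ptr0 backwards modulo 32 (the "& 31"),
 * so ptr24 and ptr31 address stages 24 and 31 relative to the origin.
 */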
/*
 * These macros are used to compute the mill words a1..a8 that are fed
 * back into the belt during the pull phase.
*/
#define INC0 1
#define INC1 2
#define INC2 3
#define INC3 4
#define INC4 5
#define INC5 6
#define INC6 7
#define INC7 8
/*
* Push data by blocks of 32 bytes. "pbuf" must be 32-bit aligned. Each
* iteration processes 32 data bytes; "num" contains the number of
* iterations.
*/
static void
panama_push(sph_panama_context *sc, const unsigned char *pbuf, size_t num)
{
LVARS
unsigned ptr0;
#if SPH_LITTLE_FAST
#define INW1(i) sph_dec32le_aligned(pbuf + 4 * (i))
#else
sph_u32 X_var[8];
#define INW1(i) X_var[i]
#endif
#define INW2(i) INW1(i)
M17(RSTATE);
ptr0 = sc->buffer_ptr;
while (num -- > 0) {
#if !SPH_LITTLE_FAST
int i;
for (i = 0; i < 8; i ++)
X_var[i] = sph_dec32le_aligned(pbuf + 4 * (i));
#endif
PANAMA_STEP;
pbuf = (const unsigned char *)pbuf + 32;
}
M17(WSTATE);
sc->buffer_ptr = ptr0;
#undef INW1
#undef INW2
}
/*
* Perform the "pull" operation repeatedly ("num" times). The hash output
* will be extracted from the state afterwards.
*/
static void
panama_pull(sph_panama_context *sc, unsigned num)
{
LVARS
unsigned ptr0;
#define INW1(i) INW_H1(INC ## i)
#define INW_H1(i) INW_H2(i)
#define INW_H2(i) a ## i
#define INW2(i) sc->buffer[ptr4][i]
M17(RSTATE);
ptr0 = sc->buffer_ptr;
while (num -- > 0) {
unsigned ptr4;
ptr4 = (ptr0 + 4) & 31;
PANAMA_STEP;
}
M17(WSTATE);
#undef INW1
#undef INW_H1
#undef INW_H2
#undef INW2
}
/* see sph_panama.h */
void
sph_panama_init(void *cc)
{
sph_panama_context *sc;
sc = cc;
/*
* This is not completely conformant, but "it will work
* everywhere". Initial state consists of zeroes everywhere.
* Conceptually, the sph_u32 type may have padding bits which
* must not be set to 0; but such an architecture remains to
* be seen.
*/
sc->data_ptr = 0;
memset(sc->buffer, 0, sizeof sc->buffer);
sc->buffer_ptr = 0;
memset(sc->state, 0, sizeof sc->state);
}
#ifdef SPH_UPTR
static void
panama_short(void *cc, const void *data, size_t len)
#else
void
sph_panama(void *cc, const void *data, size_t len)
#endif
{
sph_panama_context *sc;
unsigned current;
sc = cc;
current = sc->data_ptr;
while (len > 0) {
unsigned clen;
clen = (sizeof sc->data) - current;
if (clen > len)
clen = len;
memcpy(sc->data + current, data, clen);
data = (const unsigned char *)data + clen;
len -= clen;
current += clen;
if (current == sizeof sc->data) {
current = 0;
panama_push(sc, sc->data, 1);
}
}
sc->data_ptr = current;
}
#ifdef SPH_UPTR
/* see sph_panama.h */
void
sph_panama(void *cc, const void *data, size_t len)
{
sph_panama_context *sc;
unsigned current;
size_t rlen;
if (len < (2 * sizeof sc->data)) {
panama_short(cc, data, len);
return;
}
sc = cc;
current = sc->data_ptr;
if (current > 0) {
unsigned t;
t = (sizeof sc->data) - current;
panama_short(sc, data, t);
data = (const unsigned char *)data + t;
len -= t;
}
#if !SPH_UNALIGNED
if (((SPH_UPTR)data & 3) != 0) {
panama_short(sc, data, len);
return;
}
#endif
panama_push(sc, data, len >> 5);
rlen = len & 31;
if (rlen > 0)
memcpy(sc->data,
(const unsigned char *)data + len - rlen, rlen);
sc->data_ptr = rlen;
}
#endif
/* see sph_panama.h */
void
sph_panama_close(void *cc, void *dst)
{
sph_panama_context *sc;
unsigned current;
int i;
sc = cc;
current = sc->data_ptr;
sc->data[current ++] = 0x01;
memset(sc->data + current, 0, (sizeof sc->data) - current);
panama_push(sc, sc->data, 1);
panama_pull(sc, 32);
for (i = 0; i < 8; i ++)
sph_enc32le((unsigned char *)dst + 4 * i, sc->state[i + 9]);
sph_panama_init(sc);
}

118
algo/panama/sph_panama.h Normal file
View File

@@ -0,0 +1,118 @@
/* $Id: sph_panama.h 154 2010-04-26 17:00:24Z tp $ */
/**
* PANAMA interface.
*
* PANAMA has been published in: J. Daemen and C. Clapp, "Fast Hashing
* and Stream Encryption with PANAMA", Fast Software Encryption -
* FSE'98, LNCS 1372, Springer (1998), pp. 60--74.
*
* PANAMA is not fully defined with regards to endianness and related
* topics. This implementation follows strict little-endian conventions:
* <ul>
* <li>Each 32-byte input block is split into eight 32-bit words, the
* first (leftmost) word being numbered 0.</li>
* <li>Each such 32-bit word is decoded from memory in little-endian
* convention.</li>
* <li>The additional padding bit equal to "1" is added by considering
* the least significant bit in a byte to come first; practically, this
* means that a single byte of value 0x01 is appended to the (byte-oriented)
* message, and then 0 to 31 bytes of value 0x00.</li>
* <li>The output consists of eight 32-bit words; the word numbered 0 is
* written first (in leftmost position) and it is encoded in little-endian
* convention.
* </ul>
* With these conventions, PANAMA is sometimes known as "PANAMA-LE". The
* PANAMA reference implementation uses our conventions for input, but
* prescribes no convention for output.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_panama.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_PANAMA_H__
#define SPH_PANAMA_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
/**
* Output size (in bits) for PANAMA.
*/
#define SPH_SIZE_panama 256
/**
* This structure is a context for PANAMA computations: it contains the
* intermediate values and some data from the last entered block. Once
* a PANAMA computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running PANAMA computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[32]; /* first field, for alignment */
unsigned data_ptr;
sph_u32 buffer[32][8];
unsigned buffer_ptr;
sph_u32 state[17];
#endif
} sph_panama_context;
/**
* Initialize a PANAMA context. This process performs no memory allocation.
*
* @param cc the PANAMA context (pointer to a <code>sph_panama_context</code>)
*/
void sph_panama_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the PANAMA context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_panama(void *cc, const void *data, size_t len);
/**
* Terminate the current PANAMA computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
 * accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the PANAMA context
* @param dst the destination buffer
*/
void sph_panama_close(void *cc, void *dst);
#endif
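A minimal usage sketch of this interface (assumes compiling and linking sph_panama.c from this commit):

#include <stdio.h>
#include "sph_panama.h"

int main()
{
   unsigned char out[ SPH_SIZE_panama / 8 ];
   sph_panama_context cc;

   sph_panama_init( &cc );
   sph_panama( &cc, "abc", 3 );
   sph_panama_close( &cc, out );   // writes 32 bytes, reinitializes cc

   for ( int i = 0; i < 32; i++ ) printf( "%02x", out[i] );
   printf( "\n" );
   return 0;
}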

View File

@@ -48,36 +48,36 @@ void anime_4way_hash( void *state, const void *input )
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
int i;
anime_4way_ctx_holder ctx;
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
bmw512_4way( &ctx.bmw, vhash, 80 );
bmw512_4way( &ctx.bmw, input, 80 );
bmw512_4way_close( &ctx.bmw, vhash );
blake512_4way( &ctx.blake, input, 64 );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
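The blend pattern above implements quark-style per-lane branching without branches: both candidate hashes are computed for all four lanes, then selected by a mask built from bit 3 of each lane. A condensed sketch (AVX2; the helper name is mine):

#include <immintrin.h>

// For each 64-bit lane: take b when (h & 8) == 0, otherwise a.
static inline __m256i select_by_bit3( __m256i h, __m256i a, __m256i b )
{
   // All-ones in lanes whose bit 3 is clear.
   __m256i mask = _mm256_cmpeq_epi64(
                     _mm256_and_si256( h, _mm256_set1_epi64x( 8 ) ),
                     _mm256_setzero_si256() );
   // blendv takes the second operand where the mask bytes are set.
   return _mm256_blendv_epi8( a, b, mask );
}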
@@ -120,13 +120,13 @@ void anime_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );

File diff suppressed because it is too large

View File

@@ -0,0 +1,186 @@
/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */
/**
* RadioGatun interface.
*
* RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters
* and G. Van Assche, "RadioGatun, a belt-and-mill hash function",
* presented at the Second Cryptographic Hash Workshop, Santa Barbara,
* August 24-25, 2006. The main Web site, containing that article, the
* reference code and some test vectors, appears to be currently located
* at the following URL: http://radiogatun.noekeon.org/
*
* The presentation article does not specify endianness or padding. The
* reference code uses the following conventions, which we also apply
* here:
* <ul>
* <li>The input message is an integral number of sequences of three
 * words. Each word is either a 32-bit or 64-bit word (depending on
* the version of RadioGatun).</li>
* <li>Input bytes are decoded into words using little-endian
* convention.</li>
* <li>Padding consists of a single bit of value 1, using little-endian
* convention within bytes (i.e. for a byte-oriented input, a single
* byte of value 0x01 is appended), then enough bits of value 0 to finish
* the current block.</li>
* <li>Output consists of 256 bits. Successive output words are encoded
* with little-endian convention.</li>
* </ul>
* These conventions are very close to those we use for PANAMA, which is
 * a close ancestor of RadioGatun.
*
* RadioGatun is actually a family of functions, depending on some
* internal parameters. We implement here two functions, with a "belt
* length" of 13, a "belt width" of 3, and a "mill length" of 19. The
* RadioGatun[32] version uses 32-bit words, while the RadioGatun[64]
* variant uses 64-bit words.
*
* Strictly speaking, the name "RadioGatun" should use an acute accent
* on the "u", which we omitted here to keep strict ASCII-compatibility
* of this file.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_radiogatun.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_RADIOGATUN_H__
#define SPH_RADIOGATUN_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
/**
* Output size (in bits) for RadioGatun[32].
*/
#define SPH_SIZE_radiogatun32 256
/**
* This structure is a context for RadioGatun[32] computations: it
* contains intermediate values and some data from the last entered
* block. Once a RadioGatun[32] computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running RadioGatun[32]
* computation can be cloned by copying the context (e.g. with a
* simple <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[156]; /* first field, for alignment */
unsigned data_ptr;
sph_u32 a[19], b[39];
#endif
} sph_radiogatun32_context;
/**
* Initialize a RadioGatun[32] context. This process performs no
* memory allocation.
*
* @param cc the RadioGatun[32] context (pointer to a
* <code>sph_radiogatun32_context</code>)
*/
void sph_radiogatun32_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RadioGatun[32] context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_radiogatun32(void *cc, const void *data, size_t len);
/**
* Terminate the current RadioGatun[32] computation and output the
* result into the provided buffer. The destination buffer must be wide
 * enough to accommodate the result (32 bytes). The context is
* automatically reinitialized.
*
* @param cc the RadioGatun[32] context
* @param dst the destination buffer
*/
void sph_radiogatun32_close(void *cc, void *dst);
#if SPH_64
/**
* Output size (in bits) for RadioGatun[64].
*/
#define SPH_SIZE_radiogatun64 256
/**
* This structure is a context for RadioGatun[64] computations: it
* contains intermediate values and some data from the last entered
* block. Once a RadioGatun[64] computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running RadioGatun[64]
* computation can be cloned by copying the context (e.g. with a
* simple <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[312]; /* first field, for alignment */
unsigned data_ptr;
sph_u64 a[19], b[39];
#endif
} sph_radiogatun64_context;
/**
* Initialize a RadioGatun[64] context. This process performs no
* memory allocation.
*
* @param cc the RadioGatun[64] context (pointer to a
* <code>sph_radiogatun64_context</code>)
*/
void sph_radiogatun64_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RadioGatun[64] context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_radiogatun64(void *cc, const void *data, size_t len);
/**
* Terminate the current RadioGatun[64] computation and output the
* result into the provided buffer. The destination buffer must be wide
 * enough to accommodate the result (32 bytes). The context is
* automatically reinitialized.
*
* @param cc the RadioGatun[64] context
* @param dst the destination buffer
*/
void sph_radiogatun64_close(void *cc, void *dst);
#endif
#endif
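Since a running computation can be cloned by copying the context, two messages sharing a long prefix can reuse the absorbed state; a sketch (assumes linking the RadioGatun implementation from this commit):

#include <stdio.h>
#include <string.h>
#include "sph_radiogatun.h"

int main()
{
   unsigned char h1[32], h2[32];
   sph_radiogatun32_context cc, snap;

   sph_radiogatun32_init( &cc );
   sph_radiogatun32( &cc, "common prefix ", 14 );
   memcpy( &snap, &cc, sizeof cc );        // clone the running state

   sph_radiogatun32( &cc, "A", 1 );
   sph_radiogatun32_close( &cc, h1 );

   sph_radiogatun32( &snap, "B", 1 );
   sph_radiogatun32_close( &snap, h2 );    // prefix was hashed only once

   printf( "%02x... %02x...\n", h1[0], h2[0] );
   return 0;
}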

View File

@@ -4,10 +4,13 @@
#include "algo-gate-api.h"
#include <stdint.h>
// Override multi way on Ryzen, SHA is better.
#if !defined(RYZEN_)
// Need SHA-512 2 way AVX x2, or 1 way scalar x4, to support 4 way AVX.
#if defined(__AVX2__)
#define LBRY_8WAY
#endif
#endif
#define LBRY_NTIME_INDEX 25
#define LBRY_NBITS_INDEX 26

View File

@@ -4,24 +4,17 @@
#include <string.h>
#include <stdio.h>
#include "sph_ripemd.h"
#include "algo/sha/sph_sha2.h"
#include <openssl/sha.h>
void lbry_hash(void* output, const void* input)
{
#ifndef USE_SPH_SHA
SHA256_CTX ctx_sha256 __attribute__ ((aligned (64)));
SHA512_CTX ctx_sha512 __attribute__ ((aligned (64)));
#else
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
sph_sha512_context ctx_sha512 __attribute__ ((aligned (64)));
#endif
sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) hashA[16];
uint32_t _ALIGN(64) hashB[16];
uint32_t _ALIGN(64) hashC[16];
#ifndef USE_SPH_SHA
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, input, 112 );
SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
@@ -33,19 +26,6 @@ void lbry_hash(void* output, const void* input)
SHA512_Init( &ctx_sha512 );
SHA512_Update( &ctx_sha512, hashA, 32 );
SHA512_Final( (unsigned char*) hashA, &ctx_sha512 );
#else
sph_sha256_init( &ctx_sha256 );
sph_sha256 ( &ctx_sha256, input, 112 );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha256_init( &ctx_sha256 );
sph_sha256 ( &ctx_sha256, hashA, 32 );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha512_init( &ctx_sha512 );
sph_sha512 ( &ctx_sha512, hashA, 32 );
sph_sha512_close( &ctx_sha512, hashA );
#endif
sph_ripemd160_init( &ctx_ripemd );
sph_ripemd160 ( &ctx_ripemd, hashA, 32 );
@@ -55,7 +35,6 @@ void lbry_hash(void* output, const void* input)
sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
sph_ripemd160_close( &ctx_ripemd, hashC );
#ifndef USE_SPH_SHA
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, hashB, 20 );
SHA256_Update( &ctx_sha256, hashC, 20 );
@@ -64,16 +43,7 @@ void lbry_hash(void* output, const void* input)
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
#else
sph_sha256_init( &ctx_sha256 );
sph_sha256 ( &ctx_sha256, hashB, 20 );
sph_sha256 ( &ctx_sha256, hashC, 20 );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha256_init( &ctx_sha256 );
sph_sha256 ( &ctx_sha256, hashA, 32 );
sph_sha256_close( &ctx_sha256, hashA );
#endif
memcpy( output, hashA, 32 );
}

View File

@@ -296,6 +296,7 @@ get_xgetbv(uint32_t flags) {
size_t cpu_detect_mask = (size_t)-1;
#endif
#if 0
static size_t
detect_cpu(void) {
union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
@@ -354,6 +355,7 @@ detect_cpu(void) {
return cpu_flags;
}
#endif
#if defined(SCRYPT_TEST_SPEED)
static const char *

View File

@@ -28,8 +28,8 @@ void sha256t_8way_hash( void* output, const void* input )
}
int scanhash_sha256t_8way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done )
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
@@ -39,9 +39,8 @@ int scanhash_sha256t_8way( int thr_id, struct work *work,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 152; // 19*8
__m256i *noncev = (__m256i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint64_t htmax[] = { 0,
0xF,
@@ -56,27 +55,25 @@ int scanhash_sha256t_8way( int thr_id, struct work *work,
0xFFFF0000,
0 };
for ( int k = 0; k < 20; k++ )
be32enc( &edata[k], pdata[k] );
// Need big endian data
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
be32enc( noncep +4, n+4 );
be32enc( noncep +5, n+5 );
be32enc( noncep +6, n+6 );
be32enc( noncep +7, n+7 );
pdata[19] = n;
do
{
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
pdata[19] = n;
sha256t_8way_hash( hash, vdata );
@@ -91,20 +88,24 @@ int scanhash_sha256t_8way( int thr_id, struct work *work,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#elif defined(SHA256T_4WAY)
@@ -130,8 +131,8 @@ void sha256t_4way_hash( void* output, const void* input )
}
int scanhash_sha256t_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done )
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
@@ -143,9 +144,8 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint64_t htmax[] = { 0,
0xF,
@@ -160,8 +160,11 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
0xFFFF0000,
0 };
for ( int k = 0; k < 19; k++ )
be32enc( &edata[k], pdata[k] );
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
sha256_4way_init( &sha256_ctx4 );
@@ -171,11 +174,8 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
{
uint32_t mask = masks[m];
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
pdata[19] = n;
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
pdata[19] = n;
sha256t_4way_hash( hash, vdata );
@@ -186,21 +186,25 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -4,12 +4,15 @@
#include <stdint.h>
#include "algo-gate-api.h"
// Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
#if defined(__SSE4_2__)
#define SHA256T_4WAY
#endif
#if defined(__AVX2__)
#define SHA256T_8WAY
#endif
#endif
bool register_blake2s_algo( algo_gate_t* gate );
@@ -17,18 +20,18 @@ bool register_blake2s_algo( algo_gate_t* gate );
void sha256t_8way_hash( void *output, const void *input );
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined (SHA256T_4WAY)
void sha256t_4way_hash( void *output, const void *input );
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha256t_hash( void *output, const void *input );
int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -39,7 +39,7 @@ void sha256t_hash( void* output, const void* input )
}
int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -52,6 +52,7 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t hash64[8] __attribute__((aligned(32)));
#endif
uint32_t endiandata[32];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = {
0,

View File

@@ -248,22 +248,22 @@ do { \
*/
#define SWAP_BC \
do { \
mm128_swap256_128( B0, C0 ); \
mm128_swap256_128( B1, C1 ); \
mm128_swap256_128( B2, C2 ); \
mm128_swap256_128( B3, C3 ); \
mm128_swap256_128( B4, C4 ); \
mm128_swap256_128( B5, C5 ); \
mm128_swap256_128( B6, C6 ); \
mm128_swap256_128( B7, C7 ); \
mm128_swap256_128( B8, C8 ); \
mm128_swap256_128( B9, C9 ); \
mm128_swap256_128( BA, CA ); \
mm128_swap256_128( BB, CB ); \
mm128_swap256_128( BC, CC ); \
mm128_swap256_128( BD, CD ); \
mm128_swap256_128( BE, CE ); \
mm128_swap256_128( BF, CF ); \
mm128_swap128_256( B0, C0 ); \
mm128_swap128_256( B1, C1 ); \
mm128_swap128_256( B2, C2 ); \
mm128_swap128_256( B3, C3 ); \
mm128_swap128_256( B4, C4 ); \
mm128_swap128_256( B5, C5 ); \
mm128_swap128_256( B6, C6 ); \
mm128_swap128_256( B7, C7 ); \
mm128_swap128_256( B8, C8 ); \
mm128_swap128_256( B9, C9 ); \
mm128_swap128_256( BA, CA ); \
mm128_swap128_256( BB, CB ); \
mm128_swap128_256( BC, CC ); \
mm128_swap128_256( BD, CD ); \
mm128_swap128_256( BE, CE ); \
mm128_swap128_256( BF, CF ); \
} while (0)
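// A minimal sketch of the renamed helper, assuming it simply exchanges the
// two __m128i registers holding the halves of a 256-bit state (the rename
// brings the macro in line with the width-ordering naming convention):

#define mm128_swap128_256_sketch( a, b ) \
do { \
   __m128i t = a; \
   a = b; \
   b = t; \
} while (0)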
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \


@@ -0,0 +1,406 @@
#include "shavite-hash-2way.h"
#include "algo/sha/sph_types.h"
#include <stdio.h>
#if defined(__AVX2__)
static const uint32_t IV512[] =
{
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror1x32_128( a ), \
mm256_ror1x32_128( b ), 0x88 )
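// Per 128-bit lane this yields { a1, a2, a3, b0 }: the 2-way analogue of the
// scalar code's mm128_ror256hi_1x32 used in the key schedule below.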
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
{
__m256i p0, p1, p2, p3, x;
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
__m256i *m = (__m256i*)msg;
__m256i *h = (__m256i*)ctx->h;
int r;
p0 = h[0];
p1 = h[1];
p2 = h[2];
p3 = h[3];
// round
k00 = m[0];
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ) );
k01 = m[1];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = m[2];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = m[3];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p0 = _mm256_xor_si256( p0, x );
k10 = m[4];
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ) );
k11 = m[5];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = m[6];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = m[7];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p2 = _mm256_xor_si256( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
mm256_aesenc_2x128( k00 ) ) );
if ( r == 0 )
k00 = _mm256_xor_si256( k00, _mm256_set_epi32(
~ctx->count3, ctx->count2, ctx->count1, ctx->count0,
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ) );
k01 = _mm256_xor_si256( k00,
mm256_ror1x32_128( mm256_aesenc_2x128( k01 ) ) );
if ( r == 1 )
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
~ctx->count0, ctx->count1, ctx->count2, ctx->count3,
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = _mm256_xor_si256( k01,
mm256_ror1x32_128( mm256_aesenc_2x128( k02 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = _mm256_xor_si256( k02,
mm256_ror1x32_128( mm256_aesenc_2x128( k03 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( k03,
mm256_ror1x32_128( mm256_aesenc_2x128( k10 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ) );
k11 = _mm256_xor_si256( k10,
mm256_ror1x32_128( mm256_aesenc_2x128( k11 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = _mm256_xor_si256( k11,
mm256_ror1x32_128( mm256_aesenc_2x128( k12 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = _mm256_xor_si256( k12,
mm256_ror1x32_128( mm256_aesenc_2x128( k13 ) ) );
if ( r == 2 )
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
~ctx->count1, ctx->count0, ctx->count3, ctx->count2,
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p1 = _mm256_xor_si256( p1, x );
// round 2, 6, 10
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ) );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p2 = _mm256_xor_si256( p2, x );
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ) );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p0 = _mm256_xor_si256( p0, x );
// round 3, 7, 11
k00 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k00 ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ) );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k01 ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k02 ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k03 ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p1 = _mm256_xor_si256( p1, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k10 ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ) );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k11 ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k12 ) ), k11 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k13 ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p3 = _mm256_xor_si256( p3, x );
// round 4, 8, 12
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ) );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p0 = _mm256_xor_si256( p0, x );
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ) );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p2 = _mm256_xor_si256( p2, x );
}
// round 13
k00 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k00 ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ) );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k01 ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k02 ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k03 ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k10 ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ) );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k11 ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12 ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k13 ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p1 = _mm256_xor_si256( p1, x );
h[0] = _mm256_xor_si256( h[0], p2 );
h[1] = _mm256_xor_si256( h[1], p3 );
h[2] = _mm256_xor_si256( h[2], p0 );
h[3] = _mm256_xor_si256( h[3], p1 );
}
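// A sketch of the 2x128 AES round helper assumed above: AVX2 has no 256-bit
// aesenc, so each 128-bit lane gets one AES-NI round with a zero round key,
// matching the scalar code's _mm_aesenc_si128( x, m128_zero ). Illustrative
// (needs <immintrin.h>), not the avxdefs.h definition:

static inline __m256i mm256_aesenc_2x128_sketch( __m256i x )
{
   __m128i lo = _mm256_castsi256_si128( x );
   __m128i hi = _mm256_extracti128_si256( x, 1 );
   lo = _mm_aesenc_si128( lo, _mm_setzero_si128() );
   hi = _mm_aesenc_si128( hi, _mm_setzero_si128() );
   return _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
}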
void shavite512_2way_init( shavite512_2way_context *ctx )
{
casti_m256i( ctx->h, 0 ) =
_mm256_set_epi32( IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0],
IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0] );
casti_m256i( ctx->h, 1 ) =
_mm256_set_epi32( IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4],
IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4] );
casti_m256i( ctx->h, 2 ) =
_mm256_set_epi32( IV512[11], IV512[10], IV512[ 9], IV512[ 8],
IV512[11], IV512[10], IV512[ 9], IV512[ 8] );
casti_m256i( ctx->h, 3 ) =
_mm256_set_epi32( IV512[15], IV512[14], IV512[13], IV512[12],
IV512[15], IV512[14], IV512[13], IV512[12] );
ctx->ptr = 0;
ctx->count0 = 0;
ctx->count1 = 0;
ctx->count2 = 0;
ctx->count3 = 0;
}
void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
size_t len )
{
unsigned char *buf = ctx->buf;
size_t ptr = ctx->ptr;
while ( len > 0 )
{
size_t clen;
clen = (sizeof ctx->buf) - ptr;
if ( clen > len << 1 )
clen = len << 1;
memcpy( buf + ptr, data, clen );
data = (const unsigned char *)data + clen;
ptr += clen;
len -= clen >> 1;
if ( ptr == sizeof ctx->buf )
{
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
{
ctx->count1 = ctx->count1 + 1;
if ( ctx->count1 == 0 )
{
ctx->count2 = ctx->count2 + 1;
if ( ctx->count2 == 0 )
ctx->count3 = ctx->count3 + 1;
}
}
c512_2way( ctx, buf );
ptr = 0;
}
}
ctx->ptr = ptr;
}
void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
{
unsigned char *buf;
union
{
uint32_t u32[4];
uint16_t u16[8];
} count;
buf = ctx->buf;
uint32_t vp = ctx->ptr>>5;
// Terminating byte then zero pad
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
// Zero pad full vectors up to count
for ( ; vp < 6; vp++ )
casti_m256i( buf, vp ) = m256_zero;
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
// Count is misaligned to 16 bits and straddles a vector.
// Use u32 overlay to stage then u16 to load buf.
count.u32[0] = ctx->count0 += (ctx->ptr << 2); // bits = (ptr/2) * 8, buf is 2-way interleaved
count.u32[1] = ctx->count1;
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
count.u16[0], 0,0,0,0,0,0,0 );
casti_m256i( buf, 7 ) = _mm256_set_epi16(
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1],
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
c512_2way( ctx, buf );
casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 );
casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 );
casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
}
void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len )
{
unsigned char *buf = ctx->buf;
size_t ptr = ctx->ptr;
// Process full blocks and leave the remainder in buf.
while ( len > 0 )
{
size_t clen;
clen = (sizeof ctx->buf) - ptr;
if ( clen > len << 1 )
clen = len << 1;
memcpy( buf + ptr, data, clen );
data = (const unsigned char *)data + clen;
ptr += clen;
len -= clen >> 1;
if ( ptr == sizeof ctx->buf )
{
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
{
ctx->count1 = ctx->count1 + 1;
if ( ctx->count1 == 0 )
{
ctx->count2 = ctx->count2 + 1;
if ( ctx->count2 == 0 )
ctx->count3 = ctx->count3 + 1;
}
}
c512_2way( ctx, buf );
ptr = 0;
}
}
uint32_t vp = ptr>>5;
// Terminating byte then zero pad
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
// Zero pad full vectors up to count
for ( ; vp < 6; vp++ )
casti_m256i( buf, vp ) = m256_zero;
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
// Count is misaligned to 16 bits and straddles a vector.
// Use u32 overlay to stage then u16 to load buf.
union
{
uint32_t u32[4];
uint16_t u16[8];
} count;
count.u32[0] = ctx->count0 += (ptr << 2); // bits = (ptr/2) * 8, buf is 2-way interleaved
count.u32[1] = ctx->count1;
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
count.u16[0], 0,0,0,0,0,0,0 );
casti_m256i( buf, 7 ) = _mm256_set_epi16(
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1],
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
c512_2way( ctx, buf );
casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 );
casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 );
casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
}
#endif // AVX2


@@ -0,0 +1,25 @@
#ifndef SHAVITE_HASH_2WAY_H__
#define SHAVITE_HASH_2WAY_H__
#if defined(__AVX2__)
#include "avxdefs.h"
typedef struct {
unsigned char buf[128<<1];
uint32_t h[16<<1];
size_t ptr;
uint32_t count0, count1, count2, count3;
} shavite512_2way_context __attribute__ ((aligned (64)));
void shavite512_2way_init( shavite512_2way_context *ctx );
void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
size_t len );
void shavite512_2way_close( shavite512_2way_context *ctx, void *dst );
void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len );
#endif // AVX2
#endif // SHAVITE_HASH_2WAY_H__
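// Usage sketch for the 2-way context, assuming the avxdefs.h helpers
// mm256_interleave_2x128 / mm256_deinterleave_2x128 seen elsewhere in this
// commit pack two independent streams lane-wise (illustrative only):

void shavite512_2way_example( void *out0, void *out1,
                              const void *in0, const void *in1 )
{
   uint64_t vin [8*2] __attribute__ ((aligned (64)));   // 64 bytes per lane
   uint64_t vout[8*2] __attribute__ ((aligned (64)));
   shavite512_2way_context ctx;

   mm256_interleave_2x128( vin, in0, in1, 512 );
   shavite512_2way_init( &ctx );
   shavite512_2way_update_close( &ctx, vout, vin, 64 );
   mm256_deinterleave_2x128( out0, out1, vout, 512 );
}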


@@ -102,35 +102,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = m[2];
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = m[3];
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
k10 = m[4];
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = m[5];
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = m[6];
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = m[7];
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
for ( r = 0; r < 3; r ++ )
@@ -156,15 +152,15 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
@@ -172,12 +168,10 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
@@ -196,118 +190,103 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
// round 3, 7, 11
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
}
@@ -315,46 +294,41 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
h[0] = _mm_xor_si128( h[0], p2 );
@@ -427,6 +401,9 @@ shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
count1 = sc->count1;
count2 = sc->count2;
count3 = sc->count3;
z = 0x80 >> n;
z = ((ub & -z) | z) & 0xFF;
if (ptr == 0 && n == 0) {
@@ -443,6 +420,7 @@ shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
memset(buf, 0, 110);
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
}
sph_enc32le(buf + 110, count0);
sph_enc32le(buf + 114, count1);
sph_enc32le(buf + 118, count2);
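// sph_enc32le stores a 32-bit word little-endian at an arbitrary byte
// address; a minimal sketch of that primitive for reference:

static inline void enc32le_sketch( unsigned char *dst, uint32_t v )
{
   dst[0] = (unsigned char)  v;
   dst[1] = (unsigned char)( v >> 8 );
   dst[2] = (unsigned char)( v >> 16 );
   dst[3] = (unsigned char)( v >> 24 );
}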

File diff suppressed because it is too large.

@@ -3,9 +3,12 @@
#include <stdint.h>
#include "algo-gate-api.h"
// Override multi-way on Ryzen, SHA is better.
#if !defined(RYZEN_)
#if defined(__AVX2__)
#define SKEIN_4WAY
#endif
#endif
#if defined(SKEIN_4WAY)


@@ -3,31 +3,20 @@
#include <stdint.h>
#include "sph_skein.h"
#include <openssl/sha.h>
#include "algo/sha/sph_sha2.h"
void skeinhash(void *state, const void *input)
{
uint32_t hash[16] __attribute__ ((aligned (64)));
sph_skein512_context ctx_skein;
#ifndef USE_SPH_SHA
SHA256_CTX ctx_sha256;
#else
sph_sha256_context ctx_sha256;
#endif
sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, input, 80 );
sph_skein512_close( &ctx_skein, hash );
#ifndef USE_SPH_SHA
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash, 64 );
SHA256_Final( (unsigned char*) hash, &ctx_sha256 );
#else
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash, 64 );
sph_sha256_close( &ctx_sha256, hash );
#endif
memcpy(state, hash, 32);
}
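// The streaming Init/Update/Final form above matches the rest of the file.
// For reference, a sketch of the same final step using OpenSSL's one-shot
// wrapper (function and buffer names here are illustrative):

static inline void sha256_once_sketch( void *digest, const void *data )
{
   SHA256( (const unsigned char*)data, 64, (unsigned char*)digest );
}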


@@ -114,11 +114,11 @@ void x12_4way_hash( void *state, const void *input )
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite


@@ -17,8 +17,6 @@
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
//#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
@@ -27,45 +25,42 @@
#include "algo/keccak/sse2/keccak.c"
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo;
#else
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
sph_hamsi512_context hamsi;
// sph_fugue512_context fugue;
} x12_ctx_holder;
x12_ctx_holder x12_ctx;
void init_x12_ctx()
{
#ifdef NO_AES_NI
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#else
#if defined(__AES__)
init_echo( &x12_ctx.echo, 512 );
init_groestl (&x12_ctx.groestl, 64 );
#else
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#endif
init_luffa( &x12_ctx.luffa, 512 );
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
init_sd( &x12_ctx.simd, 512 );
sph_hamsi512_init( &x12_ctx.hamsi );
// sph_fugue512_init( &x13_ctx.fugue );
};
void x12hash(void *output, const void *input)
@@ -108,12 +103,12 @@ void x12hash(void *output, const void *input)
//---groetl----
#ifdef NO_AES_NI
sph_groestl512 (&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#else
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512 (&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
//---skein4---
@@ -153,23 +148,18 @@ void x12hash(void *output, const void *input)
//11---echo---
#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hashB);
#else
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
(const BitSequence *)hash, 512 );
#else
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hashB);
#endif
// 12 Hamsi
sph_hamsi512(&ctx.hamsi, hashB, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
/*
// 13 Fugue
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hashB);
*/
asm volatile ("emms");
memcpy(output, hashB, 32);
}


@@ -183,16 +183,16 @@ void x16r_4way_hash( void* output, const void* input )
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
break;
case CUBEHASH:
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
(const byte*)in0, size );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
(const byte*)in1, size );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
(const byte*)in2, size );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
(const byte*)in3, size );
break;


@@ -25,7 +25,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include <openssl/sha.h>
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
@@ -34,12 +34,12 @@ static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo;
#else
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_blake512_context blake;
sph_bmw512_context bmw;
@@ -95,14 +95,14 @@ void x16r_hash( void* output, const void* input )
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#ifdef NO_AES_NI
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#else
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
@@ -141,14 +141,14 @@ void x16r_hash( void* output, const void* input )
(const BitSequence*)in, size<<3 );
break;
case ECHO:
#ifdef NO_AES_NI
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#else
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:


@@ -16,10 +16,9 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include <openssl/sha.h>
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
@@ -42,18 +41,14 @@ typedef struct {
sph_fugue512_context fugue1, fugue2;
sph_shabal512_context shabal1;
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
#ifndef USE_SPH_SHA
SHA512_CTX sha1, sha2;
#else
sph_sha512_context sha1, sha2;
#endif
sph_haval256_5_context haval1, haval2;
#ifdef NO_AES_NI
sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2;
#else
#if defined(__AES__)
hashState_echo echo1, echo2;
hashState_groestl groestl1, groestl2;
#else
sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2;
#endif
} hmq1725_ctx_holder;
@@ -101,26 +96,22 @@ void init_hmq1725_ctx()
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
#ifndef USE_SPH_SHA
SHA512_Init( &hmq1725_ctx.sha1 );
SHA512_Init( &hmq1725_ctx.sha2 );
#else
sph_sha512_init(&hmq1725_ctx.sha1);
sph_sha512_init(&hmq1725_ctx.sha2);
#endif
sph_haval256_5_init(&hmq1725_ctx.haval1);
sph_haval256_5_init(&hmq1725_ctx.haval2);
#ifdef NO_AES_NI
sph_groestl512_init( &hmq1725_ctx.groestl1 );
sph_groestl512_init( &hmq1725_ctx.groestl2 );
sph_echo512_init( &hmq1725_ctx.echo1 );
sph_echo512_init( &hmq1725_ctx.echo2 );
#else
#if defined(__AES__)
init_echo( &hmq1725_ctx.echo1, 512 );
init_echo( &hmq1725_ctx.echo2, 512 );
init_groestl( &hmq1725_ctx.groestl1, 64 );
init_groestl( &hmq1725_ctx.groestl2, 64 );
#else
sph_groestl512_init( &hmq1725_ctx.groestl1 );
sph_groestl512_init( &hmq1725_ctx.groestl2 );
sph_echo512_init( &hmq1725_ctx.echo1 );
sph_echo512_init( &hmq1725_ctx.echo2 );
#endif
}
@@ -151,12 +142,12 @@ extern void hmq1725hash(void *state, const void *input)
if ( hashB[0] & mask ) //1
{
#ifdef NO_AES_NI
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
(const char*)hashB, 512 );
#else
sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
sph_groestl512_close(&h_ctx.groestl1, hashA); //2
#else
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
(const char*)hashB, 512 );
#endif
}
else
@@ -217,12 +208,12 @@ extern void hmq1725hash(void *state, const void *input)
memset(&hashB[8], 0, 32);
}
#ifdef NO_AES_NI
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
sph_echo512_close(&h_ctx.echo1, hashA); //6
#else
#if defined(__AES__)
update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#else
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
sph_echo512_close(&h_ctx.echo1, hashA); //6
#endif
sph_blake512 (&h_ctx.blake2, hashA, 64); //6
@@ -247,12 +238,12 @@ extern void hmq1725hash(void *state, const void *input)
if ( hashA[0] & mask ) //4
{
#ifdef NO_AES_NI
sph_echo512 (&h_ctx.echo2, hashA, 64); //
sph_echo512_close(&h_ctx.echo2, hashB); //5
#else
#if defined(__AES__)
update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#else
sph_echo512 (&h_ctx.echo2, hashA, 64); //
sph_echo512_close(&h_ctx.echo2, hashB); //5
#endif
}
else
@@ -274,30 +265,20 @@ extern void hmq1725hash(void *state, const void *input)
}
else
{
#ifndef USE_SPH_SHA
SHA512_Update( &h_ctx.sha1, hashB, 64 );
SHA512_Final( (unsigned char*) hashA, &h_ctx.sha1 );
#else
sph_sha512 (&h_ctx.sha1, hashB, 64); //7
sph_sha512_close(&h_ctx.sha1, hashA); //8
#endif
}
#ifdef NO_AES_NI
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
#else
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
(const char*)hashA, 512 );
#else
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
#endif
#ifndef USE_SPH_SHA
SHA512_Update( &h_ctx.sha2, hashB, 64 );
SHA512_Final( (unsigned char*) hashA, &h_ctx.sha2 );
#else
sph_sha512 (&h_ctx.sha2, hashB, 64); //2
sph_sha512_close(&h_ctx.sha2, hashA); //3
#endif
if ( hashA[0] & mask ) //4
{

algo/x17/sonoa-4way.c (new file, 872 lines)

@@ -0,0 +1,872 @@
#include "sonoa-gate.h"
#if defined(SONOA_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
} sonoa_4way_ctx_holder;
sonoa_4way_ctx_holder sonoa_4way_ctx __attribute__ ((aligned (64)));
void init_sonoa_4way_ctx()
{
blake512_4way_init( &sonoa_4way_ctx.blake );
bmw512_4way_init( &sonoa_4way_ctx.bmw );
init_groestl( &sonoa_4way_ctx.groestl, 64 );
skein512_4way_init( &sonoa_4way_ctx.skein );
jh512_4way_init( &sonoa_4way_ctx.jh );
keccak512_4way_init( &sonoa_4way_ctx.keccak );
luffa_2way_init( &sonoa_4way_ctx.luffa, 512 );
cube_2way_init( &sonoa_4way_ctx.cube, 512, 16, 32 );
shavite512_2way_init( &sonoa_4way_ctx.shavite );
simd_2way_init( &sonoa_4way_ctx.simd, 512 );
init_echo( &sonoa_4way_ctx.echo, 512 );
hamsi512_4way_init( &sonoa_4way_ctx.hamsi );
sph_fugue512_init( &sonoa_4way_ctx.fugue );
shabal512_4way_init( &sonoa_4way_ctx.shabal );
sph_whirlpool_init( &sonoa_4way_ctx.whirlpool );
sha512_4way_init( &sonoa_4way_ctx.sha512 );
haval256_5_4way_init( &sonoa_4way_ctx.haval );
};
void sonoa_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) );
// 1
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 2
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
// 3
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 4
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm256_reinterleave_4x64( vhashB, vhash, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhashB, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_2x128( vhashA, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
// 5
mm256_reinterleave_2x128_4x64( vhash, vhashA, vhashB, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_reinterleave_4x32( vhashB, vhash, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 6
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 7
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x32( vhashB, vhash, 512 );
haval256_5_4way( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
}
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
// uint32_t *noncep = vdata + 73; // 9*8 + 1
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// Need big endian data
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_interleave_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
*noncev );
sonoa_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
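
For readers new to the 4-way code: shabal512_4way and haval256_5_4way emit their output in the 4x32 interleaved layout, where word w of lane l lives at hash[(w<<2)+l]. That is why hash7 = &(hash[7<<2]) lets the scan loop above test word 7 of every lane with a plain array read before paying for mm128_extract_lane_4x32 and fulltest. A standalone scalar sketch of that layout (illustrative only; the mm128_* helpers are the miner's own intrinsics wrappers):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the 4x32 interleaved layout:
   word w of lane l sits at v[(w<<2)+l]. */
static void interleave_4x32( uint32_t *v, const uint32_t *h0,
                             const uint32_t *h1, const uint32_t *h2,
                             const uint32_t *h3, int nwords )
{
    for ( int w = 0; w < nwords; w++ )
    {
        v[ (w<<2) + 0 ] = h0[w];
        v[ (w<<2) + 1 ] = h1[w];
        v[ (w<<2) + 2 ] = h2[w];
        v[ (w<<2) + 3 ] = h3[w];
    }
}

int main(void)
{
    uint32_t h0[8], h1[8], h2[8], h3[8], v[4*8];
    for ( int w = 0; w < 8; w++ )
    {
        h0[w] = 0x00 + w;  h1[w] = 0x10 + w;
        h2[w] = 0x20 + w;  h3[w] = 0x30 + w;
    }
    interleave_4x32( v, h0, h1, h2, h3, 8 );
    uint32_t *hash7 = &v[7<<2];   // word 7 of all four lanes, contiguous
    for ( int lane = 0; lane < 4; lane++ )
        printf( "lane %d word7 = 0x%02x\n", lane, hash7[lane] );
    return 0;
}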

18
algo/x17/sonoa-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "sonoa-gate.h"
bool register_sonoa_algo( algo_gate_t* gate )
{
#if defined (SONOA_4WAY)
init_sonoa_4way_ctx();
gate->scanhash = (void*)&scanhash_sonoa_4way;
gate->hash = (void*)&sonoa_4way_hash;
#else
init_sonoa_ctx();
gate->scanhash = (void*)&scanhash_sonoa;
gate->hash = (void*)&sonoa_hash;
#endif
gate->get_max64 = (void*)&get_max64_0x1ffff;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

32
algo/x17/sonoa-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef SONOA_GATE_H__
#define SONOA_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define SONOA_4WAY
#endif
bool register_sonoa_algo( algo_gate_t* gate );
#if defined(SONOA_4WAY)
void sonoa_4way_hash( void *state, const void *input );
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_sonoa_4way_ctx();
#endif
void sonoa_hash( void *state, const void *input );
int scanhash_sonoa( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_sonoa_ctx();
#endif

648
algo/x17/sonoa.c Normal file
View File

@@ -0,0 +1,648 @@
#include "sonoa-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"
#include <openssl/sha.h>
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
typedef struct {
sph_blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
} sonoa_ctx_holder;
sonoa_ctx_holder sonoa_ctx __attribute__ ((aligned (64)));
void init_sonoa_ctx()
{
sph_blake512_init( &sonoa_ctx.blake);
sph_bmw512_init( &sonoa_ctx.bmw);
#if defined(__AES__)
init_echo( &sonoa_ctx.echo, 512 );
init_groestl( &sonoa_ctx.groestl, 64 );
#else
sph_groestl512_init(&sonoa_ctx.groestl );
sph_echo512_init( &sonoa_ctx.echo );
#endif
sph_skein512_init( &sonoa_ctx.skein);
sph_jh512_init( &sonoa_ctx.jh);
sph_keccak512_init( &sonoa_ctx.keccak );
init_luffa( &sonoa_ctx.luffa, 512 );
cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &sonoa_ctx.shavite );
init_sd( &sonoa_ctx.simd, 512 );
sph_hamsi512_init( &sonoa_ctx.hamsi );
sph_fugue512_init( &sonoa_ctx.fugue );
sph_shabal512_init( &sonoa_ctx.shabal );
sph_whirlpool_init( &sonoa_ctx.whirlpool );
SHA512_Init( &sonoa_ctx.sha512 );
sph_haval256_5_init(&sonoa_ctx.haval);
};
void sonoa_hash( void *state, const void *input )
{
uint8_t hash[128] __attribute__ ((aligned (64)));
sonoa_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sonoa_ctx, sizeof(sonoa_ctx) );
sph_blake512(&ctx.blake, input, 80);
sph_blake512_close(&ctx.blake, hash);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_fugue512_init( &ctx.fugue );
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close(&ctx.shabal, hash);
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
sph_shabal512_init( &ctx.shabal );
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close(&ctx.shabal, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_fugue512_init( &ctx.fugue );
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
sph_shabal512_init( &ctx.shabal );
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close(&ctx.shabal, hash);
sph_whirlpool(&ctx.whirlpool, hash, 64);
sph_whirlpool_close(&ctx.whirlpool, hash);
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_fugue512_init( &ctx.fugue );
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
sph_shabal512_init( &ctx.shabal );
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close(&ctx.shabal, hash);
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool(&ctx.whirlpool, hash, 64);
sph_whirlpool_close(&ctx.whirlpool, hash);
SHA512_Update( &ctx.sha512, hash, 64 );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool(&ctx.whirlpool, hash, 64);
sph_whirlpool_close(&ctx.whirlpool, hash);
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_fugue512_init( &ctx.fugue );
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
sph_shabal512_init( &ctx.shabal );
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close(&ctx.shabal, hash);
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool(&ctx.whirlpool, hash, 64);
sph_whirlpool_close(&ctx.whirlpool, hash);
SHA512_Init( &ctx.sha512 );
SHA512_Update( &ctx.sha512, hash, 64 );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
sph_haval256_5(&ctx.haval,(const void*) hash, 64);
sph_haval256_5_close(&ctx.haval, hash);
memcpy(state, hash, 32);
}
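
sonoa_hash starts from a memcpy of the pre-initialized global sonoa_ctx, so the first use of each primitive needs no *_init call; only later passes re-init before reuse. The idiom in miniature (illustrative only, with a toy stand-in context, not the real sph types):

#include <string.h>

typedef struct { int state; } toy_ctx;

static toy_ctx toy_template;            /* initialized once at startup */

static void toy_init( toy_ctx *c ) { c->state = 42; }

void setup_once( void ) { toy_init( &toy_template ); }

void per_call( void )
{
    toy_ctx ctx;
    memcpy( &ctx, &toy_template, sizeof(ctx) );  /* cheap "re-init" */
    /* first use of ctx here needs no toy_init();
       a second use later in the call re-runs toy_init( &ctx ). */
}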
int scanhash_sonoa( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19] - 1;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] =
{
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] =
{
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need bigendian data...
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
#ifdef DEBUG_ALGO
printf("[%d] Htarg=%X\n", thr_id, Htarg);
#endif
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
pdata[19] = ++n;
be32enc(&endiandata[19], n);
sonoa_hash(hash32, endiandata);
#ifndef DEBUG_ALGO
if ( ( !( hash32[7] & mask ) ) && fulltest( hash32, ptarget ) )
{
work_set_target_ratio( work, hash32 );
*hashes_done = n - first_nonce + 1;
return 1;
}
#else
if (!(n % 0x1000) && !thr_id) printf(".");
if ( !(hash32[7] & mask) )
{
printf("[%d]",thr_id);
if ( fulltest( hash32, ptarget ) )
{
work_set_target_ratio( work, hash32 );
*hashes_done = n - first_nonce + 1;
return 1;
}
}
#endif
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c if else to understand the loop on htmax => mask
break;
}
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
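
The htmax/masks pairing above (also used by scanhash_x17 further down) is a cheap pre-filter: once the target's word 7 (Htarg) is at most htmax[m], any hash that could pass fulltest must have every bit of masks[m] clear in hash32[7], so a single AND rejects most candidates before the full 256-bit comparison. A small self-contained illustration of the idea (hypothetical values, not miner code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint32_t Htarg = 0xFF;        /* suppose target word 7 <= 0xFF      */
    const uint32_t mask  = 0xFFFFFF00;  /* then a winner needs these bits clear */

    uint32_t candidates[] = { 0x00000042, 0x12345678, 0x000000FE, 0x80000000 };
    for ( unsigned i = 0; i < sizeof(candidates)/sizeof(candidates[0]); i++ )
    {
        uint32_t h7 = candidates[i];
        if ( ( h7 & mask ) == 0 )
            printf( "0x%08X may beat Htarg=0x%X, run fulltest\n", h7, Htarg );
        else
            printf( "0x%08X rejected by the mask alone\n", h7 );
    }
    return 0;
}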

View File

@@ -15,6 +15,7 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -24,7 +25,9 @@
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
typedef struct {
//typedef struct {
union _x17_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
@@ -33,7 +36,7 @@ typedef struct {
keccak512_4way_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
sph_shavite512_context shavite;
shavite512_2way_context shavite;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
@@ -42,8 +45,10 @@ typedef struct {
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
} x17_4way_ctx_holder;
};
typedef union _x17_4way_context_overlay x17_4way_context_overlay;
/*
x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64)));
void init_x17_4way_ctx()
@@ -56,16 +61,17 @@ void init_x17_4way_ctx()
keccak512_4way_init( &x17_4way_ctx.keccak );
luffa_2way_init( &x17_4way_ctx.luffa, 512 );
cube_2way_init( &x17_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x17_4way_ctx.shavite );
shavite512_2way_init( &x17_4way_ctx.shavite );
simd_2way_init( &x17_4way_ctx.simd, 512 );
init_echo( &x17_4way_ctx.echo, 512 );
hamsi512_4way_init( &x17_4way_ctx.hamsi );
sph_fugue512_init( &x17_4way_ctx.fugue );
shabal512_4way_init( &x17_4way_ctx.shabal );
sph_whirlpool_init( &x17_4way_ctx.whirlpool );
sha512_4way_init( &x17_4way_ctx.sha512 );
haval256_5_4way_init( &x17_4way_ctx.haval );
};
*/
void x17_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -73,155 +79,159 @@ void x17_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
x17_4way_ctx_holder ctx;
memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) );
x17_4way_context_overlay ctx;
// memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) );
// 1 Blake 4 way 64 bit
// 1 Blake parallel 4 way 64 bit
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
// Serialize
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
// Parallelize
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
// 4 Skein parallel 4 way 64 bit
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa parallel 2 way 128 bit
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
// 7 Luffa parallel 2 way
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
// 8 Cubehash parallel 2 way
cube_2way_update_close( &ctx.cube, vhash, vhash, 64 );
cube_2way_reinit( &ctx.cube );
// 8 Cubehash
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
// 9 Shavite
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
// 10 Simd
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
// 9 Shavite serial
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd parallel 2 way 128 bit
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 11 Echo serial
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4 way 64 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue serial
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 4 way 32 bit SSE
// 14 Shabal, parallel 4 way 32 bit
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool
// 15 Whirlpool serial
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 16 SHA512 parallel 64 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
mm256_reinterleave_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
}
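
Most of the reshuffling in x17_4way_hash is between the 4x64 layout (one 64-bit word per lane, consumed by the blake/bmw/skein/jh/keccak/hamsi/sha512 4-way cores) and the 2x128 layout used by the luffa/cube/shavite/simd 2-way cores. A scalar reference for the 4x64 round trip, illustrative only and independent of the mm256_* intrinsics wrappers:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Word w of lane l of a 4x64-interleaved vector sits at v[(w<<2)+l]. */
static void interleave_4x64( uint64_t *v, const uint64_t *h0, const uint64_t *h1,
                             const uint64_t *h2, const uint64_t *h3, int bits )
{
    for ( int w = 0; w < bits/64; w++ )
    {
        v[(w<<2)+0] = h0[w];  v[(w<<2)+1] = h1[w];
        v[(w<<2)+2] = h2[w];  v[(w<<2)+3] = h3[w];
    }
}

static void deinterleave_4x64( uint64_t *h0, uint64_t *h1, uint64_t *h2,
                               uint64_t *h3, const uint64_t *v, int bits )
{
    for ( int w = 0; w < bits/64; w++ )
    {
        h0[w] = v[(w<<2)+0];  h1[w] = v[(w<<2)+1];
        h2[w] = v[(w<<2)+2];  h3[w] = v[(w<<2)+3];
    }
}

int main(void)
{
    uint64_t a[8], b[8], c[8], d[8], v[32], a2[8], b2[8], c2[8], d2[8];
    for ( int w = 0; w < 8; w++ )
    { a[w] = w;  b[w] = 0x100+w;  c[w] = 0x200+w;  d[w] = 0x300+w; }
    interleave_4x64( v, a, b, c, d, 512 );
    deinterleave_4x64( a2, b2, c2, d2, v, 512 );
    assert( !memcmp(a,a2,64) && !memcmp(b,b2,64) &&
            !memcmp(c,c2,64) && !memcmp(d,d2,64) );
    return 0;
}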
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
@@ -234,49 +244,48 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
// Need big endian data
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_interleave_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
*noncev );
x17_4way_hash( hash, vdata );
x17_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;

View File

@@ -3,10 +3,12 @@
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_4WAY)
init_x17_4way_ctx();
printf("register x17 4way\n");
// init_x17_4way_ctx();
gate->scanhash = (void*)&scanhash_x17_4way;
gate->hash = (void*)&x17_4way_hash;
#else
printf("register x17 no 4way\n");
init_x17_ctx();
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17_hash;

View File

@@ -15,16 +15,16 @@ bool register_x17_algo( algo_gate_t* gate );
void x17_4way_hash( void *state, const void *input );
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x17_4way_ctx();
//void init_x17_4way_ctx();
#endif
void x17_hash( void *state, const void *input );
int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x17_ctx();

View File

@@ -5,7 +5,6 @@
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
@@ -13,14 +12,11 @@
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
@@ -30,18 +26,21 @@
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"
#include <openssl/sha.h>
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo;
#else
hashState_echo echo;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
hashState_luffa luffa;
cubehashParam cubehash;
@@ -51,11 +50,7 @@ typedef struct {
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
#ifndef USE_SPH_SHA
SHA512_CTX sha512;
#else
sph_sha512_context sha512;
#endif
sph_haval256_5_context haval;
} x17_ctx_holder;
@@ -63,12 +58,12 @@ x17_ctx_holder x17_ctx __attribute__ ((aligned (64)));
void init_x17_ctx()
{
#ifdef NO_AES_NI
#if defined(__AES__)
init_groestl( &x17_ctx.groestl, 64 );
init_echo( &x17_ctx.echo, 512 );
#else
sph_groestl512_init(&x17_ctx.groestl );
sph_echo512_init(&x17_ctx.echo);
#else
init_echo( &x17_ctx.echo, 512 );
init_groestl( &x17_ctx.groestl, 64 );
#endif
init_luffa( &x17_ctx.luffa, 512 );
cubehashInit( &x17_ctx.cubehash, 512, 16, 32 );
@@ -78,11 +73,7 @@ void init_x17_ctx()
sph_fugue512_init( &x17_ctx.fugue );
sph_shabal512_init( &x17_ctx.shabal );
sph_whirlpool_init( &x17_ctx.whirlpool );
#ifndef USE_SPH_SHA
SHA512_Init( &x17_ctx.sha512 );
#else
sph_sha512_init(&x17_ctx.sha512);
#endif
sph_haval256_5_init(&x17_ctx.haval);
};
@@ -123,12 +114,12 @@ void x17_hash(void *output, const void *input)
//---groestl----
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#else
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512( &ctx.groestl, hash, 64 );
sph_groestl512_close( &ctx.groestl, hash );
#endif
//---skein4---
@@ -151,134 +142,136 @@ void x17_hash(void *output, const void *input)
KEC_C;
//--- luffa7
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// 8 Cube
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hashB, 64 );
(const byte*)hash, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);
sph_shavite512_close( &ctx.shavite, hash);
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hashB, 512 );
update_final_sd( &ctx.simd, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
//11---echo---
#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hashB);
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
#else
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
(const BitSequence *)hash, 512 );
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
#endif
// X13 algos
// 12 Hamsi
sph_hamsi512(&ctx.hamsi, hashB, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_hamsi512( &ctx.hamsi, hash, 64 );
sph_hamsi512_close( &ctx.hamsi, hash );
// 13 Fugue
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hashB);
sph_fugue512(&ctx.fugue, hash, 64 );
sph_fugue512_close(&ctx.fugue, hash );
// X14 Shabal
sph_shabal512(&ctx.shabal, hashB, 64);
sph_shabal512_close(&ctx.shabal, hash);
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close( &ctx.shabal, hash );
// X15 Whirlpool
sph_whirlpool(&ctx.whirlpool, hash, 64);
sph_whirlpool_close(&ctx.whirlpool, hashB);
sph_whirlpool( &ctx.whirlpool, hash, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash );
#ifndef USE_SPH_SHA
SHA512_Update( &ctx.sha512, hashB, 64 );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
#else
sph_sha512(&ctx.sha512,(const void*) hashB, 64);
sph_sha512_close(&ctx.sha512,(void*) hash);
#endif
sph_haval256_5(&ctx.haval,(const void*) hash, 64);
sph_haval256_5_close(&ctx.haval,hashB);
SHA512_Update( &ctx.sha512, hash, 64 );
SHA512_Final( (unsigned char*)hash, &ctx.sha512 );
asm volatile ("emms");
memcpy(output, hashB, 32);
sph_haval256_5( &ctx.haval, (const void*)hash, 64 );
sph_haval256_5_close( &ctx.haval, output );
}
int scanhash_x17(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
uint64_t htmax[] =
{
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] =
{
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need bigendian data...
swab32_array( endiandata, pdata, 20 );
// we need bigendian data...
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
#ifdef DEBUG_ALGO
if (Htarg != 0)
printf("[%d] Htarg=%X\n", thr_id, Htarg);
if ( Htarg != 0 )
printf( "[%d] Htarg=%X\n", thr_id, Htarg );
#endif
for (int m=0; m < 6; m++) {
if (Htarg <= htmax[m]) {
uint32_t mask = masks[m];
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
x17_hash(hash64, endiandata);
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
pdata[19] = ++n;
be32enc( &endiandata[19], n );
x17_hash( hash64, endiandata );
#ifndef DEBUG_ALGO
if (!(hash64[7] & mask))
{
if ( fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
// else
// {
// applog(LOG_INFO, "Result does not validate on CPU!");
// }
}
if ( !( hash64[7] & mask ) )
{
if ( fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
return true;
}
// else
// applog(LOG_INFO, "Result does not validate on CPU!");
}
#else
if (!(n % 0x1000) && !thr_id) printf(".");
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
}
#endif
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c if else to understand the loop on htmax => mask
break;
if ( !( n % 0x1000 ) && !thr_id ) printf(".");
if ( !( hash64[7] & mask ) )
{
printf("[%d]",thr_id);
if ( fulltest( hash64, ptarget ) )
{
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
}
#endif
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c if else to understand the loop on htmax => mask
break;
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -16,17 +16,16 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include "algo/simd/nist.h"
#include "algo/cubehash/cubehash_sse2.h"
#include <openssl/sha.h>
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#else
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
typedef struct {
@@ -43,18 +42,14 @@ typedef struct {
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
#ifndef USE_SPH_SHA
SHA512_CTX sha512;
#else
sph_sha512_context sha512;
#endif
sph_haval256_5_context haval;
#ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo;
#else
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
} xevan_ctx_holder;
@@ -77,18 +72,14 @@ void init_xevan_ctx()
sph_fugue512_init( &xevan_ctx.fugue );
sph_shabal512_init( &xevan_ctx.shabal );
sph_whirlpool_init( &xevan_ctx.whirlpool );
#ifndef USE_SPH_SHA
SHA512_Init( &xevan_ctx.sha512 );
#else
sph_sha512_init(&xevan_ctx.sha512);
#endif
sph_haval256_5_init(&xevan_ctx.haval);
#ifdef NO_AES_NI
sph_groestl512_init( &xevan_ctx.groestl );
sph_echo512_init( &xevan_ctx.echo );
#else
#if defined(__AES__)
init_groestl( &xevan_ctx.groestl, 64 );
init_echo( &xevan_ctx.echo, 512 );
#else
sph_groestl512_init( &xevan_ctx.groestl );
sph_echo512_init( &xevan_ctx.echo );
#endif
};
@@ -117,12 +108,12 @@ void xevan_hash(void *output, const void *input)
sph_bmw512(&ctx.bmw, hash, dataLen);
sph_bmw512_close(&ctx.bmw, hash);
#ifdef NO_AES_NI
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, dataLen*8 );
#else
sph_groestl512(&ctx.groestl, hash, dataLen);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, dataLen*8 );
#endif
sph_skein512(&ctx.skein, hash, dataLen);
@@ -146,12 +137,12 @@ void xevan_hash(void *output, const void *input)
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#ifdef NO_AES_NI
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, dataLen*8 );
#else
sph_echo512(&ctx.echo, hash, dataLen);
sph_echo512_close(&ctx.echo, hash);
#else
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, dataLen*8 );
#endif
sph_hamsi512(&ctx.hamsi, hash, dataLen);
@@ -166,13 +157,9 @@ void xevan_hash(void *output, const void *input)
sph_whirlpool(&ctx.whirlpool, hash, dataLen);
sph_whirlpool_close(&ctx.whirlpool, hash);
#ifndef USE_SPH_SHA
SHA512_Update( &ctx.sha512, hash, dataLen );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
#else
sph_sha512(&ctx.sha512,(const void*) hash, dataLen);
sph_sha512_close(&ctx.sha512,(void*) hash);
#endif
sph_haval256_5(&ctx.haval,(const void*) hash, dataLen);
sph_haval256_5_close(&ctx.haval, hash);
@@ -186,12 +173,12 @@ void xevan_hash(void *output, const void *input)
sph_bmw512(&ctx.bmw, hash, dataLen);
sph_bmw512_close(&ctx.bmw, hash);
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, hash, dataLen);
sph_groestl512_close(&ctx.groestl, hash);
#else
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const BitSequence*)hash, dataLen*8 );
#else
sph_groestl512(&ctx.groestl, hash, dataLen);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512(&ctx.skein, hash, dataLen);
@@ -214,12 +201,12 @@ void xevan_hash(void *output, const void *input)
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hash, dataLen);
sph_echo512_close(&ctx.echo, hash);
#else
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, dataLen*8 );
#else
sph_echo512(&ctx.echo, hash, dataLen);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512(&ctx.hamsi, hash, dataLen);
@@ -234,13 +221,9 @@ void xevan_hash(void *output, const void *input)
sph_whirlpool(&ctx.whirlpool, hash, dataLen);
sph_whirlpool_close(&ctx.whirlpool, hash);
#ifndef USE_SPH_SHA
SHA512_Update( &ctx.sha512, hash, dataLen );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
#else
sph_sha512(&ctx.sha512,(const void*) hash, dataLen);
sph_sha512_close(&ctx.sha512,(void*) hash);
#endif
sph_haval256_5(&ctx.haval,(const void*) hash, dataLen);
sph_haval256_5_close(&ctx.haval, hash);

34
algo/x20/x20r-gate.c Normal file
View File

@@ -0,0 +1,34 @@
#include "x20r-gate.h"
void x20r_getAlgoString( const uint8_t* prevblock, char *output )
{
char *sptr = output;
for ( int j = 0; j < X20R_HASH_FUNC_COUNT; j++ )
{
char b = (19 - j) >> 1; // 20 ascii hex chars, reversed
uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
}
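
The loop above walks the previous block hash nibbles in reverse: for digit j it selects byte (19-j)>>1 and takes the high or low nibble depending on parity, printing each as one hex character. A quick standalone check of that mapping with a made-up prevblock (the byte values are arbitrary examples, not real block data):

#include <stdint.h>
#include <stdio.h>

#define X20R_HASH_FUNC_COUNT 20

int main(void)
{
    /* Hypothetical 10 prevhash bytes; the real code reads them from the header. */
    const uint8_t prevblock[10] =
        { 0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF, 0x01, 0x23 };
    char output[X20R_HASH_FUNC_COUNT + 1];
    char *sptr = output;

    for ( int j = 0; j < X20R_HASH_FUNC_COUNT; j++ )
    {
        char b = (19 - j) >> 1;   // 20 ascii hex chars, reversed
        uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF
                                    : prevblock[b] >> 4;
        if ( algoDigit >= 10 )
            sprintf( sptr, "%c", 'A' + (algoDigit - 10) );
        else
            sprintf( sptr, "%u", (uint32_t) algoDigit );
        sptr++;
    }
    *sptr = '\0';
    printf( "hash order: %s\n", output );   /* prints "2301EFCDAB8967452301" */
    return 0;
}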
bool register_x20r_algo( algo_gate_t* gate )
{
#if defined (X20R_4WAY)
gate->scanhash = (void*)&scanhash_x20r_4way;
gate->hash = (void*)&x20r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x20r;
gate->hash = (void*)&x20r_hash;
#endif
gate->set_target = (void*)&alt_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
x20_r_s_getAlgoString = (void*)&x20r_getAlgoString;
return true;
};
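
register_x20r_algo follows the repo's algo_gate pattern: the miner core dispatches through gate->scanhash / gate->hash function pointers, so each algorithm only fills in its slots at registration. A minimal model of that dispatch (toy types, not the real algo_gate_t from algo-gate-api.h):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the real algo-gate-api.h types. */
typedef struct
{
    void (*hash)( void *out, const void *in );
    int  (*scanhash)( int thr_id );
} toy_gate_t;

static void toy_hash( void *out, const void *in )
{ (void)out; (void)in; puts( "hash" ); }

static int toy_scanhash( int thr_id )
{ printf( "scan on thread %d\n", thr_id ); return 0; }

static bool register_toy_algo( toy_gate_t *gate )
{
    gate->hash     = toy_hash;
    gate->scanhash = toy_scanhash;
    return true;
}

int main(void)
{
    toy_gate_t gate;
    register_toy_algo( &gate );
    gate.hash( NULL, NULL );       /* miner core calls through the gate */
    return gate.scanhash( 0 );
}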

58
algo/x20/x20r-gate.h Normal file
View File

@@ -0,0 +1,58 @@
#ifndef X20R_GATE_H__
#define X20R_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
/*
#if defined(__AVX2__) && defined(__AES__)
#define X20R_4WAY
#endif
*/
enum x20r_Algo {
BLAKE = 0,
BMW,
GROESTL,
JH,
KECCAK,
SKEIN,
LUFFA,
CUBEHASH,
SHAVITE,
SIMD,
ECHO,
HAMSI,
FUGUE,
SHABAL,
WHIRLPOOL,
SHA_512,
HAVAL, // 256-bit output
GOST,
RADIOGATUN, // 256-bit output
PANAMA, // 256-bit output
X20R_HASH_FUNC_COUNT
};
void (*x20_r_s_getAlgoString) ( const uint8_t*, char* );
void x20r_getAlgoString( const uint8_t* prevblock, char *output );
bool register_x20r_algo( algo_gate_t* gate );
#if defined(X20R_4WAY)
void x20r_4way_hash( void *state, const void *input );
int scanhash_x20r_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void x20r_hash( void *state, const void *input );
int scanhash_x20r( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

275
algo/x20/x20r.c Normal file
View File

@@ -0,0 +1,275 @@
#include "x20r-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/radiogatun/sph_radiogatun.h"
#include "algo/panama/sph_panama.h"
#include "algo/gost/sph_gost.h"
#include <openssl/sha.h>
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X20R_HASH_FUNC_COUNT + 1] = { 0 };
union _x20r_context_overlay
{
sph_blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
hashState_sd simd;
sph_shavite512_context shavite;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
sph_gost512_context gost;
sph_radiogatun64_context radiogatun;
sph_panama_context panama;
};
typedef union _x20r_context_overlay x20r_context_overlay;
void x20r_hash(void* output, const void* input)
{
uint32_t _ALIGN(128) hash[64/4];
x20r_context_overlay ctx;
/*
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_skein512_context ctx_skein;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_luffa512_context ctx_luffa;
sph_cubehash512_context ctx_cubehash;
sph_shavite512_context ctx_shavite;
sph_simd512_context ctx_simd;
sph_echo512_context ctx_echo;
sph_hamsi512_context ctx_hamsi;
sph_fugue512_context ctx_fugue;
sph_shabal512_context ctx_shabal;
sph_whirlpool_context ctx_whirlpool;
sph_sha512_context ctx_sha512;
sph_haval256_5_context ctx_haval;
sph_gost512_context ctx_gost;
sph_radiogatun64_context ctx_radiogatun;
sph_panama_context ctx_panama;
*/
void *in = (void*) input;
int size = 80;
if ( s_ntime == UINT32_MAX )
{
const uint8_t* in8 = (uint8_t*) input;
x20_r_s_getAlgoString(&in8[4], hashOrder);
}
for (int i = 0; i < 20; i++)
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
sph_blake512_init(&ctx.blake);
sph_blake512(&ctx.blake, in, size);
sph_blake512_close(&ctx.blake, hash);
break;
case BMW:
sph_bmw512_init(&ctx.bmw);
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init(&ctx.groestl);
sph_groestl512(&ctx.groestl, in, size);
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init(&ctx.skein);
sph_skein512(&ctx.skein, in, size);
sph_skein512_close(&ctx.skein, hash);
break;
case JH:
sph_jh512_init(&ctx.jh);
sph_jh512(&ctx.jh, in, size);
sph_jh512_close(&ctx.jh, hash);
break;
case KECCAK:
sph_keccak512_init(&ctx.keccak);
sph_keccak512(&ctx.keccak, in, size);
sph_keccak512_close(&ctx.keccak, hash);
break;
case LUFFA:
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in, size );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)in, size );
break;
case SHAVITE:
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, in, size);
sph_shavite512_close(&ctx.shavite, hash);
break;
case SIMD:
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)in, size<<3 );
#else
sph_echo512_init(&ctx.echo);
sph_echo512(&ctx.echo, in, size);
sph_echo512_close(&ctx.echo, hash);
#endif
break;
case HAMSI:
sph_hamsi512_init(&ctx.hamsi);
sph_hamsi512(&ctx.hamsi, in, size);
sph_hamsi512_close(&ctx.hamsi, hash);
break;
case FUGUE:
sph_fugue512_init(&ctx.fugue);
sph_fugue512(&ctx.fugue, in, size);
sph_fugue512_close(&ctx.fugue, hash);
break;
case SHABAL:
sph_shabal512_init(&ctx.shabal);
sph_shabal512(&ctx.shabal, in, size);
sph_shabal512_close(&ctx.shabal, hash);
break;
case WHIRLPOOL:
sph_whirlpool_init(&ctx.whirlpool);
sph_whirlpool(&ctx.whirlpool, in, size);
sph_whirlpool_close(&ctx.whirlpool, hash);
break;
case SHA_512:
SHA512_Init( &ctx.sha512 );
SHA512_Update( &ctx.sha512, in, size );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
break;
case HAVAL:
sph_haval256_5_init(&ctx.haval);
sph_haval256_5(&ctx.haval, in, size);
sph_haval256_5_close(&ctx.haval, hash);
memset(&hash[8], 0, 32);
break;
case GOST:
sph_gost512_init(&ctx.gost);
sph_gost512(&ctx.gost, in, size);
sph_gost512_close(&ctx.gost, hash);
break;
case RADIOGATUN:
sph_radiogatun64_init(&ctx.radiogatun);
sph_radiogatun64(&ctx.radiogatun, in, size);
sph_radiogatun64_close(&ctx.radiogatun, hash);
memset(&hash[8], 0, 32);
break;
case PANAMA:
sph_panama_init(&ctx.panama);
sph_panama(&ctx.panama, in, size);
sph_panama_close(&ctx.panama, hash);
memset(&hash[8], 0, 32);   // 256-bit output: zero upper half, as for haval
break;
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
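/* The 20-character hashOrder string consumed above is produced by
 * x20_r_s_getAlgoString(), defined elsewhere in this file. A minimal sketch
 * of the idea, assuming the x16r convention extended to 20 algorithms: one
 * byte of the previous block hash per position, reduced mod 20 and encoded
 * as '0'-'9' / 'A'-'J' so the decoder in the loop above can invert it. The
 * helper name and byte selection here are illustrative, not the actual
 * implementation:
 */
static void x20r_getalgostring_sketch( const uint8_t *prevblock, char *output )
{
   for ( int i = 0; i < 20; i++ )
   {
      const uint8_t algo = prevblock[i] % 20;   // pick one of 20 algorithms
      output[i] = algo < 10 ? '0' + algo
                            : 'A' + algo - 10;  // matches the decoder above
   }
   output[20] = '\0';
}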
int scanhash_x20r( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for (int k=0; k < 19; k++)
be32enc( &endiandata[k], pdata[k] );
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x20_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
   applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
do {
be32enc( &endiandata[19], nonce );
x20r_hash( hash32, endiandata );
if ( hash32[7] <= Htarg && fulltest( hash32, ptarget ) )
{
work_set_target_ratio( work, hash32 );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !(*restart));
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
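/* The hash32[7] <= Htarg test above is only a fast rejection on the most
 * significant word; fulltest() then compares all 256 bits. A simplified
 * sketch of that comparison, assuming hash and target are eight 32-bit
 * words with the most significant word at index 7 (not the exact library
 * code):
 */
static int fulltest_sketch( const uint32_t *hash, const uint32_t *target )
{
   for ( int i = 7; i >= 0; i-- )
   {
      if ( hash[i] > target[i] ) return 0;   // hash above target: reject
      if ( hash[i] < target[i] ) return 1;   // hash below target: accept
   }
   return 1;                                 // exactly equal counts as a hit
}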

View File

@@ -299,48 +299,26 @@ HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
#ifndef USE_SPH_SHA
SHA256_Init(&ctx->ictx);
SHA256_Update(&ctx->ictx, K, Klen);
SHA256_Final(khash, &ctx->ictx);
#else
SHA256_Init_Y(&ctx->ictx);
SHA256_Update_Y(&ctx->ictx, K, Klen);
SHA256_Final_Y(khash, &ctx->ictx);
#endif
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
#ifndef USE_SPH_SHA
SHA256_Init(&ctx->ictx);
#else
SHA256_Init_Y(&ctx->ictx);
#endif
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
#ifndef USE_SPH_SHA
SHA256_Update(&ctx->ictx, pad, 64);
#else
SHA256_Update_Y(&ctx->ictx, pad, 64);
#endif
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
#ifndef USE_SPH_SHA
SHA256_Init(&ctx->octx);
#else
SHA256_Init_Y(&ctx->octx);
#endif
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
#ifndef USE_SPH_SHA
SHA256_Update(&ctx->octx, pad, 64);
#else
SHA256_Update_Y(&ctx->octx, pad, 64);
#endif
/* Clean the stack. */
//memset(khash, 0, 32);
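/* Taken together, the two pads above implement the standard construction
 * HMAC(K, m) = SHA256( (K ^ opad) || SHA256( (K ^ ipad) || m ) )
 * with ipad = 0x36 repeated and opad = 0x5c repeated. A self-contained
 * one-shot sketch of the same computation (assumes the OpenSSL-style SHA256
 * API used in this file; the real code keeps ictx/octx so the message can
 * be streamed in through HMAC_SHA256_Update):
 */
static void hmac_sha256_oneshot_sketch( const uint8_t *key, size_t keylen,
                                        const uint8_t *msg, size_t msglen,
                                        uint8_t digest[32] )
{
   uint8_t k[64] = {0}, pad[64], inner[32];
   SHA256_CTX c;
   if ( keylen > 64 )
      SHA256( key, keylen, k );              /* long keys are pre-hashed */
   else
      memcpy( k, key, keylen );              /* short keys are zero-padded */
   for ( int i = 0; i < 64; i++ ) pad[i] = k[i] ^ 0x36;
   SHA256_Init( &c );
   SHA256_Update( &c, pad, 64 );             /* inner: (K ^ ipad) || m */
   SHA256_Update( &c, msg, msglen );
   SHA256_Final( inner, &c );
   for ( int i = 0; i < 64; i++ ) pad[i] = k[i] ^ 0x5c;
   SHA256_Init( &c );
   SHA256_Update( &c, pad, 64 );             /* outer: (K ^ opad) || inner */
   SHA256_Update( &c, inner, 32 );
   SHA256_Final( digest, &c );
}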
@@ -352,11 +330,7 @@ HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
#ifndef USE_SPH_SHA
SHA256_Update(&ctx->ictx, in, len);
#else
SHA256_Update_Y(&ctx->ictx, in, len);
#endif
}
/* Finish an HMAC-SHA256 operation. */
@@ -365,7 +339,6 @@ HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx)
{
unsigned char ihash[32];
#ifndef USE_SPH_SHA
/* Finish the inner SHA256 operation. */
SHA256_Final(ihash, &ctx->ictx);
@@ -374,16 +347,6 @@ HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx)
/* Finish the outer SHA256 operation. */
SHA256_Final(digest, &ctx->octx);
#else
/* Finish the inner SHA256 operation. */
SHA256_Final_Y(ihash, &ctx->ictx);
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update_Y(&ctx->octx, ihash, 32);
/* Finish the outer SHA256 operation. */
SHA256_Final_Y(digest, &ctx->octx);
#endif
/* Clean the stack. */
//memset(ihash, 0, 32);
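/* Typical use of the streaming interface defined above (illustrative):
 *
 *    HMAC_SHA256_CTX ctx;
 *    unsigned char mac[32];
 *    HMAC_SHA256_Init( &ctx, key, keylen );
 *    HMAC_SHA256_Update( &ctx, data, datalen );   // may be called repeatedly
 *    HMAC_SHA256_Final( mac, &ctx );
 */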

View File

@@ -47,13 +47,8 @@ typedef struct HMAC_SHA256Context {
*/
typedef struct HMAC_SHA256Context {
#ifndef USE_SPH_SHA
SHA256_CTX ictx;
SHA256_CTX octx;
#else
SHA256_CTX_Y ictx;
SHA256_CTX_Y octx;
#endif
} HMAC_SHA256_CTX;
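/* ictx accumulates (K ^ ipad) || data as Update() is called, while octx is
 * pre-seeded with (K ^ opad) during Init() so that Final() only has to feed
 * it the inner digest; keeping both contexts is what makes the streaming
 * interface possible. */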
void SHA256_Init_Y(SHA256_CTX_Y *);

View File

@@ -1303,17 +1303,10 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
S = (uint8_t *)XY + XY_size;
if (t || flags) {
#ifndef USE_SPH_SHA
SHA256_CTX ctx;
SHA256_Init(&ctx);
SHA256_Update(&ctx, passwd, passwdlen);
SHA256_Final(sha256, &ctx);
#else
SHA256_CTX_Y ctx;
SHA256_Init_Y(&ctx);
SHA256_Update_Y(&ctx, passwd, passwdlen);
SHA256_Final_Y(sha256, &ctx);
#endif
passwd = sha256;
passwdlen = sizeof(sha256);
}
@@ -1372,17 +1365,10 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
}
/* Compute StoredKey */
{
#ifndef USE_SPH_SHA
SHA256_CTX ctx;
SHA256_Init(&ctx);
SHA256_Update(&ctx, sha256, sizeof(sha256));
SHA256_Final(buf, &ctx);
#else
SHA256_CTX_Y ctx;
SHA256_Init_Y(&ctx);
SHA256_Update_Y(&ctx, sha256, sizeof(sha256));
SHA256_Final_Y(buf, &ctx);
#endif
}
}
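/* Both blocks above are the same one-shot pattern: hash a small buffer with
 * SHA256 in place. A hypothetical helper (not part of this file) expressing
 * it directly:
 */
static void sha256_oneshot( const void *in, size_t inlen, unsigned char out[32] )
{
   SHA256_CTX ctx;
   SHA256_Init( &ctx );
   SHA256_Update( &ctx, in, inlen );
   SHA256_Final( out, &ctx );
}
/* e.g. the password pre-hash above reduces to:
 *    sha256_oneshot( passwd, passwdlen, sha256 );
 *    passwd = sha256;  passwdlen = sizeof(sha256);
 */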

View File

@@ -49,6 +49,7 @@
* no slowdown from the prefixes is generally observed on AMD CPUs supporting
* XOP, while some slowdown is sometimes observed on Intel CPUs with AVX.
*/
/*
#ifdef __XOP__
#warning "Note: XOP is enabled. That's great."
#elif defined(__AVX__)
@@ -60,7 +61,7 @@
#else
#warning "Note: building generic code for non-x86. That's OK."
#endif
*/
/*
* The SSE4 code version has fewer instructions than the generic SSE2 version,
* but all of the instructions are SIMD, thereby wasting the scalar execution