diff --git a/Makefile.am b/Makefile.am index d70be78..17737b7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -79,7 +79,6 @@ cpuminer_SOURCES = \ algo/heavy/sph_hefty1.c \ algo/heavy/heavy.c \ algo/heavy/bastion.c \ - algo/hmq1725.c \ algo/hodl/aes.c \ algo/hodl/hodl-gate.c \ algo/hodl/hodl-wolf.c \ @@ -110,7 +109,7 @@ cpuminer_SOURCES = \ algo/lyra2/lyra2z330.c \ algo/lyra2/lyra2h.c \ algo/m7m.c \ - algo/neoscrypt.c \ + algo/neoscrypt/neoscrypt.c \ algo/nist5/nist5-gate.c \ algo/nist5/nist5-4way.c \ algo/nist5/nist5.c \ @@ -159,16 +158,36 @@ cpuminer_SOURCES = \ algo/whirlpool/whirlpoolx.c \ algo/x11/x11-gate.c \ algo/x11/x11.c \ - algo/x11/x11evo.c \ + algo/x11/x11-4way.c \ + algo/x11/x11gost-gate.c \ algo/x11/x11gost.c \ + algo/x11/x11gost-4way.c \ + algo/x11/c11-gate.c \ algo/x11/c11.c \ - algo/x11/phi1612.c \ + algo/x11/c11-4way.c \ + algo/x11/x11evo.c \ + algo/x13/x13-gate.c \ algo/x13/x13.c \ + algo/x13/x13-4way.c \ + algo/x13/x13sm3-gate.c \ algo/x13/x13sm3.c \ + algo/x13/x13sm3-4way.c \ + algo/x13/phi1612-gate.c \ + algo/x13/phi1612.c \ + algo/x13/phi1612-4way.c \ + algo/x14/x14-gate.c \ algo/x14/x14.c \ + algo/x14/x14-4way.c \ + algo/x15/x15-gate.c \ algo/x15/x15.c \ + algo/x15/x15-4way.c \ + algo/x17/x17-gate.c \ algo/x17/x17.c \ - algo/xevan.c \ + algo/x17/x17-4way.c \ + algo/x17/xevan-gate.c \ + algo/x17/xevan.c \ + algo/x17/xevan-4way.c \ + algo/x17/hmq1725.c \ algo/yescrypt/yescrypt.c \ algo/yescrypt/sha256_Y.c\ algo/yescrypt/yescrypt-simd.c\ diff --git a/README.md b/README.md index 054e823..5f806e5 100644 --- a/README.md +++ b/README.md @@ -96,13 +96,16 @@ algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively. Older CPUs are supported by cpuminer-multi by TPruvot but at reduced performance. +ARM CPUs are not supported. + 2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and Centos are known to work and have all dependencies in their repositories. Others may work but may require more effort. 64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries. -3. Stratum pool, cpuminer-opt only supports stratum minning. Some algos -may work wallet mining but there are no guarantees. +MacOS, OSx is not supported. + +3. Stratum pool. Some algos may work wallet mining using getwork. Errata ------ diff --git a/README.txt b/README.txt index 1da75c6..dc6cf09 100644 --- a/README.txt +++ b/README.txt @@ -17,17 +17,21 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on these CPUs. Some algos may crash the miner with an invalid instruction. Users are recommended to use an unoptimized miner such as cpuminer-multi. -Exe name Compile opts Arch name +Exe name Compile flags Arch name -cpuminer-sse2.exe -march=core2 Core2 -cpuminer-sse42.exe -march=corei7 Nehalem -cpuminer-aes-sse42.exe -maes -msse4.2" Westmere -cpuminer-aes-avx.exe -march=corei7-avx" Sandybridge, Ivybridge -cpuminer-aes-avx2.exe "-march=core-avx2" Haswell, Broadwell, Skylake, Kabylake -cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY" +cpuminer-sse2.exe "-march=core2" Core2 +cpuminer-sse42.exe "-march=corei7" Nehalem +cpuminer-aes-sse42.exe "-maes -msse4.2" Westmere +cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge +cpuminer-avx2.exe "-march=core-avx2" Haswell... +cpuminer-avx-sha "-march=corei7-avx -msha" Ryzen... +cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY" same as avx2 +cpuminer-4way-sha.exe "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha 4way requires a CPU with AES and AVX2. 
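The build variants listed above map onto a single compile-time guard that the new gate headers in this patch use to select the 4way code path. Below is a minimal header-style sketch in C, assembled from the declarations that appear later in this diff; the `struct work` forward declaration is a stand-in for the real definition in miner.h.

```c
#include <stdint.h>

struct work;   /* defined in miner.h in the real source */

/* Guard used by the new gate headers added in this patch (x11-gate.h,
 * c11-gate.h, x11gost-gate.h, phi1612-gate.h): the 4way path is compiled
 * only when the build defines HASH_4WAY (the -DFOUR_WAY binaries) and the
 * compiler targets AES-NI hardware. */
#if defined(HASH_4WAY) && defined(__AES__)
  #define X11_4WAY
#endif

#if defined(X11_4WAY)
/* 4way path: four nonces hashed per call, needs AVX2 + AES-NI */
void x11_4way_hash( void *state, const void *input );
int  scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done );
#endif

/* scalar path, always compiled */
void x11_hash( void *state, const void *input );
int  scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done );
```

The scalar declarations stay unconditional, so the non-4way builds continue to compile unchanged.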
It is still under development and only a few algos are supported. See change log in RELEASE_NOTES in source package for supported algos. -There is no binary support available for SHA on AMD Ryzen CPUs. +Ryzen CPus perform better with AVX than AVX2 therefore an avx-sha build +is provided. Four way still uses AVX2. + diff --git a/RELEASE_NOTES b/RELEASE_NOTES index e0a463f..ef8c2fc 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -27,8 +27,9 @@ Compile Instructions Requirements: -Intel Core2 or newer, or AMD Steamroller or newer CPU. -64 bit Linux or Windows operating system. +Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not +supported. +64 bit Linux or Windows operating system. Apple is not supported. Building on linux prerequisites: @@ -164,6 +165,10 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble. Change Log ---------- +v3.7.8 + +Partial 4way optimization for most X algos including c11, xevan, phi, hsr + v3.7.7 Fixed regression caused by 64 CPU support. @@ -182,7 +187,7 @@ New algo keccakc for Creative coin with 4way optimizations Rewrote some AVX/AVX2 code for more consistent implementation and some optimizing. -Enhanced capabilities check to support 4way, mor eprecise reporting of +Enhanced capabilities check to support 4way, more precise reporting of features (not all algos use SSE2), and better error messages when using an incompatible pre-built version (Windows users). diff --git a/algo-gate-api.c b/algo-gate-api.c index d33ee08..71d68a9 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -211,7 +211,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break; case ALGO_X11: register_x11_algo ( gate ); break; case ALGO_X11EVO: register_x11evo_algo ( gate ); break; - case ALGO_X11GOST: register_sib_algo ( gate ); break; + case ALGO_X11GOST: register_x11gost_algo ( gate ); break; case ALGO_X13: register_x13_algo ( gate ); break; case ALGO_X13SM3: register_x13sm3_algo ( gate ); break; case ALGO_X14: register_x14_algo ( gate ); break; diff --git a/algo/blake/blake-hash-4way.c b/algo/blake/blake-hash-4way.c index b89952c..6573385 100644 --- a/algo/blake/blake-hash-4way.c +++ b/algo/blake/blake-hash-4way.c @@ -849,9 +849,9 @@ blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv, { int i; for ( i = 0; i < 8; i++ ) - sc->H[i] = _mm_set_epi32( iv[i], iv[i], iv[i], iv[i] ); + sc->H[i] = _mm_set1_epi32( iv[i] ); for ( i = 0; i < 4; i++ ) - sc->S[i] = _mm_set_epi32( salt[i], salt[i], salt[i], salt[i] ); + sc->S[i] = _mm_set1_epi32( salt[i] ); sc->T0 = sc->T1 = 0; sc->ptr = 0; } @@ -941,10 +941,9 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n, // memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 ); if (out_size_w32 == 8) u.buf[52>>2] = _mm_or_si128( u.buf[52>>2], - _mm_set_epi32( 0x010000000, 0x01000000, - 0x010000000, 0x01000000 ) ); - *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) ); - *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) ); + _mm_set1_epi32( 0x010000000 ) ); + *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) ); blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr ); } else @@ -955,10 +954,9 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n, sc->T1 = SPH_C32(0xFFFFFFFF); memset_zero_128( u.buf, 56>>2 ); if (out_size_w32 == 8) - u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000, - 0x010000000, 
0x01000000 ); - *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) ); - *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) ); + u.buf[52>>2] = _mm_set1_epi32( 0x010000000 ); + *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) ); blake32_4way( sc, u.buf, 64 ); } out = (__m128i*)dst; diff --git a/algo/neoscrypt.c b/algo/neoscrypt/neoscrypt.c similarity index 100% rename from algo/neoscrypt.c rename to algo/neoscrypt/neoscrypt.c diff --git a/algo/nist5/nist5-gate.h b/algo/nist5/nist5-gate.h index 3a477dd..4cf5741 100644 --- a/algo/nist5/nist5-gate.h +++ b/algo/nist5/nist5-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI) +#if defined(HASH_4WAY) && defined(__AES__) #define NIST5_4WAY #endif diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index d8884d9..a1134f7 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -36,15 +36,15 @@ void sha256t_hash(void* output, const void* input, uint32_t len) memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid ); SHA256_Update( &ctx_sha256, input + midlen, tail ); - SHA256_Final( hashA, &ctx_sha256 ); + SHA256_Final( (unsigned char*)hashA, &ctx_sha256 ); memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx ); SHA256_Update( &ctx_sha256, hashA, 32 ); - SHA256_Final( hashA, &ctx_sha256 ); + SHA256_Final( (unsigned char*)hashA, &ctx_sha256 ); memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx ); SHA256_Update( &ctx_sha256, hashA, 32 ); - SHA256_Final( hashA, &ctx_sha256 ); + SHA256_Final( (unsigned char*)hashA, &ctx_sha256 ); #else sph_sha256_context ctx_sha256 __attribute__ ((aligned (64))); memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid ); diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index 35380ca..326f469 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -267,9 +267,6 @@ c512(sph_shavite_big_context *sc, const void *msg) #else -/* - * This function assumes that "msg" is aligned for 32-bit access. 
- */ static void c512( sph_shavite_big_context *sc, const void *msg ) { @@ -379,36 +376,36 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 2, 6, 10 - k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) ); + k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) ); x = _mm_xor_si128( p3, k00 ); x = _mm_aesenc_si128( x, mm_zero ); - k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) ); + k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, mm_zero ); - k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) ); + k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, mm_zero ); - k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) ); + k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, mm_zero ); p2 = _mm_xor_si128( p2, x ); - k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) ); + k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) ); x = _mm_xor_si128( p1, k10 ); x = _mm_aesenc_si128( x, mm_zero ); - k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) ); + k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, mm_zero ); - k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) ); + k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, mm_zero ); - k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) ); + k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, mm_zero ); p0 = _mm_xor_si128( p0, x ); @@ -461,36 +458,36 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 4, 8, 12 - k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) ); + k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) ); x = _mm_xor_si128( p1, k00 ); x = _mm_aesenc_si128( x, mm_zero ); - k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) ); + k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, mm_zero ); - k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) ); + k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, mm_zero ); - k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) ); + k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, mm_zero ); p0 = _mm_xor_si128( p0, x ); - k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) ); + k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) ); x = _mm_xor_si128( p3, k10 ); x = _mm_aesenc_si128( x, mm_zero ); - k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) ); + k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, mm_zero ); - k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) ); + k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, mm_zero ); - k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) ); + k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, mm_zero ); diff --git a/algo/skein/skein-gate.c b/algo/skein/skein-gate.c index cac9df8..6d144c1 100644 --- a/algo/skein/skein-gate.c +++ b/algo/skein/skein-gate.c @@ -6,12 +6,11 @@ int64_t 
skein_get_max64() { return 0x7ffffLL; } bool register_skein_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX_OPT| AVX2_OPT | SHA_OPT; + gate->optimizations = FOUR_WAY_OPT | SHA_OPT; #if defined (SKEIN_4WAY) gate->scanhash = (void*)&scanhash_skein_4way; gate->hash = (void*)&skeinhash_4way; #else - gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_skein; gate->hash = (void*)&skeinhash; #endif diff --git a/algo/tribus/tribus-4way.c b/algo/tribus/tribus-4way.c index 166e4be..3baf27b 100644 --- a/algo/tribus/tribus-4way.c +++ b/algo/tribus/tribus-4way.c @@ -10,8 +10,14 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/echo/aes_ni/hash_api.h" +//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64))); static __thread jh512_4way_context ctx_mid; - +/* +void init_tribus_4way_ctx() +{ + init_echo( &tribus_4way_ctx, 512 ); +} +*/ void tribus_hash_4way(void *state, const void *input) { uint64_t hash0[8] __attribute__ ((aligned (64))); diff --git a/algo/tribus/tribus-gate.c b/algo/tribus/tribus-gate.c index 6545403..4804ff2 100644 --- a/algo/tribus/tribus-gate.c +++ b/algo/tribus/tribus-gate.c @@ -1,22 +1,11 @@ #include "tribus-gate.h" -/* -bool tribus_thread_init() -{ - sph_jh512_init( &tribus_ctx.jh ); - sph_keccak512_init( &tribus_ctx.keccak ); -#ifdef NO_AES_NI - sph_echo512_init( &tribus_ctx.echo ); -#else - init_echo( &tribus_ctx.echo, 512 ); -#endif - return true; -} -*/ + bool register_tribus_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; gate->get_max64 = (void*)&get_max64_0x1ffff; #if defined (TRIBUS_4WAY) +// init_tribus_4way_ctx(); gate->scanhash = (void*)&scanhash_tribus_4way; gate->hash = (void*)&tribus_hash_4way; #else diff --git a/algo/tribus/tribus-gate.h b/algo/tribus/tribus-gate.h index aa73f7c..51cec2f 100644 --- a/algo/tribus/tribus-gate.h +++ b/algo/tribus/tribus-gate.h @@ -4,12 +4,14 @@ #include "algo-gate-api.h" #include -#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI) +#if defined(HASH_4WAY) && defined(__AES__) #define TRIBUS_4WAY #endif #if defined(TRIBUS_4WAY) +//void init_tribus_4way_ctx(); + void tribus_hash_4way( void *state, const void *input ); int scanhash_tribus_4way( int thr_id, struct work *work, uint32_t max_nonce, diff --git a/algo/whirlpool/whirlpool-gate.c b/algo/whirlpool/whirlpool-gate.c index 847adbc..fa3e9d9 100644 --- a/algo/whirlpool/whirlpool-gate.c +++ b/algo/whirlpool/whirlpool-gate.c @@ -4,6 +4,7 @@ bool register_whirlpool_algo( algo_gate_t* gate ) { #if defined (WHIRLPOOL_4WAY) four_way_not_tested(); + gate->optimizations = FOUR_WAY_OPT; gate->scanhash = (void*)&scanhash_whirlpool_4way; gate->hash = (void*)&whirlpool_hash_4way; #else diff --git a/algo/whirlpool/whirlpool-gate.h b/algo/whirlpool/whirlpool-gate.h index 9fab221..3d187bb 100644 --- a/algo/whirlpool/whirlpool-gate.h +++ b/algo/whirlpool/whirlpool-gate.h @@ -4,9 +4,11 @@ #include "algo-gate-api.h" #include +/* #if defined(FOUR_WAY) && defined(__AVX2__) #define WHIRLPOOL_4WAY #endif +*/ #if defined (WHIRLPOOL_4WAY) diff --git a/algo/whirlpool/whirlpool-hash-4way.c b/algo/whirlpool/whirlpool-hash-4way.c index 9806894..81327fa 100644 --- a/algo/whirlpool/whirlpool-hash-4way.c +++ b/algo/whirlpool/whirlpool-hash-4way.c @@ -3345,8 +3345,10 @@ do { \ #define READ_STATE MUL8(READ_STATE_W) #define ROUND0 MUL8(ROUND0_W) #define UPDATE_STATE MUL8(UPDATE_STATE_W) -#define BYTE(x, n) \ - _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) ) +//#define 
BYTE(x, n) \ +// _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) ) +#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF) + // A very complex, but structured, expression with a mix of scalar // and vector operations to retrieve specific 64 bit constants from @@ -3357,23 +3359,51 @@ do { \ // Extract 64 bit vector elements from "in" representing offsets. Unmask the // low byte of each and scale for use as vector indexes. // Pack the data in a vector and return it. + +/* #define t_row( inv, row ) \ _mm256_and_si256( \ _mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) ) - -// Extract vector element from "lane" of vector "in[row]" and use it to index -// scalar array of constants "table" and return referenced 64 bit entry. -#define t_lane( table, inv, row, lane ) \ - table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ] -// table[ t_rwo( inv, row )[ lane ] ]; - +*/ // Build a vector from elements of non-contiguous 64 bit data extracted from // scalar "table". +// reference scalar version 1480 kH/s +/* +// version 1, extract with gather +// 955 kH/s +#define t_lane( inv, row, lane ) \ + BYTE( _mm256_extract_epi64( inv, lane ), row ) \ + + #define t_vec( table, inv, row ) \ - _mm256_set_epi64x( t_lane( table, inv, row, 3 ), \ - t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \ - t_lane( table, inv, row, 0 ) ) + _mm256_i32gather_epi64( table, _mm_set_epi32( t_lane( inv, row, 3 ), \ + t_lane( inv, row, 2 ), t_lane( inv, row, 1 ), \ + t_lane( inv, row, 0) ), 1 ) +*/ +/* +// version 2, extract with set +// 1100 kH/s +#define t_lane( table, inv, row, lane ) \ + table[ BYTE( _mm256_extract_epi64( inv, lane ), row ) ] \ + +#define t_vec( table, inv, row ) \ + _mm256_set_epi64x( t_lane( table, inv, row, 3 ), \ + t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \ + t_lane( table, inv, row, 0 ) ) +*/ + +// version 3, vector indexing with set +// 1105 kH/s +#define t_lane( table, inv, row, lane ) \ + table[ BYTE( inv[ lane ], row ) ] \ + +#define t_vec( table, inv, row ) \ + _mm256_set_epi64x( t_lane( table, inv, row, 3 ), \ + t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \ + t_lane( table, inv, row, 0 ) ) + + #if SPH_SMALL_FOOTPRINT_WHIRLPOOL diff --git a/algo/x11/c11-4way.c b/algo/x11/c11-4way.c new file mode 100644 index 0000000..76d973a --- /dev/null +++ b/algo/x11/c11-4way.c @@ -0,0 +1,261 @@ +#include "cpuminer-config.h" +#include "c11-gate.h" + +#if defined (__AVX2__) && defined (__AES__) + +#include +#include + +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/groestl/aes_ni/hash-groestl.h" +#include "algo/skein/skein-hash-4way.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/simd/sse2/nist.h" +#include "algo/echo/aes_ni/hash_api.h" + +typedef struct { + blake512_4way_context blake; + sph_bmw512_context bmw; + hashState_groestl groestl; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hashState_echo echo; +} c11_4way_ctx_holder; + +c11_4way_ctx_holder c11_4way_ctx; + +void init_c11_4way_ctx() +{ + blake512_4way_init( &c11_4way_ctx.blake ); + sph_bmw512_init( &c11_4way_ctx.bmw ); + init_groestl( &c11_4way_ctx.groestl, 64 ); + skein512_4way_init( 
&c11_4way_ctx.skein ); + jh512_4way_init( &c11_4way_ctx.jh ); + keccak512_4way_init( &c11_4way_ctx.keccak ); + init_luffa( &c11_4way_ctx.luffa, 512 ); + cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &c11_4way_ctx.shavite ); + init_sd( &c11_4way_ctx.simd, 512 ); + init_echo( &c11_4way_ctx.echo, 512 ); +} + +void c11_4way_hash( void *state, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*4] __attribute__ ((aligned (64))); + c11_4way_ctx_holder ctx; + memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) ); + + // 1 Blake 4way + blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_close( &ctx.blake, vhash ); + + // Serial + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 2 Bmw + sph_bmw512( &ctx.bmw, hash0, 64 ); + sph_bmw512_close( &ctx.bmw, hash0 ); + memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash1, 64 ); + sph_bmw512_close( &ctx.bmw, hash1 ); + memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash2, 64 ); + sph_bmw512_close( &ctx.bmw, hash2 ); + memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash3, 64 ); + sph_bmw512_close( &ctx.bmw, hash3 ); + + // 3 Groestl + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + + // 4way + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + // 4 JH + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + // 5 Keccak + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + + // 6 Skein + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + // Serial + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 7 Luffa + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)hash0, 64 ); + memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, 64 ); + memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, 64 ); + memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, 64 ); + + // 8 Cubehash + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); + memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, 
(const byte*) hash3, 64 ); + + // 9 Shavite + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &c11_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &c11_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &c11_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + + // 10 Simd + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + + // 11 Echo + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + for (int m=0; m < 6; m++) + if (Htarg <= htmax[m]) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + c11_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) ) + { + found[1] = 
true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + break; + } + + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x11/c11-gate.c b/algo/x11/c11-gate.c new file mode 100644 index 0000000..ae36c9d --- /dev/null +++ b/algo/x11/c11-gate.c @@ -0,0 +1,18 @@ +#include "c11-gate.h" + +bool register_c11_algo( algo_gate_t* gate ) +{ +#if defined (C11_4WAY) + init_c11_4way_ctx(); + gate->scanhash = (void*)&scanhash_c11_4way; + gate->hash = (void*)&c11_4way_hash; +#else + init_c11_ctx(); + gate->scanhash = (void*)&scanhash_c11; + gate->hash = (void*)&c11_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->get_max64 = (void*)&get_max64_0x3ffff; + return true; +}; + diff --git a/algo/x11/c11-gate.h b/algo/x11/c11-gate.h new file mode 100644 index 0000000..6a16123 --- /dev/null +++ b/algo/x11/c11-gate.h @@ -0,0 +1,32 @@ +#ifndef C11_GATE_H__ +#define C11_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(HASH_4WAY) && defined(__AES__) + #define C11_4WAY +#endif + +bool register_c11_algo( algo_gate_t* gate ); + +#if defined(C11_4WAY) + +void c11_4way_hash( void *state, const void *input ); + +int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_c11_4way_ctx(); + +#endif + +void c11_hash( void *state, const void *input ); + +int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_c11_ctx(); + +#endif + diff --git a/algo/x11/c11.c b/algo/x11/c11.c index 34e0d69..07dc774 100644 --- a/algo/x11/c11.c +++ b/algo/x11/c11.c @@ -1,4 +1,4 @@ -#include "algo-gate-api.h" +#include "c11-gate.h" #include #include @@ -64,7 +64,7 @@ void init_c11_ctx() #endif } -void c11hash( void *output, const void *input ) +void c11_hash( void *output, const void *input ) { unsigned char hash[128] _ALIGN(64); // uint32_t hashA[16], hashB[16]; // uint32_t _ALIGN(64) hash[16]; @@ -157,7 +157,7 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce, do { be32enc( &endiandata[19], nonce ); - c11hash( hash, endiandata ); + c11_hash( hash, endiandata ); if ( hash[7] <= Htarg && fulltest(hash, ptarget) ) { pdata[19] = nonce; @@ -171,13 +171,3 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce, return 0; } -bool register_c11_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_c11_ctx(); - gate->scanhash = (void*)&scanhash_c11; - gate->hash = (void*)&c11hash; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c new file mode 100644 index 0000000..904d3cd --- /dev/null +++ b/algo/x11/x11-4way.c @@ -0,0 +1,261 @@ +#include "cpuminer-config.h" +#include "x11-gate.h" + +#if defined (__AVX2__) && defined (__AES__) + +#include +#include + +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/groestl/aes_ni/hash-groestl.h" +#include "algo/skein/skein-hash-4way.h" +#include 
"algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/simd/sse2/nist.h" +#include "algo/echo/aes_ni/hash_api.h" + +typedef struct { + blake512_4way_context blake; + sph_bmw512_context bmw; + hashState_groestl groestl; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hashState_echo echo; +} x11_4way_ctx_holder; + +x11_4way_ctx_holder x11_4way_ctx; + +void init_x11_4way_ctx() +{ + blake512_4way_init( &x11_4way_ctx.blake ); + sph_bmw512_init( &x11_4way_ctx.bmw ); + init_groestl( &x11_4way_ctx.groestl, 64 ); + skein512_4way_init( &x11_4way_ctx.skein ); + jh512_4way_init( &x11_4way_ctx.jh ); + keccak512_4way_init( &x11_4way_ctx.keccak ); + init_luffa( &x11_4way_ctx.luffa, 512 ); + cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x11_4way_ctx.shavite ); + init_sd( &x11_4way_ctx.simd, 512 ); + init_echo( &x11_4way_ctx.echo, 512 ); +} + +void x11_4way_hash( void *state, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*4] __attribute__ ((aligned (64))); + x11_4way_ctx_holder ctx; + memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) ); + + // 1 Blake 4way + blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_close( &ctx.blake, vhash ); + + // Serial + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 2 Bmw + sph_bmw512( &ctx.bmw, hash0, 64 ); + sph_bmw512_close( &ctx.bmw, hash0 ); + memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash1, 64 ); + sph_bmw512_close( &ctx.bmw, hash1 ); + memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash2, 64 ); + sph_bmw512_close( &ctx.bmw, hash2 ); + memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash3, 64 ); + sph_bmw512_close( &ctx.bmw, hash3 ); + + // 3 Groestl + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + + // 4way + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + // 4 Skein + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + // 5 JH + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + // 6 Keccak + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + + // Serial + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 7 Luffa + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)hash0, 64 ); + memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const 
BitSequence*)hash1, 64 ); + memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, 64 ); + memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, 64 ); + + // 8 Cubehash + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); + memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); + + // 9 Shavite + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x11_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x11_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x11_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + + // 10 Simd + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + + // 11 Echo + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] 
= { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + for (int m=0; m < 6; m++) + if (Htarg <= htmax[m]) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + x11_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + break; + } + + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x11/x11-gate.c b/algo/x11/x11-gate.c index 97a7527..3b3eb88 100644 --- a/algo/x11/x11-gate.c +++ b/algo/x11/x11-gate.c @@ -5,13 +5,13 @@ bool register_x11_algo( algo_gate_t* gate ) #if defined (X11_4WAY) init_x11_4way_ctx(); gate->scanhash = (void*)&scanhash_x11_4way; - gate->hash = (void*)&x11_hash_4way; + gate->hash = (void*)&x11_4way_hash; #else init_x11_ctx(); gate->scanhash = (void*)&scanhash_x11; gate->hash = (void*)&x11_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; gate->get_max64 = (void*)&get_max64_0x3ffff; return true; }; diff --git a/algo/x11/x11-gate.h b/algo/x11/x11-gate.h index 494ef2c..a07d816 100644 --- a/algo/x11/x11-gate.h +++ b/algo/x11/x11-gate.h @@ -4,19 +4,21 @@ #include "algo-gate-api.h" #include -//#if defined(HASH_4WAY) && !defined(NO_AES_NI) -// #define X11_4WAY -//#endif +#if defined(HASH_4WAY) && defined(__AES__) + #define X11_4WAY +#endif bool register_x11_algo( algo_gate_t* gate ); #if defined(X11_4WAY) -void x11_hash_4way( void *state, const void *input ); +void x11_4way_hash( void *state, const void *input ); int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done ); +void init_x11_4way_ctx(); + #endif void x11_hash( void *state, const void *input ); diff --git a/algo/x11/x11gost-4way.c b/algo/x11/x11gost-4way.c new file mode 100644 index 0000000..96a86f1 --- /dev/null +++ b/algo/x11/x11gost-4way.c @@ -0,0 +1,268 @@ +#include "cpuminer-config.h" +#include "x11gost-gate.h" + +#if defined (__AVX2__) && defined (__AES__) + +#include +#include + +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/groestl/aes_ni/hash-groestl.h" +#include "algo/skein/skein-hash-4way.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/gost/sph_gost.h" +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/shavite/sph_shavite.h" +#include 
"algo/simd/sse2/nist.h" +#include "algo/echo/aes_ni/hash_api.h" + +typedef struct { + blake512_4way_context blake; + sph_bmw512_context bmw; + hashState_groestl groestl; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + sph_gost512_context gost; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hashState_echo echo; +} x11gost_4way_ctx_holder; + +x11gost_4way_ctx_holder x11gost_4way_ctx; + +void init_x11gost_4way_ctx() +{ + blake512_4way_init( &x11gost_4way_ctx.blake ); + sph_bmw512_init( &x11gost_4way_ctx.bmw ); + init_groestl( &x11gost_4way_ctx.groestl, 64 ); + skein512_4way_init( &x11gost_4way_ctx.skein ); + jh512_4way_init( &x11gost_4way_ctx.jh ); + keccak512_4way_init( &x11gost_4way_ctx.keccak ); + sph_gost512_init( &x11gost_4way_ctx.gost ); + init_luffa( &x11gost_4way_ctx.luffa, 512 ); + cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x11gost_4way_ctx.shavite ); + init_sd( &x11gost_4way_ctx.simd, 512 ); + init_echo( &x11gost_4way_ctx.echo, 512 ); +} + +void x11gost_4way_hash( void *state, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*4] __attribute__ ((aligned (64))); + x11gost_4way_ctx_holder ctx; + memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) ); + + blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_close( &ctx.blake, vhash ); + + // Serial + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + sph_bmw512( &ctx.bmw, hash0, 64 ); + sph_bmw512_close( &ctx.bmw, hash0 ); + memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash1, 64 ); + sph_bmw512_close( &ctx.bmw, hash1 ); + memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash2, 64 ); + sph_bmw512_close( &ctx.bmw, hash2 ); + memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash3, 64 ); + sph_bmw512_close( &ctx.bmw, hash3 ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + + // 4way + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + + // Serial + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + sph_gost512( &ctx.gost, hash0, 64 ); + sph_gost512_close( &ctx.gost, hash0 ); + memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) ); + sph_gost512( &ctx.gost, hash1, 64 ); + memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) ); + sph_gost512_close( &ctx.gost, hash1 ); + sph_gost512( &ctx.gost, hash2, 64 ); + 
sph_gost512_close( &ctx.gost, hash2 ); + memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) ); + sph_gost512( &ctx.gost, hash3, 64 ); + sph_gost512_close( &ctx.gost, hash3 ); + + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)hash0, 64 ); + memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, 64 ); + memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, 64 ); + memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, 64 ); + + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); + memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = 
work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + for (int m=0; m < 6; m++) + if (Htarg <= htmax[m]) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + x11gost_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + break; + } + + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x11/x11gost-gate.c b/algo/x11/x11gost-gate.c new file mode 100644 index 0000000..acd9826 --- /dev/null +++ b/algo/x11/x11gost-gate.c @@ -0,0 +1,18 @@ +#include "x11gost-gate.h" + +bool register_x11gost_algo( algo_gate_t* gate ) +{ +#if defined (X11GOST_4WAY) + init_x11gost_4way_ctx(); + gate->scanhash = (void*)&scanhash_x11gost_4way; + gate->hash = (void*)&x11gost_4way_hash; +#else + init_x11gost_ctx(); + gate->scanhash = (void*)&scanhash_x11gost; + gate->hash = (void*)&x11gost_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->get_max64 = (void*)&get_max64_0x3ffff; + return true; +}; + diff --git a/algo/x11/x11gost-gate.h b/algo/x11/x11gost-gate.h new file mode 100644 index 0000000..868d051 --- /dev/null +++ b/algo/x11/x11gost-gate.h @@ -0,0 +1,32 @@ +#ifndef X11GOST_GATE_H__ +#define X11GOST_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(HASH_4WAY) && defined(__AES__) + #define X11GOST_4WAY +#endif + +bool register_x11gost_algo( algo_gate_t* gate ); + +#if defined(X11GOST_4WAY) + +void x11gost_4way_hash( void *state, const void *input ); + +int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x11gost_4way_ctx(); + +#endif + +void x11gost_hash( void *state, const void *input ); + +int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x11gost_ctx(); + +#endif + diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c index 258aa80..a77424e 100644 --- a/algo/x11/x11gost.c +++ b/algo/x11/x11gost.c @@ -1,4 +1,4 @@ -#include 
"algo-gate-api.h" +#include "x11gost-gate.h" #include #include @@ -37,28 +37,28 @@ typedef struct { hashState_echo echo; hashState_groestl groestl; #endif -} sib_ctx_holder; +} x11gost_ctx_holder; -sib_ctx_holder sib_ctx; +x11gost_ctx_holder x11gost_ctx; -void init_sib_ctx() +void init_x11gost_ctx() { - sph_gost512_init(&sib_ctx.gost); - sph_shavite512_init(&sib_ctx.shavite); - init_luffa( &sib_ctx.luffa, 512 ); - cubehashInit( &sib_ctx.cube, 512, 16, 32 ); - init_sd( &sib_ctx.simd, 512 ); + sph_gost512_init( &x11gost_ctx.gost ); + sph_shavite512_init( &x11gost_ctx.shavite ); + init_luffa( &x11gost_ctx.luffa, 512 ); + cubehashInit( &x11gost_ctx.cube, 512, 16, 32 ); + init_sd( &x11gost_ctx.simd, 512 ); #ifdef NO_AES_NI - sph_groestl512_init( &sib_ctx.groestl ); - sph_echo512_init( &sib_ctx.echo ); + sph_groestl512_init( &x11gost_ctx.groestl ); + sph_echo512_init( &x11gost_ctx.echo ); #else - init_echo( &sib_ctx.echo, 512 ); - init_groestl( &sib_ctx.groestl, 64 ); + init_echo( &x11gost_ctx.echo, 512 ); + init_groestl( &x11gost_ctx.groestl, 64 ); #endif } -void sibhash(void *output, const void *input) +void x11gost_hash(void *output, const void *input) { unsigned char hash[128] __attribute__ ((aligned (64))); #define hashA hash @@ -69,8 +69,8 @@ void sibhash(void *output, const void *input) sph_u64 hashctA; sph_u64 hashctB; - sib_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &sib_ctx, sizeof(sib_ctx) ); + x11gost_ctx_holder ctx __attribute__ ((aligned (64))); + memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) ); DECL_BLK; BLK_I; @@ -135,8 +135,8 @@ void sibhash(void *output, const void *input) memcpy(output, hashA, 32); } -int scanhash_sib(int thr_id, struct work *work, - uint32_t max_nonce, uint64_t *hashes_done) +int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -156,7 +156,7 @@ int scanhash_sib(int thr_id, struct work *work, do { uint32_t hash[8]; be32enc(&endiandata[19], nonce); - sibhash(hash, endiandata); + x11gost_hash(hash, endiandata); if (hash[7] <= Htarg && fulltest(hash, ptarget)) { pdata[19] = nonce; @@ -172,12 +172,3 @@ int scanhash_sib(int thr_id, struct work *work, return 0; } -bool register_sib_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_sib_ctx(); - gate->scanhash = (void*)&scanhash_sib; - gate->hash = (void*)&sibhash; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -} diff --git a/algo/x13/phi1612-4way.c b/algo/x13/phi1612-4way.c new file mode 100644 index 0000000..e7493e6 --- /dev/null +++ b/algo/x13/phi1612-4way.c @@ -0,0 +1,186 @@ +#include "x13-gate.h" + +#if defined(__AVX2__) && defined(__AES__) + +#include +#include +#include +#include +#include "algo/skein/skein-hash-4way.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/fugue/sph_fugue.h" +#include "algo/gost/sph_gost.h" +#include "algo/echo/aes_ni/hash_api.h" + +typedef struct { + skein512_4way_context skein; + jh512_4way_context jh; + cubehashParam cube; + sph_fugue512_context fugue; + sph_gost512_context gost; + hashState_echo echo; +} phi1612_4way_ctx_holder; + +phi1612_4way_ctx_holder phi1612_4way_ctx __attribute__ ((aligned (64))); + +void init_phi1612_4way_ctx() +{ + skein512_4way_init( &phi1612_4way_ctx.skein ); + jh512_4way_init( &phi1612_4way_ctx.jh ); + cubehashInit( &phi1612_4way_ctx.cube, 512, 16, 32 ); + sph_fugue512_init( &phi1612_4way_ctx.fugue ); + 
sph_gost512_init( &phi1612_4way_ctx.gost ); + init_echo( &phi1612_4way_ctx.echo, 512 ); +}; + +void phi1612_4way_hash( void *state, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*4] __attribute__ ((aligned (64))); + phi1612_4way_ctx_holder ctx; + memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) ); + + // Skein parallel 4way + skein512_4way( &ctx.skein, input, 80 ); + skein512_4way_close( &ctx.skein, vhash ); + + // JH + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + // Serial to the end + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // Cubehash + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); + memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); + + // Fugue + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + + // Gost + sph_gost512( &ctx.gost, hash0, 64 ); + sph_gost512_close( &ctx.gost, hash0 ); + sph_gost512_init( &ctx.gost ); + sph_gost512( &ctx.gost, hash1, 64 ); + sph_gost512_close( &ctx.gost, hash1 ); + sph_gost512_init( &ctx.gost ); + sph_gost512( &ctx.gost, hash2, 64 ); + sph_gost512_close( &ctx.gost, hash2 ); + sph_gost512_init( &ctx.gost ); + sph_gost512( &ctx.gost, hash3, 64 ); + sph_gost512_close( &ctx.gost, hash3 ); + + // Echo + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t _ALIGN(64) endiandata[20]; + uint32_t n = first_nonce; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + + if ( opt_benchmark ) + ( (uint32_t*)ptarget )[7] = 0x0cff; + + 
for ( int k = 0; k < 19; k++ ) + be32enc( &endiandata[k], pdata[k] ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + do { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + phi1612_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( hash[7] <= Htarg && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x13/phi1612-gate.c b/algo/x13/phi1612-gate.c new file mode 100644 index 0000000..77eae6e --- /dev/null +++ b/algo/x13/phi1612-gate.c @@ -0,0 +1,18 @@ +#include "phi1612-gate.h" + +bool register_phi1612_algo( algo_gate_t* gate ) +{ +#if defined(PHI1612_4WAY) + init_phi1612_4way_ctx(); + gate->scanhash = (void*)&scanhash_phi1612_4way; + gate->hash = (void*)&phi1612_4way_hash; +#else + init_phi1612_ctx(); + gate->scanhash = (void*)&scanhash_phi1612; + gate->hash = (void*)&phi1612_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->get_max64 = (void*)&get_max64_0x3ffff; + return true; +}; + diff --git a/algo/x13/phi1612-gate.h b/algo/x13/phi1612-gate.h new file mode 100644 index 0000000..12d2df7 --- /dev/null +++ b/algo/x13/phi1612-gate.h @@ -0,0 +1,32 @@ +#ifndef PHI1612_GATE_H__ +#define PHI1612_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(HASH_4WAY) && defined(__AES__) + #define PHI1612_4WAY +#endif + +bool register_phi1612_algo( algo_gate_t* gate ); + +#if defined(PHI1612_4WAY) + +void phi1612_4way_hash( void *state, const void *input ); + +int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_phi1612_4way_ctx(); + +#endif + +void phi1612_hash( void *state, const void *input ); + +int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_phi1612_ctx(); + +#endif + diff --git a/algo/x11/phi1612.c b/algo/x13/phi1612.c similarity index 88% rename from algo/x11/phi1612.c rename to algo/x13/phi1612.c index 6913391..151fec5 100644 --- a/algo/x11/phi1612.c +++ b/algo/x13/phi1612.c @@ -1,4 +1,4 @@ -#include "algo-gate-api.h" +#include "phi1612-gate.h" #include #include @@ -33,7 +33,7 @@ phi_ctx_holder phi_ctx; static __thread sph_skein512_context phi_skein_mid __attribute__ ((aligned (64))); -void init_phi_ctx() +void init_phi1612_ctx() { sph_skein512_init( &phi_ctx.skein ); sph_jh512_init( &phi_ctx.jh ); @@ -53,7 +53,7 @@ void phi_skein_midstate( const void* input ) sph_skein512( &phi_skein_mid, input, 64 ); } -void phi1612hash(void *output, const void *input) +void phi1612_hash(void *output, const void *input) { unsigned char hash[128] __attribute__ ((aligned (64))); phi_ctx_holder ctx 
__attribute__ ((aligned (64))); @@ -112,7 +112,7 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce, do { uint32_t hash[8]; be32enc(&endiandata[19], nonce); - phi1612hash(hash, endiandata); + phi1612_hash(hash, endiandata); if (hash[7] <= Htarg && fulltest(hash, ptarget)) { pdata[19] = nonce; @@ -128,12 +128,3 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce, return 0; } -bool register_phi1612_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_phi_ctx(); - gate->scanhash = (void*)&scanhash_phi1612; - gate->hash = (void*)&phi1612hash; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -} diff --git a/algo/x13/x13-4way.c b/algo/x13/x13-4way.c new file mode 100644 index 0000000..4268084 --- /dev/null +++ b/algo/x13/x13-4way.c @@ -0,0 +1,293 @@ +#include "x13-gate.h" + +#if defined(__AVX2__) && defined(__AES__) + +#include +#include +#include +#include +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/groestl/aes_ni/hash-groestl.h" +#include "algo/skein/skein-hash-4way.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/simd/sse2/nist.h" +#include "algo/echo/aes_ni/hash_api.h" +#include "algo/hamsi/sph_hamsi.h" +#include "algo/fugue/sph_fugue.h" + +typedef struct { + blake512_4way_context blake; + sph_bmw512_context bmw; + hashState_groestl groestl; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hashState_echo echo; + sph_hamsi512_context hamsi; + sph_fugue512_context fugue; +} x13_4way_ctx_holder; + +x13_4way_ctx_holder x13_4way_ctx __attribute__ ((aligned (64))); + +void init_x13_4way_ctx() +{ + blake512_4way_init( &x13_4way_ctx.blake ); + sph_bmw512_init( &x13_4way_ctx.bmw ); + init_groestl( &x13_4way_ctx.groestl, 64 ); + skein512_4way_init( &x13_4way_ctx.skein ); + jh512_4way_init( &x13_4way_ctx.jh ); + keccak512_4way_init( &x13_4way_ctx.keccak ); + init_luffa( &x13_4way_ctx.luffa, 512 ); + cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x13_4way_ctx.shavite ); + init_sd( &x13_4way_ctx.simd, 512 ); + init_echo( &x13_4way_ctx.echo, 512 ); + sph_hamsi512_init( &x13_4way_ctx.hamsi ); + sph_fugue512_init( &x13_4way_ctx.fugue ); +}; + +void x13_4way_hash( void *state, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*4] __attribute__ ((aligned (64))); + x13_4way_ctx_holder ctx; + memcpy( &ctx, &x13_4way_ctx, sizeof(x13_4way_ctx) ); + + // 1 Blake + blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_close( &ctx.blake, vhash ); + + // Serial + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 2 Bmw + sph_bmw512( &ctx.bmw, hash0, 64 ); + sph_bmw512_close( &ctx.bmw, hash0 ); + memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash1, 64 ); + sph_bmw512_close( &ctx.bmw, hash1 ); + memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash2, 64 ); + sph_bmw512_close( &ctx.bmw, hash2 ); + memcpy( &ctx.bmw, 
&x13_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash3, 64 ); + sph_bmw512_close( &ctx.bmw, hash3 ); + + // 3 Groestl + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + + // Parallel 4way + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + // 4 Skein + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + // 5 JH + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + // 6 Keccak + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + + // Serial to the end + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 7 Luffa + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)hash0, 64 ); + memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, 64 ); + memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, 64 ); + memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, 64 ); + + // 8 Cubehash + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); + memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); + + // 9 Shavite + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x13_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x13_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x13_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + + // 10 Simd + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + + // 11 Echo + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const 
BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + + // 12 Hamsi + sph_hamsi512( &ctx.hamsi, hash0, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash0 ); + memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash1, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash1 ); + memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash2, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash2 ); + memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash3, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash3 ); + + // 13 Fugue + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + for ( int m=0; m < 6; m++ ) + if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + x13_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, 
ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + break; + } + + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x13/x13-gate.c b/algo/x13/x13-gate.c new file mode 100644 index 0000000..6bd4373 --- /dev/null +++ b/algo/x13/x13-gate.c @@ -0,0 +1,18 @@ +#include "x13-gate.h" + +bool register_x13_algo( algo_gate_t* gate ) +{ +#if defined (X13_4WAY) + init_x13_4way_ctx(); + gate->scanhash = (void*)&scanhash_x13_4way; + gate->hash = (void*)&x13_4way_hash; +#else + init_x13_ctx(); + gate->scanhash = (void*)&scanhash_x13; + gate->hash = (void*)&x13hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->get_max64 = (void*)&get_max64_0x3ffff; + return true; +}; + diff --git a/algo/x13/x13-gate.h b/algo/x13/x13-gate.h new file mode 100644 index 0000000..6b71276 --- /dev/null +++ b/algo/x13/x13-gate.h @@ -0,0 +1,32 @@ +#ifndef X13_GATE_H__ +#define X13_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(HASH_4WAY) && defined(__AES__) + #define X13_4WAY +#endif + +bool register_x13_algo( algo_gate_t* gate ); + +#if defined(X13_4WAY) + +void x13_4way_hash( void *state, const void *input ); + +int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x13_4way_ctx(); + +#endif + +void x13hash( void *state, const void *input ); + +int scanhash_x13( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x13_ctx(); + +#endif + diff --git a/algo/x13/x13.c b/algo/x13/x13.c index 237b8f7..e9acc77 100644 --- a/algo/x13/x13.c +++ b/algo/x13/x13.c @@ -1,4 +1,4 @@ -#include "algo-gate-api.h" +#include "x13-gate.h" #include #include @@ -68,7 +68,7 @@ void init_x13_ctx() sph_fugue512_init( &x13_ctx.fugue ); }; -static void x13hash(void *output, const void *input) +void x13hash(void *output, const void *input) { unsigned char hash[128] __attribute__ ((aligned (32))); #define hashB hash+64 @@ -249,15 +249,3 @@ int scanhash_x13(int thr_id, struct work *work, uint32_t max_nonce, pdata[19] = n; return 0; } - - -bool register_x13_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_x13_ctx(); - gate->scanhash = (void*)&scanhash_x13; - gate->hash = (void*)&x13hash; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x13/x13sm3-4way.c b/algo/x13/x13sm3-4way.c new file mode 100644 index 0000000..a3a3990 --- /dev/null +++ b/algo/x13/x13sm3-4way.c @@ -0,0 +1,328 @@ +#include "x13sm3-gate.h" + +#if defined(__AVX2__) && defined(__AES__) + +#include +#include +#include +#include +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/groestl/aes_ni/hash-groestl.h" +#include "algo/skein/skein-hash-4way.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/simd/sse2/nist.h" +#include "algo/echo/aes_ni/hash_api.h" +#include "algo/sm3/sph_sm3.h" +#include "algo/hamsi/sph_hamsi.h" +#include "algo/fugue/sph_fugue.h" + +typedef struct { + 
blake512_4way_context blake; + sph_bmw512_context bmw; + hashState_groestl groestl; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hashState_echo echo; + sm3_ctx_t sm3; + sph_hamsi512_context hamsi; + sph_fugue512_context fugue; +} x13sm3_4way_ctx_holder; + +x13sm3_4way_ctx_holder x13sm3_4way_ctx __attribute__ ((aligned (64))); +static __thread blake512_4way_context x13sm3_ctx_mid; + +void init_x13sm3_4way_ctx() +{ + blake512_4way_init( &x13sm3_4way_ctx.blake ); + sph_bmw512_init( &x13sm3_4way_ctx.bmw ); + init_groestl( &x13sm3_4way_ctx.groestl, 64 ); + skein512_4way_init( &x13sm3_4way_ctx.skein ); + jh512_4way_init( &x13sm3_4way_ctx.jh ); + keccak512_4way_init( &x13sm3_4way_ctx.keccak ); + init_luffa( &x13sm3_4way_ctx.luffa, 512 ); + cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x13sm3_4way_ctx.shavite ); + init_sd( &x13sm3_4way_ctx.simd, 512 ); + init_echo( &x13sm3_4way_ctx.echo, 512 ); + sm3_init( &x13sm3_4way_ctx.sm3 ); + sph_hamsi512_init( &x13sm3_4way_ctx.hamsi ); + sph_fugue512_init( &x13sm3_4way_ctx.fugue ); +}; + +void x13sm3_4way_hash( void *state, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*4] __attribute__ ((aligned (64))); + x13sm3_4way_ctx_holder ctx; + memcpy( &ctx, &x13sm3_4way_ctx, sizeof(x13sm3_4way_ctx) ); + + // Blake + memcpy( &ctx.blake, &x13sm3_ctx_mid, sizeof(x13sm3_ctx_mid) ); + blake512_4way( &ctx.blake, input + (64<<2), 16 ); + +// blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_close( &ctx.blake, vhash ); + + // Serial + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // Bmw + sph_bmw512( &ctx.bmw, hash0, 64 ); + sph_bmw512_close( &ctx.bmw, hash0 ); + memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash1, 64 ); + sph_bmw512_close( &ctx.bmw, hash1 ); + memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash2, 64 ); + sph_bmw512_close( &ctx.bmw, hash2 ); + memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash3, 64 ); + sph_bmw512_close( &ctx.bmw, hash3 ); + + // Groestl + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + + // Parallel 4way + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + // Skein + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + // JH + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + // Keccak + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + + // Serial to the end + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // Luffa + update_and_final_luffa( &ctx.luffa, 
(BitSequence*)hash0, + (const BitSequence*)hash0, 64 ); + memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, 64 ); + memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, 64 ); + memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, 64 ); + + // Cubehash + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); + memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); + + // Shavite + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + + // Simd + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + + // Echo + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + + // SM3 + uint32_t sm3_hash0[32] __attribute__ ((aligned (32))); + memset( sm3_hash0, 0, sizeof sm3_hash0 ); + uint32_t sm3_hash1[32] __attribute__ ((aligned (32))); + memset( sm3_hash1, 0, sizeof sm3_hash1 ); + uint32_t sm3_hash2[32] __attribute__ ((aligned (32))); + memset( sm3_hash2, 0, sizeof sm3_hash2 ); + uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); + memset( sm3_hash3, 0, sizeof sm3_hash3 ); + + sph_sm3( &ctx.sm3, hash0, 64 ); + sph_sm3_close( &ctx.sm3, sm3_hash0 ); + memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) ); + sph_sm3( &ctx.sm3, hash1, 64 ); + sph_sm3_close( &ctx.sm3, sm3_hash1 ); + memcpy( &ctx.sm3, 
&x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) ); + sph_sm3( &ctx.sm3, hash2, 64 ); + sph_sm3_close( &ctx.sm3, sm3_hash2 ); + memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) ); + sph_sm3( &ctx.sm3, hash3, 64 ); + sph_sm3_close( &ctx.sm3, sm3_hash3 ); + + // Hamsi + sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash0 ); + memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, sm3_hash1, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash1 ); + memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, sm3_hash2, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash2 ); + memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, sm3_hash3, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash3 ); + + // Fugue + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + blake512_4way_init( &x13sm3_ctx_mid ); + blake512_4way( &x13sm3_ctx_mid, vdata, 64 ); + + for ( int m=0; m < 6; m++ ) + if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + x13sm3_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( ( 
(hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + break; + } + + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x13/x13sm3-gate.c b/algo/x13/x13sm3-gate.c new file mode 100644 index 0000000..a6280ce --- /dev/null +++ b/algo/x13/x13sm3-gate.c @@ -0,0 +1,18 @@ +#include "x13sm3-gate.h" + +bool register_x13sm3_algo( algo_gate_t* gate ) +{ +#if defined (X13SM3_4WAY) + init_x13sm3_4way_ctx(); + gate->scanhash = (void*)&scanhash_x13sm3_4way; + gate->hash = (void*)&x13sm3_4way_hash; +#else + init_x13sm3_ctx(); + gate->scanhash = (void*)&scanhash_x13sm3; + gate->hash = (void*)&x13sm3_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->get_max64 = (void*)&get_max64_0x3ffff; + return true; +}; + diff --git a/algo/x13/x13sm3-gate.h b/algo/x13/x13sm3-gate.h new file mode 100644 index 0000000..3e35633 --- /dev/null +++ b/algo/x13/x13sm3-gate.h @@ -0,0 +1,32 @@ +#ifndef X13SM3_GATE_H__ +#define X13SM3_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(HASH_4WAY) && defined(__AES__) + #define X13SM3_4WAY +#endif + +bool register_x13sm3_algo( algo_gate_t* gate ); + +#if defined(X13SM3_4WAY) + +void x13sm3_4way_hash( void *state, const void *input ); + +int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x13sm3_4way_ctx(); + +#endif + +void x13sm3_hash( void *state, const void *input ); + +int scanhash_x13sm3( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x13sm3_ctx(); + +#endif + diff --git a/algo/x13/x13sm3.c b/algo/x13/x13sm3.c index 41556ba..f07c204 100644 --- a/algo/x13/x13sm3.c +++ b/algo/x13/x13sm3.c @@ -1,4 +1,4 @@ -#include "algo-gate-api.h" +#include "x13sm3-gate.h" #include #include @@ -49,7 +49,7 @@ typedef struct { hsr_ctx_holder hsr_ctx; -void init_hsr_ctx() +void init_x13sm3_ctx() { #ifdef NO_AES_NI sph_groestl512_init(&hsr_ctx.groestl); @@ -67,7 +67,7 @@ void init_hsr_ctx() sph_fugue512_init(&hsr_ctx.fugue); }; -static void x13sm3hash(void *output, const void *input) +void x13sm3_hash(void *output, const void *input) { unsigned char hash[128] __attribute__ ((aligned (32))); @@ -213,7 +213,7 @@ int scanhash_x13sm3( int thr_id, struct work *work, do { pdata[19] = ++n; be32enc(&endiandata[19], n); - x13sm3hash(hash64, endiandata); + x13sm3_hash(hash64, endiandata); #ifndef DEBUG_ALGO if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; @@ -240,13 +240,3 @@ int scanhash_x13sm3( int thr_id, struct work *work, return 0; } -bool register_x13sm3_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_hsr_ctx(); - gate->scanhash = (void*)&scanhash_x13sm3; - gate->hash = (void*)&x13sm3hash; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c new file mode 100644 index 0000000..29bf8b3 --- /dev/null +++ b/algo/x14/x14-4way.c @@ -0,0 +1,310 @@ +#include "x14-gate.h" + +#if defined(__AVX2__) && defined(__AES__) + +#include +#include +#include +#include +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/groestl/aes_ni/hash-groestl.h" +#include "algo/skein/skein-hash-4way.h" +#include 
"algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/simd/sse2/nist.h" +#include "algo/echo/aes_ni/hash_api.h" +#include "algo/echo/sph_echo.h" +#include "algo/hamsi/sph_hamsi.h" +#include "algo/fugue/sph_fugue.h" +#include "algo/shabal/sph_shabal.h" + +typedef struct { + blake512_4way_context blake; + sph_bmw512_context bmw; + hashState_groestl groestl; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hashState_echo echo; + sph_hamsi512_context hamsi; + sph_fugue512_context fugue; + sph_shabal512_context shabal; +} x14_4way_ctx_holder; + +x14_4way_ctx_holder x14_4way_ctx __attribute__ ((aligned (64))); + +void init_x14_4way_ctx() +{ + blake512_4way_init( &x14_4way_ctx.blake ); + sph_bmw512_init( &x14_4way_ctx.bmw ); + init_groestl( &x14_4way_ctx.groestl, 64 ); + skein512_4way_init( &x14_4way_ctx.skein ); + jh512_4way_init( &x14_4way_ctx.jh ); + keccak512_4way_init( &x14_4way_ctx.keccak ); + init_luffa( &x14_4way_ctx.luffa, 512 ); + cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x14_4way_ctx.shavite ); + init_sd( &x14_4way_ctx.simd, 512 ); + init_echo( &x14_4way_ctx.echo, 512 ); + sph_hamsi512_init( &x14_4way_ctx.hamsi ); + sph_fugue512_init( &x14_4way_ctx.fugue ); + sph_shabal512_init( &x14_4way_ctx.shabal ); +}; + +void x14_4way_hash( void *state, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*4] __attribute__ ((aligned (64))); + x14_4way_ctx_holder ctx; + memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) ); + + // 1 Blake + blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_close( &ctx.blake, vhash ); + + // Serial + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 2 Bmw + sph_bmw512( &ctx.bmw, hash0, 64 ); + sph_bmw512_close( &ctx.bmw, hash0 ); + memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash1, 64 ); + sph_bmw512_close( &ctx.bmw, hash1 ); + memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash2, 64 ); + sph_bmw512_close( &ctx.bmw, hash2 ); + memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash3, 64 ); + sph_bmw512_close( &ctx.bmw, hash3 ); + + // 3 Groestl + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + + // Parallel 4way + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + // 4 Skein + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + // 5 JH + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + // 6 Keccak + keccak512_4way( &ctx.keccak, 
vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + + // Serial to the end + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 7 Luffa + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)hash0, 64 ); + memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, 64 ); + memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, 64 ); + memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, 64 ); + + // 8 Cubehash + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); + memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); + + // 9 Shavite + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x14_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x14_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x14_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + + // 10 Simd + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + + // 11 Echo + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + + // 12 Hamsi + sph_hamsi512( &ctx.hamsi, hash0, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash0 ); + memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash1, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash1 ); + memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash2, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash2 ); + memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + 
sph_hamsi512( &ctx.hamsi, hash3, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash3 ); + + // 13 Fugue + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + + // 14 Shabal + sph_shabal512( &ctx.shabal, hash0, 64 ); + sph_shabal512_close( &ctx.shabal, hash0 ); + memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash1, 64 ); + sph_shabal512_close( &ctx.shabal, hash1 ); + memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash2, 64 ); + sph_shabal512_close( &ctx.shabal, hash2 ); + memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash3, 64 ); + sph_shabal512_close( &ctx.shabal, hash3 ); + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + for ( int m=0; m < 6; m++ ) + if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + x14_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + break; + } 
+ + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x14/x14-gate.c b/algo/x14/x14-gate.c new file mode 100644 index 0000000..3c0a0a8 --- /dev/null +++ b/algo/x14/x14-gate.c @@ -0,0 +1,18 @@ +#include "x14-gate.h" + +bool register_x14_algo( algo_gate_t* gate ) +{ +#if defined (X14_4WAY) + init_x14_4way_ctx(); + gate->scanhash = (void*)&scanhash_x14_4way; + gate->hash = (void*)&x14_4way_hash; +#else + init_x14_ctx(); + gate->scanhash = (void*)&scanhash_x14; + gate->hash = (void*)&x14hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->get_max64 = (void*)&get_max64_0x3ffff; + return true; +}; + diff --git a/algo/x14/x14-gate.h b/algo/x14/x14-gate.h new file mode 100644 index 0000000..127c101 --- /dev/null +++ b/algo/x14/x14-gate.h @@ -0,0 +1,32 @@ +#ifndef X14_GATE_H__ +#define X14_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(HASH_4WAY) && defined(__AES__) + #define X14_4WAY +#endif + +bool register_x14_algo( algo_gate_t* gate ); + +#if defined(X14_4WAY) + +void x14_4way_hash( void *state, const void *input ); + +int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x14_4way_ctx(); + +#endif + +void x14hash( void *state, const void *input ); + +int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x14_ctx(); + +#endif + diff --git a/algo/x14/x14.c b/algo/x14/x14.c index e76e820..d53f919 100644 --- a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -1,4 +1,4 @@ -#include "algo-gate-api.h" +#include "x14-gate.h" #include #include @@ -72,7 +72,7 @@ void init_x14_ctx() sph_shabal512_init(&x14_ctx.shabal); }; -static void x14hash(void *output, const void *input) +void x14hash(void *output, const void *input) { unsigned char hash[128] __attribute__ ((aligned (32))); #define hashB hash+64 @@ -248,14 +248,3 @@ int scanhash_x14(int thr_id, struct work *work, pdata[19] = n; return 0; } - -bool register_x14_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_x14_ctx(); - gate->scanhash = (void*)&scanhash_x14; - gate->hash = (void*)&x14hash; - gate->get_max64 = (void*)&get_max64_0x3ffff; - return true; -}; - diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c new file mode 100644 index 0000000..322a04e --- /dev/null +++ b/algo/x15/x15-4way.c @@ -0,0 +1,329 @@ +#include "x15-gate.h" + +#if defined(__AVX2__) && defined(__AES__) + +#include +#include +#include +#include +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/groestl/aes_ni/hash-groestl.h" +#include "algo/skein/skein-hash-4way.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/simd/sse2/nist.h" +#include "algo/echo/aes_ni/hash_api.h" +#include "algo/echo/sph_echo.h" +#include "algo/hamsi/sph_hamsi.h" +#include "algo/fugue/sph_fugue.h" +#include "algo/shabal/sph_shabal.h" +#include "algo/whirlpool/sph_whirlpool.h" + +typedef struct { + blake512_4way_context blake; + sph_bmw512_context bmw; + hashState_groestl groestl; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hashState_echo echo; + sph_hamsi512_context hamsi; + sph_fugue512_context fugue; 
+ sph_shabal512_context shabal; + sph_whirlpool_context whirlpool; +} x15_4way_ctx_holder; + +x15_4way_ctx_holder x15_4way_ctx __attribute__ ((aligned (64))); + +void init_x15_4way_ctx() +{ + blake512_4way_init( &x15_4way_ctx.blake ); + sph_bmw512_init( &x15_4way_ctx.bmw ); + init_groestl( &x15_4way_ctx.groestl, 64 ); + skein512_4way_init( &x15_4way_ctx.skein ); + jh512_4way_init( &x15_4way_ctx.jh ); + keccak512_4way_init( &x15_4way_ctx.keccak ); + init_luffa( &x15_4way_ctx.luffa, 512 ); + cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x15_4way_ctx.shavite ); + init_sd( &x15_4way_ctx.simd, 512 ); + init_echo( &x15_4way_ctx.echo, 512 ); + sph_hamsi512_init( &x15_4way_ctx.hamsi ); + sph_fugue512_init( &x15_4way_ctx.fugue ); + sph_shabal512_init( &x15_4way_ctx.shabal ); + sph_whirlpool_init( &x15_4way_ctx.whirlpool ); +}; + +void x15_4way_hash( void *state, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*4] __attribute__ ((aligned (64))); + x15_4way_ctx_holder ctx; + memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) ); + + // 1 Blake + blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_close( &ctx.blake, vhash ); + + // Serial + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 2 Bmw + sph_bmw512( &ctx.bmw, hash0, 64 ); + sph_bmw512_close( &ctx.bmw, hash0 ); + memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash1, 64 ); + sph_bmw512_close( &ctx.bmw, hash1 ); + memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash2, 64 ); + sph_bmw512_close( &ctx.bmw, hash2 ); + memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash3, 64 ); + sph_bmw512_close( &ctx.bmw, hash3 ); + + // 3 Groestl + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + + // Parallel 4way + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + // 4 Skein + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + // 5 JH + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + // 6 Keccak + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + + // Serial to the end + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 7 Luffa + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)hash0, 64 ); + memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, 64 ); + memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, 64 ); + memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, 
(BitSequence*)hash3, + (const BitSequence*)hash3, 64 ); + + // 8 Cubehash + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); + memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); + + // 9 Shavite + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x15_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x15_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x15_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + + // 10 Simd + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + + // 11 Echo + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + + // 12 Hamsi + sph_hamsi512( &ctx.hamsi, hash0, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash0 ); + memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash1, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash1 ); + memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash2, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash2 ); + memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash3, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash3 ); + + // 13 Fugue + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + + // 14 Shabal + sph_shabal512( 
&ctx.shabal, hash0, 64 ); + sph_shabal512_close( &ctx.shabal, hash0 ); + memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash1, 64 ); + sph_shabal512_close( &ctx.shabal, hash1 ); + memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash2, 64 ); + sph_shabal512_close( &ctx.shabal, hash2 ); + memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash3, 64 ); + sph_shabal512_close( &ctx.shabal, hash3 ); + + // 15 Whirlpool + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + for ( int m=0; m < 6; m++ ) + if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + x15_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + break; + } + + *hashes_done = n - first_nonce + 1; 
+ return num_found; +} + +#endif diff --git a/algo/x15/x15-gate.c b/algo/x15/x15-gate.c new file mode 100644 index 0000000..75d6134 --- /dev/null +++ b/algo/x15/x15-gate.c @@ -0,0 +1,17 @@ +#include "x15-gate.h" + +bool register_x15_algo( algo_gate_t* gate ) +{ +#if defined (X15_4WAY) + init_x15_4way_ctx(); + gate->scanhash = (void*)&scanhash_x15_4way; + gate->hash = (void*)&x15_4way_hash; +#else + init_x15_ctx(); + gate->scanhash = (void*)&scanhash_x15; + gate->hash = (void*)&x15hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + return true; +}; + diff --git a/algo/x15/x15-gate.h b/algo/x15/x15-gate.h new file mode 100644 index 0000000..5af0043 --- /dev/null +++ b/algo/x15/x15-gate.h @@ -0,0 +1,32 @@ +#ifndef X15_GATE_H__ +#define X15_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(HASH_4WAY) && defined(__AES__) + #define X15_4WAY +#endif + +bool register_x15_algo( algo_gate_t* gate ); + +#if defined(X15_4WAY) + +void x15_4way_hash( void *state, const void *input ); + +int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x15_4way_ctx(); + +#endif + +void x15hash( void *state, const void *input ); + +int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x15_ctx(); + +#endif + diff --git a/algo/x15/x15.c b/algo/x15/x15.c index b654fed..5e4dd36 100644 --- a/algo/x15/x15.c +++ b/algo/x15/x15.c @@ -1,4 +1,4 @@ -#include "algo-gate-api.h" +#include "x15-gate.h" #include #include @@ -74,7 +74,7 @@ void init_x15_ctx() sph_whirlpool_init( &x15_ctx.whirlpool ); }; -static void x15hash(void *output, const void *input) +void x15hash(void *output, const void *input) { unsigned char hash[128] __attribute__ ((aligned (32))); #define hashB hash+64 @@ -260,13 +260,3 @@ int scanhash_x15(int thr_id, struct work *work, pdata[19] = n; return 0; } - -bool register_x15_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_x15_ctx(); - gate->scanhash = (void*)&scanhash_x15; - gate->hash = (void*)&x15hash; - return true; -}; - diff --git a/algo/hmq1725.c b/algo/x17/hmq1725.c similarity index 100% rename from algo/hmq1725.c rename to algo/x17/hmq1725.c diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c new file mode 100644 index 0000000..41f088d --- /dev/null +++ b/algo/x17/x17-4way.c @@ -0,0 +1,364 @@ +#include "x17-gate.h" + +#if defined(__AVX2__) && defined(__AES__) + +#include +#include +#include +#include +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/groestl/aes_ni/hash-groestl.h" +#include "algo/skein/skein-hash-4way.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/simd/sse2/nist.h" +#include "algo/echo/aes_ni/hash_api.h" +#include "algo/echo/sph_echo.h" +#include "algo/hamsi/sph_hamsi.h" +#include "algo/fugue/sph_fugue.h" +#include "algo/shabal/sph_shabal.h" +#include "algo/whirlpool/sph_whirlpool.h" +#include "algo/haval/sph-haval.h" +#include + +typedef struct { + blake512_4way_context blake; + sph_bmw512_context bmw; + hashState_groestl groestl; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hashState_echo echo; + 
sph_hamsi512_context hamsi; + sph_fugue512_context fugue; + sph_shabal512_context shabal; + sph_whirlpool_context whirlpool; + SHA512_CTX sha512; + sph_haval256_5_context haval; +} x17_4way_ctx_holder; + +x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64))); + +void init_x17_4way_ctx() +{ + blake512_4way_init( &x17_4way_ctx.blake ); + sph_bmw512_init( &x17_4way_ctx.bmw ); + init_groestl( &x17_4way_ctx.groestl, 64 ); + skein512_4way_init( &x17_4way_ctx.skein ); + jh512_4way_init( &x17_4way_ctx.jh ); + keccak512_4way_init( &x17_4way_ctx.keccak ); + init_luffa( &x17_4way_ctx.luffa, 512 ); + cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x17_4way_ctx.shavite ); + init_sd( &x17_4way_ctx.simd, 512 ); + init_echo( &x17_4way_ctx.echo, 512 ); + sph_hamsi512_init( &x17_4way_ctx.hamsi ); + sph_fugue512_init( &x17_4way_ctx.fugue ); + sph_shabal512_init( &x17_4way_ctx.shabal ); + sph_whirlpool_init( &x17_4way_ctx.whirlpool ); + SHA512_Init( &x17_4way_ctx.sha512 ); + sph_haval256_5_init( &x17_4way_ctx.haval ); +}; + +void x17_4way_hash( void *state, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*4] __attribute__ ((aligned (64))); + x17_4way_ctx_holder ctx; + memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) ); + + // 1 Blake + blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_close( &ctx.blake, vhash ); + + // Serial + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 2 Bmw + sph_bmw512( &ctx.bmw, hash0, 64 ); + sph_bmw512_close( &ctx.bmw, hash0 ); + memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash1, 64 ); + sph_bmw512_close( &ctx.bmw, hash1 ); + memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash2, 64 ); + sph_bmw512_close( &ctx.bmw, hash2 ); + memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash3, 64 ); + sph_bmw512_close( &ctx.bmw, hash3 ); + + // 3 Groestl + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + + // Parallel 4way + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + // 4 Skein + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + // 5 JH + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + // 6 Keccak + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + + // Serial to the end + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + + // 7 Luffa + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)hash0, 64 ); + memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, 64 ); + memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); + 
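+   // Lanes 2 and 3 repeat the Luffa step, restoring the saved state before each lane.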
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, 64 ); + memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, 64 ); + + // 8 Cubehash + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 ); + memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); + + // 9 Shavite + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x17_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x17_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x17_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + + // 10 Simd + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + + // 11 Echo + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + + // 12 Hamsi + sph_hamsi512( &ctx.hamsi, hash0, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash0 ); + memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash1, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash1 ); + memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash2, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash2 ); + memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash3, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash3 ); + + // 13 Fugue + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); 
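+   // Final Fugue lane follows, again from a fresh copy of the initial state.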
+ memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + + // 14 Shabal + sph_shabal512( &ctx.shabal, hash0, 64 ); + sph_shabal512_close( &ctx.shabal, hash0 ); + memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash1, 64 ); + sph_shabal512_close( &ctx.shabal, hash1 ); + memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash2, 64 ); + sph_shabal512_close( &ctx.shabal, hash2 ); + memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash3, 64 ); + sph_shabal512_close( &ctx.shabal, hash3 ); + + // 15 Whirlpool + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + + // 16 SHA512 + SHA512_Update( &ctx.sha512, hash0, 64 ); + SHA512_Final( (unsigned char*)hash0, &ctx.sha512 ); + memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) ); + SHA512_Update( &ctx.sha512, hash1, 64 ); + SHA512_Final( (unsigned char*)hash1, &ctx.sha512 ); + memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) ); + SHA512_Update( &ctx.sha512, hash2, 64 ); + SHA512_Final( (unsigned char*)hash2, &ctx.sha512 ); + memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) ); + SHA512_Update( &ctx.sha512, hash3, 64 ); + SHA512_Final( (unsigned char*)hash3, &ctx.sha512 ); + + // 17 Haval + sph_haval256_5( &ctx.haval, (const void*)hash0, 64 ); + sph_haval256_5_close( &ctx.haval, hash0 ); + memcpy( &ctx.haval, &x17_4way_ctx.haval, + sizeof(sph_haval256_5_context) ); + sph_haval256_5( &ctx.haval, (const void*)hash1, 64 ); + sph_haval256_5_close( &ctx.haval, hash1 ); + memcpy( &ctx.haval, &x17_4way_ctx.haval, + sizeof(sph_haval256_5_context) ); + sph_haval256_5( &ctx.haval, (const void*)hash2, 64 ); + sph_haval256_5_close( &ctx.haval, hash2 ); + memcpy( &ctx.haval, &x17_4way_ctx.haval, + sizeof(sph_haval256_5_context) ); + sph_haval256_5( &ctx.haval, (const void*)hash3, 64 ); + sph_haval256_5_close( &ctx.haval, hash3 ); + + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); +} + +int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 
}; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + for ( int m=0; m < 6; m++ ) + if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + x17_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + break; + } + + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c new file mode 100644 index 0000000..f28d686 --- /dev/null +++ b/algo/x17/x17-gate.c @@ -0,0 +1,17 @@ +#include "x17-gate.h" + +bool register_x17_algo( algo_gate_t* gate ) +{ +#if defined (X17_4WAY) + init_x17_4way_ctx(); + gate->scanhash = (void*)&scanhash_x17_4way; + gate->hash = (void*)&x17_4way_hash; +#else + init_x17_ctx(); + gate->scanhash = (void*)&scanhash_x17; + gate->hash = (void*)&x17_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + return true; +}; + diff --git a/algo/x17/x17-gate.h b/algo/x17/x17-gate.h new file mode 100644 index 0000000..7767fd0 --- /dev/null +++ b/algo/x17/x17-gate.h @@ -0,0 +1,32 @@ +#ifndef X17_GATE_H__ +#define X17_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(HASH_4WAY) && defined(__AES__) + #define X17_4WAY +#endif + +bool register_x17_algo( algo_gate_t* gate ); + +#if defined(X17_4WAY) + +void x17_4way_hash( void *state, const void *input ); + +int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x17_4way_ctx(); + +#endif + +void x17_hash( void *state, const void *input ); + +int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x17_ctx(); + +#endif + diff --git a/algo/x17/x17.c b/algo/x17/x17.c index ff3a876..a377492 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -1,4 +1,4 @@ -#include "algo-gate-api.h" +#include "x17-gate.h" #include #include #include @@ -86,7 +86,7 @@ void init_x17_ctx() sph_haval256_5_init(&x17_ctx.haval); }; -static void x17hash(void *output, const void *input) +void x17_hash(void *output, const void *input) { unsigned char hash[128] __attribute__ ((aligned (64))); #define hashB hash+64 @@ -248,7 +248,7 @@ int scanhash_x17(int thr_id, struct work *work, do { pdata[19] = ++n; be32enc(&endiandata[19], n); - x17hash(hash64, endiandata); + x17_hash(hash64, endiandata); #ifndef DEBUG_ALGO if (!(hash64[7] & mask)) { @@ -281,7 +281,7 @@ int 
scanhash_x17(int thr_id, struct work *work, pdata[19] = n; return 0; } - +/* bool register_x17_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; @@ -290,4 +290,4 @@ bool register_x17_algo( algo_gate_t* gate ) gate->hash = (void*)&x17hash; return true; }; - +*/ diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c new file mode 100644 index 0000000..c85b594 --- /dev/null +++ b/algo/x17/xevan-4way.c @@ -0,0 +1,556 @@ +#include "xevan-gate.h" + +#if defined(__AVX2__) && defined(__AES__) + +#include +#include +#include +#include +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/groestl/aes_ni/hash-groestl.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/skein/skein-hash-4way.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/simd/sse2/nist.h" +#include "algo/echo/aes_ni/hash_api.h" +#include "algo/hamsi/sph_hamsi.h" +#include "algo/fugue/sph_fugue.h" +#include "algo/shabal/sph_shabal.h" +#include "algo/whirlpool/sph_whirlpool.h" +#include "algo/sha/sph_sha2.h" +#include "algo/haval/sph-haval.h" +#include + +typedef struct { + blake512_4way_context blake; + sph_bmw512_context bmw; + hashState_groestl groestl; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hashState_echo echo; + sph_hamsi512_context hamsi; + sph_fugue512_context fugue; + sph_shabal512_context shabal; + sph_whirlpool_context whirlpool; + SHA512_CTX sha512; + sph_haval256_5_context haval; +} xevan_4way_ctx_holder; + +xevan_4way_ctx_holder xevan_4way_ctx __attribute__ ((aligned (64))); +static __thread blake512_4way_context xevan_blake_4way_mid + __attribute__ ((aligned (64))); + +void init_xevan_4way_ctx() +{ + blake512_4way_init(&xevan_4way_ctx.blake); + sph_bmw512_init(&xevan_4way_ctx.bmw); + init_groestl( &xevan_4way_ctx.groestl, 64 ); + skein512_4way_init(&xevan_4way_ctx.skein); + jh512_4way_init(&xevan_4way_ctx.jh); + keccak512_4way_init(&xevan_4way_ctx.keccak); + init_luffa( &xevan_4way_ctx.luffa, 512 ); + cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &xevan_4way_ctx.shavite ); + init_sd( &xevan_4way_ctx.simd, 512 ); + init_echo( &xevan_4way_ctx.echo, 512 ); + sph_hamsi512_init( &xevan_4way_ctx.hamsi ); + sph_fugue512_init( &xevan_4way_ctx.fugue ); + sph_shabal512_init( &xevan_4way_ctx.shabal ); + sph_whirlpool_init( &xevan_4way_ctx.whirlpool ); + SHA512_Init( &xevan_4way_ctx.sha512 ); + sph_haval256_5_init( &xevan_4way_ctx.haval ); +}; + +void xevan_4way_blake512_midstate( const void* input ) +{ + memcpy( &xevan_blake_4way_mid, &xevan_4way_ctx.blake, + sizeof(xevan_blake_4way_mid) ); + blake512_4way( &xevan_blake_4way_mid, input, 64 ); +} + +void xevan_4way_hash( void *output, const void *input ) +{ + uint64_t hash0[16] __attribute__ ((aligned (64))); + uint64_t hash1[16] __attribute__ ((aligned (64))); + uint64_t hash2[16] __attribute__ ((aligned (64))); + uint64_t hash3[16] __attribute__ ((aligned (64))); + uint64_t vhash[16<<2] __attribute__ ((aligned (64))); + const int dataLen = 128; + const int midlen = 64; // bytes + const int tail = 80 - midlen; // 16 + xevan_4way_ctx_holder ctx __attribute__ ((aligned (64))); + memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) ); + + memcpy( &ctx.blake, &xevan_blake_4way_mid, + 
sizeof(xevan_blake_4way_mid) ); + blake512_4way( &ctx.blake, input + (midlen<<2), tail ); + blake512_4way_close(&ctx.blake, vhash); + + memset( &vhash[8<<2], 0, 64<<2 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + + sph_bmw512( &ctx.bmw, hash0, dataLen ); + sph_bmw512_close( &ctx.bmw, hash0 ); + memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash1, dataLen ); + sph_bmw512_close( &ctx.bmw, hash1 ); + memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash2, dataLen ); + sph_bmw512_close( &ctx.bmw, hash2 ); + memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash3, dataLen ); + sph_bmw512_close( &ctx.bmw, hash3 ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, + dataLen<<3 ); + memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, + dataLen<<3 ); + memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, + dataLen<<3 ); + memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, + dataLen<<3 ); + + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + + skein512_4way( &ctx.skein, vhash, dataLen ); + skein512_4way_close( &ctx.skein, vhash ); + + jh512_4way( &ctx.jh, vhash, dataLen ); + jh512_4way_close( &ctx.jh, vhash ); + + keccak512_4way( &ctx.keccak, vhash, dataLen ); + keccak512_4way_close( &ctx.keccak, vhash ); + + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)hash0, dataLen ); + memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, dataLen ); + memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, dataLen ); + memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, dataLen ); + + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, + dataLen ); + memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, + dataLen ); + memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, + dataLen ); + memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, + dataLen ); + + sph_shavite512( &ctx.shavite, hash0, dataLen ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, dataLen ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, dataLen ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, dataLen ); + 
sph_shavite512_close( &ctx.shavite, hash3 ); + + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, dataLen<<3 ); + memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, dataLen<<3 ); + memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, dataLen<<3 ); + memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, dataLen<<3 ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, dataLen<<3 ); + memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, dataLen<<3 ); + memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, dataLen<<3 ); + memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, dataLen<<3 ); + + sph_hamsi512( &ctx.hamsi, hash0, dataLen ); + sph_hamsi512_close( &ctx.hamsi, hash0 ); + memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash1, dataLen ); + sph_hamsi512_close( &ctx.hamsi, hash1 ); + memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash2, dataLen ); + sph_hamsi512_close( &ctx.hamsi, hash2 ); + memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash3, dataLen ); + sph_hamsi512_close( &ctx.hamsi, hash3 ); + + sph_fugue512( &ctx.fugue, hash0, dataLen ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, dataLen ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, dataLen ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, dataLen ); + sph_fugue512_close( &ctx.fugue, hash3 ); + + sph_shabal512( &ctx.shabal, hash0, dataLen ); + sph_shabal512_close( &ctx.shabal, hash0 ); + memcpy( &ctx.shabal, &xevan_4way_ctx.shabal, + sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash1, dataLen ); + sph_shabal512_close( &ctx.shabal, hash1 ); + memcpy( &ctx.shabal, &xevan_4way_ctx.shabal, + sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash2, dataLen ); + sph_shabal512_close( &ctx.shabal, hash2 ); + memcpy( &ctx.shabal, &xevan_4way_ctx.shabal, + sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash3, dataLen ); + sph_shabal512_close( &ctx.shabal, hash3 ); + + sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx.whirlpool, 
&xevan_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + + SHA512_Update( &ctx.sha512, hash0, dataLen ); + SHA512_Final( (unsigned char*)hash0, &ctx.sha512 ); + memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); + SHA512_Update( &ctx.sha512, hash1, dataLen ); + SHA512_Final( (unsigned char*)hash1, &ctx.sha512 ); + memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); + SHA512_Update( &ctx.sha512, hash2, dataLen ); + SHA512_Final( (unsigned char*)hash2, &ctx.sha512 ); + memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); + SHA512_Update( &ctx.sha512, hash3, dataLen ); + SHA512_Final( (unsigned char*)hash3, &ctx.sha512 ); + + sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen ); + sph_haval256_5_close( &ctx.haval, hash0 ); + memcpy( &ctx.haval, &xevan_4way_ctx.haval, + sizeof(sph_haval256_5_context) ); + sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen ); + sph_haval256_5_close( &ctx.haval, hash1 ); + memcpy( &ctx.haval, &xevan_4way_ctx.haval, + sizeof(sph_haval256_5_context) ); + sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen ); + sph_haval256_5_close( &ctx.haval, hash2 ); + memcpy( &ctx.haval, &xevan_4way_ctx.haval, + sizeof(sph_haval256_5_context) ); + sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen ); + sph_haval256_5_close( &ctx.haval, hash3 ); + + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 ); + memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) ); + + blake512_4way( &ctx.blake, vhash, dataLen ); + blake512_4way_close(&ctx.blake, vhash); + + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + + sph_bmw512( &ctx.bmw, hash0, dataLen ); + sph_bmw512_close( &ctx.bmw, hash0 ); + memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash1, dataLen ); + sph_bmw512_close( &ctx.bmw, hash1 ); + memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash2, dataLen ); + sph_bmw512_close( &ctx.bmw, hash2 ); + memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) ); + sph_bmw512( &ctx.bmw, hash3, dataLen ); + sph_bmw512_close( &ctx.bmw, hash3 ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, + dataLen<<3 ); + memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, + dataLen<<3 ); + memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, + dataLen<<3 ); + memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, + dataLen<<3 ); + + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + + skein512_4way( &ctx.skein, vhash, dataLen ); + skein512_4way_close( &ctx.skein, vhash ); + + jh512_4way( &ctx.jh, vhash, dataLen ); + jh512_4way_close( &ctx.jh, vhash ); + + keccak512_4way( &ctx.keccak, vhash, dataLen ); + keccak512_4way_close( &ctx.keccak, vhash ); + + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)hash0, dataLen ); + memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( 
&ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)hash1, dataLen ); + memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)hash2, dataLen ); + memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)hash3, dataLen ); + + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, + dataLen ); + memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, + dataLen ); + memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, + dataLen ); + memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, + dataLen ); + + sph_shavite512( &ctx.shavite, hash0, dataLen ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, dataLen ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, dataLen ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &xevan_4way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, dataLen ); + sph_shavite512_close( &ctx.shavite, hash3 ); + + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence *)hash0, dataLen<<3 ); + memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence *)hash1, dataLen<<3 ); + memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence *)hash2, dataLen<<3 ); + memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence *)hash3, dataLen<<3 ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, dataLen<<3 ); + memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, dataLen<<3 ); + memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, dataLen<<3 ); + memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, dataLen<<3 ); + + sph_hamsi512( &ctx.hamsi, hash0, dataLen ); + sph_hamsi512_close( &ctx.hamsi, hash0 ); + memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash1, dataLen ); + sph_hamsi512_close( &ctx.hamsi, hash1 ); + memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash2, dataLen ); + sph_hamsi512_close( &ctx.hamsi, hash2 ); + memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); + sph_hamsi512( &ctx.hamsi, hash3, dataLen ); + sph_hamsi512_close( &ctx.hamsi, hash3 ); + + sph_fugue512( &ctx.fugue, hash0, dataLen ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + 
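+   // Remaining Fugue lanes of the second pass, each restored from the initial state.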
sph_fugue512( &ctx.fugue, hash1, dataLen ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, dataLen ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, dataLen ); + sph_fugue512_close( &ctx.fugue, hash3 ); + + sph_shabal512( &ctx.shabal, hash0, dataLen ); + sph_shabal512_close( &ctx.shabal, hash0 ); + memcpy( &ctx.shabal, &xevan_4way_ctx.shabal, + sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash1, dataLen ); + sph_shabal512_close( &ctx.shabal, hash1 ); + memcpy( &ctx.shabal, &xevan_4way_ctx.shabal, + sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash2, dataLen ); + sph_shabal512_close( &ctx.shabal, hash2 ); + memcpy( &ctx.shabal, &xevan_4way_ctx.shabal, + sizeof(sph_shabal512_context) ); + sph_shabal512( &ctx.shabal, hash3, dataLen ); + sph_shabal512_close( &ctx.shabal, hash3 ); + + sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + + SHA512_Update( &ctx.sha512, hash0, dataLen ); + SHA512_Final( (unsigned char*)hash0, &ctx.sha512 ); + memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); + SHA512_Update( &ctx.sha512, hash1, dataLen ); + SHA512_Final( (unsigned char*)hash1, &ctx.sha512 ); + memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); + SHA512_Update( &ctx.sha512, hash2, dataLen ); + SHA512_Final( (unsigned char*)hash2, &ctx.sha512 ); + memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); + SHA512_Update( &ctx.sha512, hash3, dataLen ); + SHA512_Final( (unsigned char*)hash3, &ctx.sha512 ); + + sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen ); + sph_haval256_5_close( &ctx.haval, hash0 ); + memcpy( &ctx.haval, &xevan_4way_ctx.haval, + sizeof(sph_haval256_5_context) ); + sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen ); + sph_haval256_5_close( &ctx.haval, hash1 ); + memcpy( &ctx.haval, &xevan_4way_ctx.haval, + sizeof(sph_haval256_5_context) ); + sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen ); + sph_haval256_5_close( &ctx.haval, hash2 ); + memcpy( &ctx.haval, &xevan_4way_ctx.haval, + sizeof(sph_haval256_5_context) ); + sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen ); + sph_haval256_5_close( &ctx.haval, hash3 ); + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); + memcpy( output+64, hash2, 32 ); + memcpy( output+96, hash3, 32 ); +} + +int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); +// uint32_t _ALIGN(64) hash[8]; + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = 
pdata[19]; + uint32_t n = first_nonce; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + + if ( opt_benchmark ) + ptarget[7] = 0x0cff; + + for ( int k=0; k < 19; k++ ) + be32enc( &endiandata[k], pdata[k] ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + xevan_4way_blake512_midstate( vdata ); + + do { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + xevan_4way_hash( hash, vdata ); + + pdata[19] = n; + + if ( ( hash[7] <= Htarg ) && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+8)[7] <= Htarg ) && fulltest( hash+8, ptarget ) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + if ( ( (hash+16)[7] <= Htarg ) && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash+16 ); + } + if ( ( (hash+24)[7] <= Htarg ) && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c new file mode 100644 index 0000000..b76cdb4 --- /dev/null +++ b/algo/x17/xevan-gate.c @@ -0,0 +1,24 @@ +#include "xevan-gate.h" + +void xevan_set_target( struct work* work, double job_diff ) +{ + work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); +} + +bool register_xevan_algo( algo_gate_t* gate ) +{ +#if defined (XEVAN_4WAY) + init_xevan_4way_ctx(); + gate->scanhash = (void*)&scanhash_xevan_4way; + gate->hash = (void*)&xevan_4way_hash; +#else + init_xevan_ctx(); + gate->scanhash = (void*)&scanhash_xevan; + gate->hash = (void*)&xevan_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->set_target = (void*)&xevan_set_target; + gate->get_max64 = (void*)&get_max64_0xffffLL; + return true; +}; + diff --git a/algo/x17/xevan-gate.h b/algo/x17/xevan-gate.h new file mode 100644 index 0000000..51f7716 --- /dev/null +++ b/algo/x17/xevan-gate.h @@ -0,0 +1,32 @@ +#ifndef XEVAN_GATE_H__ +#define XEVAN_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(HASH_4WAY) && defined(__AES__) + #define XEVAN_4WAY +#endif + +bool register_xevan_algo( algo_gate_t* gate ); + +#if defined(XEVAN_4WAY) + +void xevan_4way_hash( void *state, const void *input ); + +int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_xevan_4way_ctx(); + +#endif + +void xevan_hash( void *state, const void *input ); + +int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_xevan_ctx(); + +#endif + diff --git a/algo/xevan.c b/algo/x17/xevan.c similarity index 94% rename from algo/xevan.c rename to algo/x17/xevan.c index a5c225f..f3c4f9d 100644 --- a/algo/xevan.c +++ b/algo/x17/xevan.c @@ -1,4 +1,4 @@ -#include "algo-gate-api.h" +#include "xevan-gate.h" #include #include @@ -286,19 +286,3 @@ int scanhash_xevan(int 
thr_id, struct work *work, uint32_t max_nonce, uint64_t * return 0; } -void xevan_set_target( struct work* work, double job_diff ) -{ - work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); -} - -bool register_xevan_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_xevan_ctx(); - gate->scanhash = (void*)&scanhash_xevan; - gate->hash = (void*)&xevan_hash; - gate->set_target = (void*)&xevan_set_target; - gate->get_max64 = (void*)&get_max64_0xffffLL; - return true; -}; - diff --git a/algo/yescrypt/yescrypt-opt.c b/algo/yescrypt/yescrypt-opt.c deleted file mode 100644 index fb53573..0000000 --- a/algo/yescrypt/yescrypt-opt.c +++ /dev/null @@ -1,935 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2013,2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. 
- */ - -#include -#include -#include - -#include "sha256_Y.h" -#include "sysendian.h" - -#include "yescrypt-platform.h" - -static __inline void blkcpy(uint64_t * dest, const uint64_t * src, size_t count) -{ - do { - *dest++ = *src++; *dest++ = *src++; - *dest++ = *src++; *dest++ = *src++; - } while (count -= 4); -} - -static __inline void blkxor(uint64_t * dest, const uint64_t * src, size_t count) -{ - do { - *dest++ ^= *src++; *dest++ ^= *src++; - *dest++ ^= *src++; *dest++ ^= *src++; - } while (count -= 4); -} - -typedef union { - uint32_t w[16]; - uint64_t d[8]; -} salsa20_blk_t; - -static __inline void salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) -{ -#define COMBINE(out, in1, in2) \ - Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); - COMBINE(0, 0, 2) - COMBINE(1, 5, 7) - COMBINE(2, 2, 4) - COMBINE(3, 7, 1) - COMBINE(4, 4, 6) - COMBINE(5, 1, 3) - COMBINE(6, 6, 0) - COMBINE(7, 3, 5) -#undef COMBINE -} - -static __inline void salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) -{ -#define COMBINE(out, in1, in2) \ - Bout->w[out * 2] = (uint32_t) Bin->d[in1]; \ - Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; - COMBINE(0, 0, 6) - COMBINE(1, 5, 3) - COMBINE(2, 2, 0) - COMBINE(3, 7, 5) - COMBINE(4, 4, 2) - COMBINE(5, 1, 7) - COMBINE(6, 6, 4) - COMBINE(7, 3, 1) -#undef COMBINE -} - -/** - * salsa20_8(B): - * Apply the salsa20/8 core to the provided block. - */ -static void salsa20_8(uint64_t B[8]) -{ - size_t i; - salsa20_blk_t X; -#define x X.w - - salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X); - - for (i = 0; i < 8; i += 2) { -#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns */ - x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); - x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); - - x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); - x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); - - x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); - x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); - - x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); - x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); - - /* Operate on rows */ - x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); - x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); - - x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); - x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); - - x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); - x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); - - x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); - x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); -#undef R - } -#undef x - - { - salsa20_blk_t Y; - salsa20_simd_shuffle(&X, &Y); - for (i = 0; i < 16; i += 4) { - ((salsa20_blk_t *)B)->w[i] += Y.w[i]; - ((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1]; - ((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2]; - ((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3]; - } - } -} - -/** - * blockmix_salsa8(Bin, Bout, X, r): - * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r - * bytes in length; the output Bout must also be the same size. The - * temporary space X must be 64 bytes. 
- */ -static void -blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r) -{ - size_t i; - - /* 1: X <-- B_{2r - 1} */ - blkcpy(X, &Bin[(2 * r - 1) * 8], 8); - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < 2 * r; i += 2) { - /* 3: X <-- H(X \xor B_i) */ - blkxor(X, &Bin[i * 8], 8); - salsa20_8(X); - - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - blkcpy(&Bout[i * 4], X, 8); - - /* 3: X <-- H(X \xor B_i) */ - blkxor(X, &Bin[i * 8 + 8], 8); - salsa20_8(X); - - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - blkcpy(&Bout[i * 4 + r * 8], X, 8); - } -} - -/* These are tunable */ -#define S_BITS 8 -#define S_SIMD 2 -#define S_P 4 -#define S_ROUNDS 6 - -/* Number of S-boxes. Not tunable, hard-coded in a few places. */ -#define S_N 2 - -/* Derived values. Not tunable on their own. */ -#define S_SIZE1 (1 << S_BITS) -#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) -#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) -#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD) -#define S_P_SIZE (S_P * S_SIMD) -#define S_MIN_R ((S_P * S_SIMD + 15) / 16) - -/** - * pwxform(B): - * Transform the provided block using the provided S-boxes. - */ -static void block_pwxform(uint64_t * B, const uint64_t * S) -{ - uint64_t (*X)[S_SIMD] = (uint64_t (*)[S_SIMD])B; - const uint8_t *S0 = (const uint8_t *)S; - const uint8_t *S1 = (const uint8_t *)(S + S_SIZE1 * S_SIMD); - size_t i, j; -#if S_SIMD > 2 - size_t k; -#endif - - for (j = 0; j < S_P; j++) { - uint64_t *Xj = X[j]; - uint64_t x0 = Xj[0]; -#if S_SIMD > 1 - uint64_t x1 = Xj[1]; -#endif - - for (i = 0; i < S_ROUNDS; i++) { - uint64_t x = x0 & S_MASK2; - const uint64_t *p0, *p1; - - p0 = (const uint64_t *)(S0 + (uint32_t)x); - p1 = (const uint64_t *)(S1 + (x >> 32)); - - x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0; - x0 += p0[0]; - x0 ^= p1[0]; - -#if S_SIMD > 1 - x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1; - x1 += p0[1]; - x1 ^= p1[1]; -#endif - -#if S_SIMD > 2 - for (k = 2; k < S_SIMD; k++) { - x = Xj[k]; - - x = (uint64_t)(x >> 32) * (uint32_t)x; - x += p0[k]; - x ^= p1[k]; - - Xj[k] = x; - } -#endif - } - - Xj[0] = x0; -#if S_SIMD > 1 - Xj[1] = x1; -#endif - } -} - -/** - * blockmix_pwxform(Bin, Bout, S, r): - * Compute Bout = BlockMix_pwxform{salsa20/8, S, r}(Bin). The input Bin must - * be 128r bytes in length; the output Bout must also be the same size. - * - * S lacks const qualifier to match blockmix_salsa8()'s prototype, which we - * need to refer to both functions via the same function pointers. 
- */ -static void blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, uint64_t * S, size_t r) -{ - size_t r1, r2, i; - - /* Convert 128-byte blocks to (S_P_SIZE * 64-bit) blocks */ - r1 = r * 128 / (S_P_SIZE * 8); - - /* X <-- B_{r1 - 1} */ - blkcpy(Bout, &Bin[(r1 - 1) * S_P_SIZE], S_P_SIZE); - - /* X <-- X \xor B_i */ - blkxor(Bout, Bin, S_P_SIZE); - - /* X <-- H'(X) */ - /* B'_i <-- X */ - block_pwxform(Bout, S); - - /* for i = 0 to r1 - 1 do */ - for (i = 1; i < r1; i++) { - /* X <-- X \xor B_i */ - blkcpy(&Bout[i * S_P_SIZE], &Bout[(i - 1) * S_P_SIZE], - S_P_SIZE); - blkxor(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], S_P_SIZE); - - /* X <-- H'(X) */ - /* B'_i <-- X */ - block_pwxform(&Bout[i * S_P_SIZE], S); - } - - /* Handle partial blocks */ - if (i * S_P_SIZE < r * 16) - blkcpy(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], - r * 16 - i * S_P_SIZE); - - i = (r1 - 1) * S_P_SIZE / 8; - /* Convert 128-byte blocks to 64-byte blocks */ - r2 = r * 2; - - /* B'_i <-- H(B'_i) */ - salsa20_8(&Bout[i * 8]); - i++; - - for (; i < r2; i++) { - /* B'_i <-- H(B'_i \xor B'_{i-1}) */ - blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8); - salsa20_8(&Bout[i * 8]); - } -} - -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static __inline uint64_t -integerify(const uint64_t * B, size_t r) -{ -/* - * Our 64-bit words are in host byte order, and word 6 holds the second 32-bit - * word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also - * in host byte order, as it should be. - */ - const uint64_t * X = &B[(2 * r - 1) * 8]; - uint32_t lo = (uint32_t) X[0]; - uint32_t hi = (uint32_t) (X[6] >> 32); - return ((uint64_t)hi << 32) + lo; -} - -/** - * smix1(B, r, N, flags, V, NROM, shared, XY, S): - * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r + 64 bytes in length. The value N must be even and - * no smaller than 2. - */ -static void -smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, - uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, - uint64_t * XY, uint64_t * S) -{ - void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = - (S ? blockmix_pwxform : blockmix_salsa8); - const uint64_t * VROM = shared->shared1.aligned; - uint32_t VROM_mask = shared->mask1; - size_t s = 16 * r; - uint64_t * X = V; - uint64_t * Y = &XY[s]; - uint64_t * Z = S ? 
S : &XY[2 * s]; - uint64_t n, i, j; - size_t k; - - /* 1: X <-- B */ - /* 3: V_i <-- X */ - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; - salsa20_blk_t *tmp = (salsa20_blk_t *)Y; - salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; - for (k = 0; k < 16; k++) - tmp->w[k] = le32dec(&src->w[k]); - salsa20_simd_shuffle(tmp, dst); - } - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - blockmix(X, Y, Z, r); - blkcpy(&V[s], Y, s); - - X = XY; - - if (NROM && (VROM_mask & 1)) { - if ((1 & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j = integerify(Y, r) & (NROM - 1); - - /* X <-- H(X \xor VROM_j) */ - blkxor(Y, &VROM[j * s], s); - } - - blockmix(Y, X, Z, r); - - /* 2: for i = 0 to N - 1 do */ - for (n = 1, i = 2; i < N; i += 2) { - /* 3: V_i <-- X */ - blkcpy(&V[i * s], X, s); - - if ((i & (i - 1)) == 0) - n <<= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j = integerify(X, r) & (n - 1); - j += i - n; - - /* X <-- X \xor V_j */ - blkxor(X, &V[j * s], s); - - /* 4: X <-- H(X) */ - blockmix(X, Y, Z, r); - - /* 3: V_i <-- X */ - blkcpy(&V[(i + 1) * s], Y, s); - - j = integerify(Y, r); - if (((i + 1) & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j &= NROM - 1; - - /* X <-- H(X \xor VROM_j) */ - blkxor(Y, &VROM[j * s], s); - } else { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i + 1 - n; - - /* X <-- H(X \xor V_j) */ - blkxor(Y, &V[j * s], s); - } - - blockmix(Y, X, Z, r); - } - } else { - yescrypt_flags_t rw = flags & YESCRYPT_RW; - - /* 4: X <-- H(X) */ - blockmix(Y, X, Z, r); - - /* 2: for i = 0 to N - 1 do */ - for (n = 1, i = 2; i < N; i += 2) { - /* 3: V_i <-- X */ - blkcpy(&V[i * s], X, s); - - if (rw) { - if ((i & (i - 1)) == 0) - n <<= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j = integerify(X, r) & (n - 1); - j += i - n; - - /* X <-- X \xor V_j */ - blkxor(X, &V[j * s], s); - } - - /* 4: X <-- H(X) */ - blockmix(X, Y, Z, r); - - /* 3: V_i <-- X */ - blkcpy(&V[(i + 1) * s], Y, s); - - if (rw) { - /* j <-- Wrap(Integerify(X), i) */ - j = integerify(Y, r) & (n - 1); - j += (i + 1) - n; - - /* X <-- X \xor V_j */ - blkxor(Y, &V[j * s], s); - } - - /* 4: X <-- H(X) */ - blockmix(Y, X, Z, r); - } - } - - /* B' <-- X */ - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; - salsa20_blk_t *tmp = (salsa20_blk_t *)Y; - salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; - for (k = 0; k < 16; k++) - le32enc(&tmp->w[k], src->w[k]); - salsa20_simd_unshuffle(tmp, dst); - } -} - -/** - * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r + 64 bytes in length. The value N must be a - * power of 2 greater than 1. The value Nloop must be even. - */ -static void -smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop, - yescrypt_flags_t flags, - uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, - uint64_t * XY, uint64_t * S) -{ - void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = - (S ? blockmix_pwxform : blockmix_salsa8); - const uint64_t * VROM = shared->shared1.aligned; - uint32_t VROM_mask = shared->mask1 | 1; - size_t s = 16 * r; - yescrypt_flags_t rw = flags & YESCRYPT_RW; - uint64_t * X = XY; - uint64_t * Y = &XY[s]; - uint64_t * Z = S ? 
S : &XY[2 * s]; - uint64_t i, j; - size_t k; - - if (Nloop == 0) - return; - - /* X <-- B' */ - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; - salsa20_blk_t *tmp = (salsa20_blk_t *)Y; - salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; - for (k = 0; k < 16; k++) - tmp->w[k] = le32dec(&src->w[k]); - salsa20_simd_shuffle(tmp, dst); - } - - if (NROM) { - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - - /* 8: X <-- H(X \xor V_j) */ - blkxor(X, &V[j * s], s); - /* V_j <-- Xprev \xor V_j */ - if (rw) - blkcpy(&V[j * s], X, s); - blockmix(X, Y, Z, r); - - j = integerify(Y, r); - if (((i + 1) & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j &= NROM - 1; - - /* X <-- H(X \xor VROM_j) */ - blkxor(Y, &VROM[j * s], s); - } else { - /* 7: j <-- Integerify(X) mod N */ - j &= N - 1; - - /* 8: X <-- H(X \xor V_j) */ - blkxor(Y, &V[j * s], s); - /* V_j <-- Xprev \xor V_j */ - if (rw) - blkcpy(&V[j * s], Y, s); - } - - blockmix(Y, X, Z, r); - } - } else { - /* 6: for i = 0 to N - 1 do */ - i = Nloop / 2; - do { - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - - /* 8: X <-- H(X \xor V_j) */ - blkxor(X, &V[j * s], s); - /* V_j <-- Xprev \xor V_j */ - if (rw) - blkcpy(&V[j * s], X, s); - blockmix(X, Y, Z, r); - - /* 7: j <-- Integerify(X) mod N */ - j = integerify(Y, r) & (N - 1); - - /* 8: X <-- H(X \xor V_j) */ - blkxor(Y, &V[j * s], s); - /* V_j <-- Xprev \xor V_j */ - if (rw) - blkcpy(&V[j * s], Y, s); - blockmix(Y, X, Z, r); - } while (--i); - } - - /* 10: B' <-- X */ - for (i = 0; i < 2 * r; i++) { - const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; - salsa20_blk_t *tmp = (salsa20_blk_t *)Y; - salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; - for (k = 0; k < 16; k++) - le32enc(&tmp->w[k], src->w[k]); - salsa20_simd_unshuffle(tmp, dst); - } -} - -/** - * p2floor(x): - * Largest power of 2 not greater than argument. - */ -static uint64_t -p2floor(uint64_t x) -{ - uint64_t y; - while ((y = x & (x - 1))) - x = y; - return x; -} - -/** - * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S): - * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the - * temporary storage V must be 128rN bytes in length; the temporary storage - * XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is - * required with OpenMP-enabled builds). The value N must be a power of 2 - * greater than 1. 
- */ -static void -smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, - yescrypt_flags_t flags, - uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, - uint64_t * XY, uint64_t * S) -{ - size_t s = 16 * r; - uint64_t Nchunk = N / p, Nloop_all, Nloop_rw; - uint32_t i; - - Nloop_all = Nchunk; - if (flags & YESCRYPT_RW) { - if (t <= 1) { - if (t) - Nloop_all *= 2; /* 2/3 */ - Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ - } else { - Nloop_all *= t - 1; - } - } else if (t) { - if (t == 1) - Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ - Nloop_all *= t; - } - - Nloop_rw = 0; - if (flags & __YESCRYPT_INIT_SHARED) - Nloop_rw = Nloop_all; - else if (flags & YESCRYPT_RW) - Nloop_rw = Nloop_all / p; - - Nchunk &= ~(uint64_t)1; /* round down to even */ - Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ - Nloop_rw &= ~(uint64_t)1; /* round down to even */ - -#ifdef _OPENMP -#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw) - { -#pragma omp for -#endif - for (i = 0; i < p; i++) { - uint64_t Vchunk = i * Nchunk; - uint64_t * Bp = &B[i * s]; - uint64_t * Vp = &V[Vchunk * s]; -#ifdef _OPENMP - uint64_t * XYp = &XY[i * (2 * s + 8)]; -#else - uint64_t * XYp = XY; -#endif - uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); - uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; - if (Sp) - smix1(Bp, 1, S_SIZE_ALL / 16, - flags & ~YESCRYPT_PWXFORM, - Sp, NROM, shared, XYp, NULL); - if (!(flags & __YESCRYPT_INIT_SHARED_2)) - smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); - smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, - NROM, shared, XYp, Sp); - } - - if (Nloop_all > Nloop_rw) { -#ifdef _OPENMP -#pragma omp for -#endif - for (i = 0; i < p; i++) { - uint64_t * Bp = &B[i * s]; -#ifdef _OPENMP - uint64_t * XYp = &XY[i * (2 * s + 8)]; -#else - uint64_t * XYp = XY; -#endif - uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; - smix2(Bp, r, N, Nloop_all - Nloop_rw, - flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); - } - } -#ifdef _OPENMP - } -#endif -} - -/** - * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, flags, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen), or a revision of scrypt as requested by flags and shared, and - * write the result into buf. The parameters r, p, and buflen must satisfy - * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power - * of 2 greater than 1. - * - * t controls computation time while not affecting peak memory usage. shared - * and flags may request special modes as described in yescrypt.h. local is - * the thread-local data structure, allowing to preserve and reuse a memory - * allocation across calls, thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. - */ -int -yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, - const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, - uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, - uint8_t * buf, size_t buflen) -{ - yescrypt_region_t tmp; - uint64_t NROM; - size_t B_size, V_size, XY_size, need; - uint64_t * B, * V, * XY, * S; - uint64_t sha256[4]; - - /* - * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, - * so don't let it have side-effects. 
Without this adjustment, it'd - * enable the SHA-256 password pre-hashing and output post-hashing, - * because any deviation from classic scrypt implies those. - */ - if (p == 1) - flags &= ~YESCRYPT_PARALLEL_SMIX; - - /* Sanity-check parameters */ - if (flags & ~YESCRYPT_KNOWN_FLAGS) { - errno = EINVAL; - return -1; - } -#if SIZE_MAX > UINT32_MAX - if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { - errno = EFBIG; - return -1; - } -#endif - if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { - errno = EFBIG; - return -1; - } - if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { - errno = EINVAL; - return -1; - } - if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) { - errno = EINVAL; - return -1; - } -#if S_MIN_R > 1 - if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) { - errno = EINVAL; - return -1; - } -#endif - if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || -#if SIZE_MAX / 256 <= UINT32_MAX - (r > SIZE_MAX / 256) || -#endif - (N > SIZE_MAX / 128 / r)) { - errno = ENOMEM; - return -1; - } - if (N > UINT64_MAX / ((uint64_t)t + 1)) { - errno = EFBIG; - return -1; - } -#ifdef _OPENMP - if (!(flags & YESCRYPT_PARALLEL_SMIX) && - (N > SIZE_MAX / 128 / (r * p))) { - errno = ENOMEM; - return -1; - } -#endif - if ((flags & YESCRYPT_PWXFORM) && -#ifndef _OPENMP - (flags & YESCRYPT_PARALLEL_SMIX) && -#endif - p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) { - errno = ENOMEM; - return -1; - } - - NROM = 0; - if (shared->shared1.aligned) { - NROM = shared->shared1.aligned_size / ((size_t)128 * r); - if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || - !(flags & YESCRYPT_RW)) { - errno = EINVAL; - return -1; - } - } - - /* Allocate memory */ - V = NULL; - V_size = (size_t)128 * r * N; -#ifdef _OPENMP - if (!(flags & YESCRYPT_PARALLEL_SMIX)) - V_size *= p; -#endif - need = V_size; - if (flags & __YESCRYPT_INIT_SHARED) { - if (local->aligned_size < need) { - if (local->base || local->aligned || - local->base_size || local->aligned_size) { - errno = EINVAL; - return -1; - } - if (!alloc_region(local, need)) - return -1; - } - V = (uint64_t *)local->aligned; - need = 0; - } - B_size = (size_t)128 * r * p; - need += B_size; - if (need < B_size) { - errno = ENOMEM; - return -1; - } - XY_size = (size_t)256 * r + 64; -#ifdef _OPENMP - XY_size *= p; -#endif - need += XY_size; - if (need < XY_size) { - errno = ENOMEM; - return -1; - } - if (flags & YESCRYPT_PWXFORM) { - size_t S_size = S_SIZE_ALL * sizeof(*S); -#ifdef _OPENMP - S_size *= p; -#else - if (flags & YESCRYPT_PARALLEL_SMIX) - S_size *= p; -#endif - need += S_size; - if (need < S_size) { - errno = ENOMEM; - return -1; - } - } - if (flags & __YESCRYPT_INIT_SHARED) { - if (!alloc_region(&tmp, need)) - return -1; - B = (uint64_t *)tmp.aligned; - XY = (uint64_t *)((uint8_t *)B + B_size); - } else { - init_region(&tmp); - if (local->aligned_size < need) { - if (free_region(local)) - return -1; - if (!alloc_region(local, need)) - return -1; - } - B = (uint64_t *)local->aligned; - V = (uint64_t *)((uint8_t *)B + B_size); - XY = (uint64_t *)((uint8_t *)V + V_size); - } - S = NULL; - if (flags & YESCRYPT_PWXFORM) - S = (uint64_t *)((uint8_t *)XY + XY_size); - - if (t || flags) { - SHA256_CTX_Y ctx; - SHA256_Init_Y(&ctx); - SHA256_Update_Y(&ctx, passwd, passwdlen); - SHA256_Final_Y((uint8_t *)sha256, &ctx); - passwd = (uint8_t *)sha256; - passwdlen = sizeof(sha256); - } - - /* 1: (B_0 ... 
B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ - PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, - (uint8_t *)B, B_size); - - if (t || flags) - blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); - - if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { - smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); - } else { - uint32_t i; - - /* 2: for i = 0 to p - 1 do */ -#ifdef _OPENMP -#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S) -#endif - for (i = 0; i < p; i++) { - /* 3: B_i <-- MF(B_i, N) */ -#ifdef _OPENMP - smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, - &V[(size_t)16 * r * i * N], - NROM, shared, - &XY[((size_t)32 * r + 8) * i], - S ? &S[S_SIZE_ALL * i] : S); -#else - smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V, - NROM, shared, XY, S); -#endif - } - } - - /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ - PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); - - /* - * Except when computing classic scrypt, allow all computation so far - * to be performed on the client. The final steps below match those of - * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so - * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of - * SCRAM's use of SHA-1) would be usable with yescrypt hashes. - */ - if ((t || flags) && buflen == sizeof(sha256)) { - /* Compute ClientKey */ - { - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init(&ctx, buf, buflen); - HMAC_SHA256_Update(&ctx, salt, saltlen); - HMAC_SHA256_Final((uint8_t *)sha256, &ctx); - } - /* Compute StoredKey */ - { - SHA256_CTX_Y ctx; - SHA256_Init_Y(&ctx); - SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); - SHA256_Final_Y(buf, &ctx); - } - } - - if (free_region(&tmp)) - return -1; - - /* Success! */ - return 0; -} diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c index 0eb36a7..18558a4 100644 --- a/algo/yescrypt/yescrypt.c +++ b/algo/yescrypt/yescrypt.c @@ -426,7 +426,7 @@ int64_t yescryptr16_get_max64() bool register_yescrypt_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT; + gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yescrypt; gate->hash = (void*)&yescrypt_hash; gate->set_target = (void*)&scrypt_set_target; @@ -440,7 +440,7 @@ bool register_yescrypt_algo( algo_gate_t* gate ) bool register_yescryptr16_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX_OPT; + gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yescrypt; gate->hash = (void*)&yescrypt_hash; gate->set_target = (void*)&scrypt_set_target; diff --git a/avxdefs.h b/avxdefs.h index b7b686a..0662519 100644 --- a/avxdefs.h +++ b/avxdefs.h @@ -1,71 +1,96 @@ #ifndef AVXDEFS_H__ #define AVXDEFS_H__ -// Some tools to help using AVX and AVX2 -// At this time SSE2 is sufficient for all 128 bit code in this file. +// Some tools to help using AVX and AVX2. +// At this time SSE2 is sufficient for all 128 bit code in this file +// but could change without notice. // 256 bit requires AVX2. +// AVX512 has more powerful 256 bit instructions but with AVX512 available +// there is little reason to use them. +// Proper alignment of data is required, 16 bytes for 128 bit vectors and +// 32 bytes for 256 bit vectors. 64 byte alignment is recommended for +// best cache alignment. +// +// There exist dupplicates of some functions. In general the first defined +// is preferred as it is more efficient but also more restrictive and may +// not be applicable. 
The less efficient versions are more flexible.

 #include <inttypes.h>
 #include <immintrin.h>
 #include <memory.h>
+#include <stdbool.h>

 //
 // 128 bit utilities and shortcuts
+//
+// Pseudo constants, there are no real vector constants.
+// These can't be used for compile time initialization.
+

 // Constant zero
-#define mm_zero _mm_setzero_si128()
+#define mm_zero      _mm_setzero_si128()
+
+// Constant 1
+#define mm_one_128   _mm_set_epi64x(  0ULL, 1ULL )
+#define mm_one_64    _mm_set1_epi64x( 1ULL )
+#define mm_one_32    _mm_set1_epi32(  1UL )
+#define mm_one_16    _mm_set1_epi16(  1U )

 // Constant minus 1
-#define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFF )
+#define mm_neg1      _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
+
+//
+// Basic operations without equivalent SIMD intrinsic

 // Bitwise not (~x)
-#define mm_not( x ) _mm_xor_si128( (x), mm_neg1 )
+#define mm_not( x )  _mm_xor_si128( (x), mm_neg1 )

 // Unary negation (-a)
 #define mm_negate_64( a ) _mm_sub_epi64( mm_zero, a )
 #define mm_negate_32( a ) _mm_sub_epi32( mm_zero, a )
+#define mm_negate_16( a ) _mm_sub_epi16( mm_zero, a )

 //
-// Bit operations, functional but not very efficient
+// Bit operations

-// Return x with bit n set/clear in all elements
-#define mm_bitset_128( x, n ) \
-   _mm_or_si128( _mm_slli_si128( _mm_set_epi64x( 0ULL, 1ULL ), n ) )
-
-#define mm_bitclr_128( x, n ) \
-   _mm_and_si128( x, mm_not( _mm_slli_si128( \
-                                  _mm_set_epi64x( 0ULL, 1ULL ), n ) ) )
-
-#define mm_bitset_64( x, n ) \
-   _mm_or_si128( _mm_slli_epi64( _mm_set1_epi64x( 1ULL ), n ) )
-
-#define mm_bitclr_64( x, n ) \
-   _mm_and_si128( x, mm_not( _mm_slli_epi64( _mm_set1_epi64x( 1ULL ), n ) ) )
-
-#define mm_bitset_32( x, n ) \
-   _mm_or_si128( _mm_slli_epi32( _mm_set1_epi32( 1UL ), n ) )
-
-#define mm_bitclr_32( x, n ) \
-   _mm_and_si128( x, mm_not( _mm_slli_epi32( _mm_set1_epi32( 1UL ), n ) ) )
-
-#define mm_bitset_16( x, n ) \
-   _mm_or_si128( _mm_slli_epi16( _mm_set1_epi16( 1U ), n ) )
-
-#define mm_bitclr_16( x, n ) \
-   _mm_and_si128( x, mm_not( _mm_slli_epi16( _mm_set1_epi16( 1U ), n ) ) )
-
-// return vector of bool
-#define mm_bittest_128( x, n ) \
-   _mm_and_si256( _mm_srli_si128( x, n ), _mm_set_epi64x( 0ULL, 1ULL ) )

+// Return bit n in position, all other bits zeroed.
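A note on intent (editor's illustration, not part of the patch), using an assumed test value:

   // With 64 bit lanes x = { 0x00F0, 0x000F }, extracting bit 4 is intended
   // to keep only bit 4 of each lane:
   //    mm_bitextract_64( x, 4 )  ->  { 0x0010, 0x0000 }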
+#define mm_bitextract_64 ( x, n ) \ + _mm_and_si128( _mm_set1_epi64x( 1ULL << (n) ), x ) +#define mm_bitextract_32 ( x, n ) \ + _mm_and_si128( _mm_set1_epi32( 1UL << (n) ), x ) +#define mm_bitextract_16 ( x, n ) \ + _mm_and_si128( _mm_set1_epi16( 1U << (n) ), x ) +// Return bit n as bool #define mm_bittest_64( x, n ) \ - _mm_and_si256( _mm_srli_epi64( x, n ), _mm_set1_epi64x( 1ULL ) ) - + _mm_and_si256( mm_one_64, _mm_srli_epi64( x, n ) ) #define mm_bittest_32( x, n ) \ - _mm_and_si256( _mm_srli_epi32( x, n ), _mm_set1_epi32( 1UL ) ) - + _mm_and_si256( mm_one_32, _mm_srli_epi32( x, n ) ) #define mm_bittest_16( x, n ) \ - _mm_and_si256( _mm_srli_epi16( x, n ), _mm_set1_epi16( 1U ) ) + _mm_and_si256( mm_one_16, _mm_srli_epi16( x, n ) ) + +// Return x with bit n set/cleared in all elements +#define mm_bitset_64( x, n ) \ + _mm_or_si128( _mm_slli_epi64( mm_one_64, n ), x ) +#define mm_bitclr_64( x, n ) \ + _mm_andnot_si128( _mm_slli_epi64( mm_one_64, n ), x ) +#define mm_bitset_32( x, n ) \ + _mm_or_si128( _mm_slli_epi32( mm_one_32, n ), x ) +#define mm_bitclr_32( x, n ) \ + _mm_andnot_si128( _mm_slli_epi32( mm_one_32, n ), x ) +#define mm_bitset_16( x, n ) \ + _mm_or_si128( _mm_slli_epi16( mm_one_16, n ), x ) +#define mm_bitclr_16( x, n ) \ + _mm_andnot_si128( _mm_slli_epi16( mm_one_16, n ), x ) + +// Return x with bit n toggled +#define mm_bitflip_64( x, n ) \ + _mm_xor_si128( _mm_slli_epi64( mm_one_64, n ), x ) +#define mm_bitflip_32( x, n ) \ + _mm_xor_si128( _mm_slli_epi32( mm_one_32, n ), x ) +#define mm_bitflip_16( x, n ) \ + _mm_xor_si128( _mm_slli_epi16( mm_one_16, n ), x ) + // // Memory functions @@ -86,13 +111,33 @@ inline void memcpy_128( __m128i *dst, const __m128i *src, int n ) for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } -// Scalar 64 bit copy, n = bytes/8 -inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n ) +// Compare data in memory, return true if different +inline bool memcmp_128( __m128i src1, __m128i src2, int n ) { for ( int i = 0; i < n; i++ ) - dst[i] = src[i]; + if ( src1[i] != src2[i] ) return true; + return false; } +// A couple of 64 bit scalar functions +// n = bytes/8 + +inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n ) +{ + for ( int i = 0; i < n; i++ ) dst[i] = src[i]; +} + +inline void memset_zero_64( uint64_t *src, int n ) +{ + for ( int i = 0; i < n; i++ ) src[i] = 0; +} + +inline void memset_64( uint64_t *dst, uint64_t a, int n ) +{ + for ( int i = 0; i < n; i++ ) dst[i] = a; +} + + // // Pointer cast @@ -108,149 +153,136 @@ inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n ) // returns p[i] #define casti_m128i(p,i) (((__m128i*)(p))[(i)]) +// +// Bit rotations + +// XOP is an obsolete AMD feature that has native rotation. +// _mm_roti_epi64( w, c) +// Never implemented by Intel and since removed from Zen by AMD. 
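A minimal usage sketch (editor's illustration, not part of the patch) for the single-bit helpers above; it assumes this header is included and the target supports SSE2:

   static inline void mm_bit_helpers_demo( void )
   {
      __m128i x = _mm_set1_epi64x( 0x0123456789abcdefULL );
      __m128i a = mm_bitset_64(  x, 63 );   // bit 63 forced on in both lanes
      __m128i b = mm_bitclr_64(  x,  0 );   // bit 0 forced off in both lanes
      __m128i c = mm_bitflip_64( x,  7 );   // bit 7 toggled in both lanes
      __m128i d = mm_not( x );              // bitwise complement via mm_neg1
      (void)a; (void)b; (void)c; (void)d;   // silence unused warnings
   }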
// Rotate bits in vector elements - #define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \ - _mm_slli_epi64( w, 64-c ) ) - + _mm_slli_epi64( w, 64-(c) ) ) #define mm_rotl_64( w, c ) _mm_or_si128( _mm_slli_epi64( w, c ), \ - _mm_srli_epi64( w, 64-c ) ) - + _mm_srli_epi64( w, 64-(c) ) ) #define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \ - _mm_slli_epi32( w, 32-c ) ) - + _mm_slli_epi32( w, 32-(c) ) ) #define mm_rotl_32( w, c ) _mm_or_si128( _mm_slli_epi32( w, c ), \ - _mm_srli_epi32( w, 32-c ) ) - + _mm_srli_epi32( w, 32-(c) ) ) #define mm_rotr_16( w, c ) _mm_or_si128( _mm_srli_epi16( w, c ), \ - _mm_slli_epi16( w, 16-c ) ) - + _mm_slli_epi16( w, 16-(c) ) ) #define mm_rotl_16( w, c ) _mm_or_si128( _mm_slli_epi16( w, c ), \ - _mm_srli_epi16( w, 16-c ) ) + _mm_srli_epi16( w, 16-(c) ) ) // -// Shuffle vector elements +// Rotate elements in vector -// Swap upper and lower 64 bits of 128 bit source vector -#define mm_swap_64(s) _mm_shuffle_epi32( s, 0x4e ) +// Optimized shuffle -// Rotate 128 vector by 1 32 bit element. +// Swap hi/lo 64 bits in 128 bit vector +#define mm_swap_64( w ) _mm_shuffle_epi32( w, 0x4e ) + +// rotate 128 bit vector by 32 bits #define mm_rotr_1x32( w ) _mm_shuffle_epi32( w, 0x39 ) #define mm_rotl_1x32( w ) _mm_shuffle_epi32( w, 0x93 ) -// Shuffle elements across two 128 bit vectors +// Swap hi/lo 32 bits in each 64 bit element +#define mm_swap64_32( x ) _mm_shuffle_epi32( x, 0xb1 ) -// Swap 128 bit source vectors in place. +// Less efficient but more versatile. Use only for odd number rotations. +// Use shuffle above when possible. + +// Rotate vector by n bytes. +#define mm_rotr128_x8( w, n ) \ + _mm_or_si128( _mm_srli_si128( w, n ), _mm_slli_si128( w, 16-(n) ) ) +#define mm_rotl128_x8( w, n ) \ + _mm_or_si128( _mm_slli_si128( w, n ), _mm_srli_si128( w, 16-(n) ) ) + +// Rotate vector by c elements, use only for odd number rotations +#define mm_rotr128_x32( w, c ) mm_rotr128_x8( w, (c)>>2 ) +#define mm_rotl128_x32( w, c ) mm_rotl128_x8( w, (c)>>2 ) +#define mm_rotr128_x16( w, c ) mm_rotr128_x8( w, (c)>>1 ) +#define mm_rotl128_x16( w, c ) mm_rotl128_x8( w, (c)>>1 ) + +// +// Rotate elements across two 128 bit vectors as one 256 bit vector {hi,lo} + +// Swap 128 bit source vectors in place, aka rotate 256 bits by 128 bits. 
// void mm128_swap128( __m128i, __m128i ) -#define mm_swap_128(hi, lo) hi = _mm_xor_si128(hi, lo); \ - lo = _mm_xor_si128(hi, lo); \ - hi = _mm_xor_si128(hi, lo); - -// Rotate two 128 bit vectors in place as one 256 vector by 1 element -#define mm_rotl256_1x64( s0, s1 ) \ -do { \ - __m128i t; \ - s0 = mm_swap_64( s0 ); \ - s1 = mm_swap_64( s1 ); \ - t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \ - s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \ - s0 = t; \ -} while(0) - -#define mm_rotr256_1x64( s0, s1 ) \ -do { \ - __m128i t; \ - s0 = mm_swap_64( s0 ); \ - s1 = mm_swap_64( s1 ); \ - t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \ - s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \ - s0 = t; \ -} while(0) - -#define mm_rotl256_1x32( s0, s1 ) \ -do { \ - __m128i t; \ - s0 = mm_swap_64( s0 ); \ - s1 = mm_swap_64( s1 ); \ - t = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \ - 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ - s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \ - 0ul, 0ul, 0ul, 0xfffffffful )); \ - s0 = t; \ -} while(0) - -#define mm_rotr256_1x32( s0, s1 ) \ -do { \ - __m128i t; \ - s0 = mm_swap_64( s0 ); \ - s1 = mm_swap_64( s1 ); \ - t = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \ - 0ul, 0ul, 0ul, 0xfffffffful )); \ - s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \ - 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ - s0 = t; \ -} while(0) - -// Older slower -#define mm_rotl256_1x64x( s0, s1 ) \ -do { \ - __m128i t; \ - s0 = mm_swap_64( s0 ); \ - s1 = mm_swap_64( s1 ); \ - t = _mm_or_si128( \ - _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ - s1 = _mm_or_si128( \ - _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ - s0 = t; \ -} while(0) - -#define mm_rotr256_1x64x( s0, s1 ) \ -do { \ - __m128i t; \ - s0 = mm_swap_64( s0 ) ; \ - s1 = mm_swap_64( s1 ); \ - t = _mm_or_si128( \ - _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ - s1 = _mm_or_si128( \ - _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ - s0 = t; \ -} while(0) - -// need a better name, not rot, poke? step? -// Return s0 with elements shifted right/left and low/high element from -// s1 shifted into the vacated high/low element of s0. -// Partially rotate elements in two 128 bit vectors as one 256 bit vector -// and return the rotated s0. -// Similar to mm_rotr256_1x32 but only a partial rotation as s1 is not -// completed. It's faster than a full rotation. 
- -inline __m128i mm_rotr256_32( __m128i s0, __m128i s1, int n ) -{ - return _mm_or_si128( _mm_srli_si128( s0, n<<2 ), - _mm_slli_si128( s1, 16 - (n<<2) ) ); +#define mm_swap_128(hi, lo) \ +{ \ + hi = _mm_xor_si128(hi, lo); \ + lo = _mm_xor_si128(hi, lo); \ + hi = _mm_xor_si128(hi, lo); \ } -inline __m128i mm_rotl256_32( __m128i s0, __m128i s1, int n ) +// Rotate two 128 bit vectors in place as one 256 vector by 1 element +#define mm_rotl256_1x64( hi, lo ) \ +do { \ + __m128i t; \ + hi = mm_swap_64( hi ); \ + lo = mm_swap_64( lo ); \ + t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \ + lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \ + hi = t; \ +} while(0) + +#define mm_rotr256_1x64( hi, lo ) \ +do { \ + __m128i t; \ + hi = mm_swap_64( hi ); \ + lo = mm_swap_64( lo ); \ + t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \ + lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \ + hi = t; \ +} while(0) + +#define mm_rotl256_1x32( hi, lo ) \ +do { \ + __m128i t; \ + hi = mm_swap_64( hi ); \ + lo = mm_swap_64( lo ); \ + t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ + 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ + lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ + 0ul, 0ul, 0ul, 0xfffffffful )); \ + hi = t; \ +} while(0) + +#define mm_rotr256_1x32( hi, lo ) \ +do { \ + __m128i t; \ + hi = mm_swap_64( hi ); \ + lo = mm_swap_64( lo ); \ + t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ + 0ul, 0ul, 0ul, 0xfffffffful )); \ + lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ + 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ + hi = t; \ +} while(0) + +// Return hi 128 bits with elements shifted one lane with vacated lane filled +// with data rotated from lo. +// Partially rotate elements in two 128 bit vectors as one 256 bit vector +// and return the rotated high 128 bits. +// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not +// completed. It's faster than a full rotation. 
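An illustration (editor's addition, not part of the patch) of what the partial rotation below returns, with 32 bit lanes written high to low:

   //   hi = { h3, h2, h1, h0 },  lo = { l3, l2, l1, l0 }
   //   mm_rotr256hi_1x32( hi, lo, 1 )  ->  { l0, h3, h2, h1 }
   //   mm_rotr256hi_1x32( hi, lo, 2 )  ->  { l1, l0, h3, h2 }
   // Only the rotated hi half is returned; lo itself is unchanged, which is
   // why this is cheaper than a full 256 bit rotation.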
+ +inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n ) { - return _mm_or_si128( _mm_slli_si128( s0, n<<2 ), - _mm_srli_si128( s1, 16 - (n<<2) ) ); + return _mm_or_si128( _mm_srli_si128( hi, n<<2 ), + _mm_slli_si128( lo, 16 - (n<<2) ) ); +} + +inline __m128i mm_rotl256hi_1x32( __m128i hi, __m128i lo, int n ) +{ + return _mm_or_si128( _mm_slli_si128( hi, n<<2 ), + _mm_srli_si128( lo, 16 - (n<<2) ) ); } // // Swap bytes in vector elements -inline __m128i mm_byteswap_32( __m128i x ) -{ - return _mm_shuffle_epi8( x, _mm_set_epi8( - 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b, - 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) ); -} - inline __m128i mm_byteswap_64( __m128i x ) { return _mm_shuffle_epi8( x, _mm_set_epi8( @@ -258,96 +290,95 @@ inline __m128i mm_byteswap_64( __m128i x ) 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) ); } -// older slower -inline __m128i mm_byteswap_32x( __m128i x ) +inline __m128i mm_byteswap_32( __m128i x ) { - __m128i x1 = _mm_and_si128( x, _mm_set1_epi32( 0x0000ff00 ) ); - __m128i x2 = _mm_and_si128( x, _mm_set1_epi32( 0x00ff0000 ) ); - __m128i x0 = _mm_slli_epi32( x, 24 ); // x0 = x << 24 - x1 = _mm_slli_epi32( x1, 8 ); // x1 = mask(x) << 8 - x2 = _mm_srli_epi32( x2, 8 ); // x2 = mask(x) >> 8 - __m128i x3 = _mm_srli_epi32( x, 24 ); // x3 = x >> 24 - return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) ); + return _mm_shuffle_epi8( x, _mm_set_epi8( + 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b, + 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) ); } -inline __m128i mm_byteswap_64x( __m128i x ) +inline __m128i mm_byteswap_16( __m128i x ) { - x = _mm_or_si128( _mm_srli_epi64( x, 32 ), _mm_slli_epi64( x, 32 )); - - x = _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x, - _mm_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ), - _mm_slli_epi64( _mm_and_si128( x, - _mm_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 )); - - return _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x, - _mm_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ), - _mm_slli_epi64( _mm_and_si128( x, - _mm_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 )); + return _mm_shuffle_epi8( x, _mm_set_epi8( + 0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09, + 0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) ); } +///////////////////////////////////////////////////////////////////// + #if defined (__AVX2__) // // 256 bit utilities and Shortcuts +// +// Pseudo constants, there are no real vector constants. 
+// These can't be used for compile time initialization
+
 // Constant zero
 #define mm256_zero _mm256_setzero_si256()

+// Constant 1
+#define mm256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL )
+#define mm256_one_64  _mm256_set1_epi64x( 1ULL )
+#define mm256_one_32  _mm256_set1_epi32(  1UL )
+#define mm256_one_16  _mm256_set1_epi16(  1U )
+
 // Constant minus 1
-#define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFF )
+#define mm256_neg1    _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
+
+//
+// Basic operations without SIMD equivalent

 // Bitwise not ( ~x )
-#define mm256_not( x ) _mm256_xor_si256( (x), mm256_neg1 ) \
+#define mm256_not( x ) _mm256_xor_si256( (x), mm256_neg1 )

 // Unary negation ( -a )
 #define mm256_negate_64( a ) _mm256_sub_epi64( mm256_zero, a )
 #define mm256_negate_32( a ) _mm256_sub_epi32( mm256_zero, a )
+#define mm256_negate_16( a ) _mm256_sub_epi16( mm256_zero, a )

 //
 // Bit operations

-// Return x with bit n set/clear in all elements
-#define mm256_bitset_128( x, n ) \
-   _mm256_or_si256( _mm256_slli_si256( _mm256_set_m128i( 1U, 1U ), n ) )
-
-#define mm256_bitclr_128( x, n ) \
-   _mm256_and_si256( x, mm256_not( \
-                        _mm256_slli_si256( _mm256_set_m128i( 1U, 1U ), n ) ) )
-
-#define mm256_bitset_64( x, n ) \
-   _mm256_or_si256( x, _mm256_set1_epi64x( 1ULL << n ) )
-
-#define mm256_bitclr_64( x, n ) \
-   _mm256_and_si256( x, mm256_not( _mm256_set1_epi64x( 1ULL << n ) ) )
-
-#define mm256_bitset_32( x, n ) \
-   _mm256_or_si256( x, _mm256_set1_epi32( 1UL << n ) )
-
-#define mm256_bitclr_32( x, n ) \
-   _mm256_and_si256( x, mm256_not( _mm256_set1_epi32( 1UL << n ) ) )
-
-#define mm256_bitset_16( x, n ) \
-   _mm256_or_si256( x, _mm256_set1_epi16( 1U << n ) )
-
-#define mm256_bitclr_16( x, n ) \
-   _mm256_and_si256( x, mm256_not( _mm256_set1_epi16( 1U << n ) ) )
-
-// return vector of bool
-#define mm256_bittest_128( x, n ) \
-   _mm256_and_si256( _mm256_srli_si256( x, n ), \
-                     _mm256_set_m128i( _mm_set_epi64x( 0ULL, 1ULL ) ) )

+// Return bit n in position, all other bits cleared
+#define mm256_bitextract_64( x, n ) \
+   _mm256_and_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
+#define mm256_bitextract_32( x, n ) \
+   _mm256_and_si256( _mm256_set1_epi32( 1UL << (n) ), x )
+#define mm256_bitextract_16( x, n ) \
+   _mm256_and_si256( _mm256_set1_epi16( 1U << (n) ), x )

+// Return bit n as bool (bit 0)
 #define mm256_bittest_64( x, n ) \
-   _mm256_and_si256( _mm256_srli_epi64( x, n ), \
-                     _mm256_set1_epi64x( 1ULL << n ) )
-
+   _mm256_and_si256( mm256_one_64, _mm256_srli_epi64( x, n ) )
 #define mm256_bittest_32( x, n ) \
-   _mm256_and_si256( _mm256_srli_epi32( x, n ), \
-                     _mm256_set1_epi32( 1UL << n ) )
-
+   _mm256_and_si256( mm256_one_32, _mm256_srli_epi32( x, n ) )
 #define mm256_bittest_16( x, n ) \
-   _mm256_and_si256( _mm256_srli_epi16( x, n ), \
-                     _mm256_set1_epi16( 1U << n ) )
+   _mm256_and_si256( mm256_one_16, _mm256_srli_epi16( x, n ) )
+
+// Return x with bit n set/cleared in all elements
+#define mm256_bitset_64( x, n ) \
+   _mm256_or_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
+#define mm256_bitclr_64( x, n ) \
+   _mm256_andnot_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
+#define mm256_bitset_32( x, n ) \
+   _mm256_or_si256( _mm256_set1_epi32( 1UL << (n) ), x )
+#define mm256_bitclr_32( x, n ) \
+   _mm256_andnot_si256( _mm256_set1_epi32( 1UL << (n) ), x )
+#define mm256_bitset_16( x, n ) \
+   _mm256_or_si256( _mm256_set1_epi16( 1U << (n) ), x )
+#define mm256_bitclr_16( x, n ) \
+   _mm256_andnot_si256( _mm256_set1_epi16( 1U << (n) ), x )
+
+// Return x with bit n toggled
+#define mm256_bitflip_64( x, n ) \
+   _mm256_xor_si256( _mm256_slli_epi64( mm256_one_64, n ), x )
+#define mm256_bitflip_32( x, n ) \
+   _mm256_xor_si256( _mm256_slli_epi32( mm256_one_32, n ), x )
+#define mm256_bitflip_16( x, n ) \
+   _mm256_xor_si256( _mm256_slli_epi16( mm256_one_16, n ), x )
+

 //
 // Memory functions
@@ -368,6 +399,14 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
    for ( int i = 0; i < n; i ++ )
       dst[i] = src[i];
 }

+// Compare data in memory, return true if different
+inline bool memcmp_256( __m256i src1, __m256i src2, int n )
+{
+   for ( int i = 0; i < n; i++ )
+     if ( src1[i] != src2[i] ) return true;
+   return false;
+}
+
 //
 // Pointer casting
@@ -383,39 +422,128 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )

 // returns p[i]
 #define casti_m256i(p,i) (((__m256i*)(p))[(i)])

+//
+// Bit rotations
+
 //
 // Rotate bits in vector elements
+// w = packed data, c = number of bits to rotate

-// Rotate bits in 64 bit elements
-// w = packed 64 bit data, c = number of bits to rotate
 #define mm256_rotr_64( w, c ) \
-   _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )
-
+   _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64-(c)) )
 #define mm256_rotl_64( w, c ) \
-   _mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) )
-
-// Rotate bits in 32 bit elements
+   _mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64-(c)) )
 #define mm256_rotr_32( w, c ) \
-   _mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32 - c) )
-
+   _mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32-(c)) )
 #define mm256_rotl_32( w, c ) \
-   _mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32 - c) )
+   _mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32-(c)) )
+#define mm256_rotr_16( w, c ) \
+   _mm256_or_si256( _mm256_srli_epi16(w, c), _mm256_slli_epi16(w, 16-(c)) )
+#define mm256_rotl_16( w, c ) \
+   _mm256_or_si256( _mm256_slli_epi16(w, c), _mm256_srli_epi16(w, 16-(c)) )

+//
 // Rotate elements in vector
+// There is no full vector permute for elements smaller than 64 bits and no
+// 256 bit shift, so a little more work is needed.

-// Swap 128 bit elements (aka rotate by two 64 bit, four 32 bit elements))
-// Identical functionality but "f" is AVX and "x" iis AVX2, likely faster.
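A small self-check (editor's sketch, not part of the patch) for the rotate-bits macros above; it assumes AVX2 and that this header is included:

   static inline __m256i mm256_rot_demo( __m256i x )
   {
      // Rotating each 64 bit lane right by 8 bits and then left by 8 bits
      // must return the original value for any input.
      return mm256_rotl_64( mm256_rotr_64( x, 8 ), 8 );
   }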
-#define mm256_swap_128( w ) _mm256_permute2x128_si256( w, w, 1 ) -//#define mm256_swap_128( w ) _mm256_permute2f128_si256( w, w, 1 ) +// Optimized 64 bit permutations +// Swap 128, aka rotate 2x64, 4x32, 8x16, 16x8 +#define mm256_swap_128( w ) _mm256_permute4x64_epi64( w, 0x4e ) +//#define mm256_swap_128( w ) _mm256_permute2x128_si256( w, w, 1 ) -// Rotate vector by one 64 bit element (aka two 32 bit elements) -//__m256i mm256_rotl256_1x64( _mm256i, int ) +// Rotate 256 bit vector by one 64 bit element, aka 2x32, 4x16, 8x8 #define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 ) #define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 ) -// Rotate by one 32 bit element (aka two 16 bit elements) -#define mm256_rotl256_1x32( w ) _mm256_shuffle_epi32( w, 0x93 ) -#define mm256_rotr256_1x32( w ) _mm256_shuffle_epi32( w, 0x39 ) +// Swap hi/lo 64 bits in each 128 bit element +#define mm256_swap128_64( x ) _mm256_shuffle_epi32( x, 0x4e ) + +// Rotate 128 bit elements by 32 bits +#define mm256_rotr128_1x32( x ) _mm256_shuffle_epi32( x, 0x39 ) +#define mm256_rotl128_1x32( x ) _mm256_shuffle_epi32( x, 0x93 ) + +// Swap hi/lo 32 bits in each 64 bit element +#define mm256_swap64_32( x ) _mm256_shuffle_epi32( x, 0xb1 ) + +// Less efficient but more versatile. Use only for rotations that are not +// integrals of 64 bits. Use permutations above when possible. + +// Rotate 256 bit vector by c bytes. +#define mm256_rotr256_x8( w, c ) \ + _mm256_or_si256( _mm256_srli_si256( w, c ), \ + mm256_swap_128( _mm256i_slli_si256( w, 32-(c) ) ) ) +#define mm256_rotl256_x8( w, c ) \ + _mm256_or_si256( _mm256_slli_si256( w, c ), \ + mm256_swap_128( _mm256i_srli_si256( w, 32-(c) ) ) ) + +// Rotate 256 bit vector by c elements, use only for odd value rotations +#define mm256_rotr256_x32( w, c ) mm256_rotr256_x8( w, (c)>>2 ) +#define mm256_rotl256_x32( w, c ) mm256_rotl256_x8( w, (c)>>2 ) +#define mm256_rotr256_x16( w, c ) mm256_rotr256_x8( w, (c)>>1 ) +#define mm256_rotl256_x16( w, c ) mm256_rotl256_x8( w, (c)>>1 ) + +// +// Rotate two 256 bit vectors as one 512 bit vector + +// Fast but limited to 128 bit granularity +#define mm256_swap512_256(a, b) _mm256_permute2x128_si256( a, b, 0x1032 ) +#define mm256_rotr512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x0321 ) +#define mm256_rotl512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x2103 ) + +// Much slower, for 64 and 32 bit granularity +#define mm256_rotr512_1x64(a, b) \ +do { \ + __m256i t; \ + t = _mm256_or_si256( _mm256_srli_si256(a,8), _mm256_slli_si256(b,24) ); \ + b = _mm256_or_si256( _mm256_srli_si256(b,8), _mm256_slli_si256(a,24) ); \ + a = t; \ +while (0); + +#define mm256_rotl512_1x64(a, b) \ +do { \ + __m256i t; \ + t = _mm256_or_si256( _mm256_slli_si256(a,8), _mm256_srli_si256(b,24) ); \ + b = _mm256_or_si256( _mm256_slli_si256(b,8), _mm256_srli_si256(a,24) ); \ + a = t; \ +while (0); + +#define mm256_rotr512_1x32(a, b) \ +do { \ + __m256i t; \ + t = _mm256_or_si256( _mm256_srli_si256(a,4), _mm256_slli_si256(b,28) ); \ + b = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a,28) ); \ + a = t; \ +while (0); + +#define mm256_rotl512_1x32(a, b) \ +do { \ + __m256i t; \ + t = _mm256_or_si256( _mm256_slli_si256(a,4), _mm256_srli_si256(b,28) ); \ + b = _mm256_or_si256( _mm256_slli_si256(b,4), _mm256_srli_si256(a,28) ); \ + a = t; \ +while (0); + +// Byte granularity but even a bit slower +#define mm256_rotr512_x8( a, b, n ) \ +do { \ + __m256i t; \ + t = _mm256_or_si256( _mm256_srli_epi64( a, n ), \ + _mm256_slli_epi64( b, ( 32 - 
(n) ) ) ); \ + b = _mm256_or_si256( _mm256_srli_epi64( b, n ), \ + _mm256_slli_epi64( a, ( 32 - (n) ) ) ); \ + a = t; \ +while (0); + +#define mm256_rotl512_x8( a, b, n ) \ +do { \ + __m256i t; \ + t = _mm256_or_si256( _mm256_slli_epi64( a, n ), \ + _mm256_srli_epi64( b, ( 32 - (n) ) ) ); \ + b = _mm256_or_si256( _mm256_slli_epi64( b, n ), \ + _mm256_srli_epi64( a, ( 32 - (n) ) ) ); \ + a = t; \ +while (0); // // Swap bytes in vector elements @@ -438,47 +566,30 @@ inline __m256i mm256_byteswap_32( __m256i x ) 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) ); } -// older, slower -inline __m256i mm256_byteswap_32x( __m256i x ) +inline __m256i mm256_byteswap_16( __m256i x ) { - __m256i x1 = _mm256_and_si256( x, _mm256_set1_epi32( 0x0000ff00 ) ); - __m256i x2 = _mm256_and_si256( x, _mm256_set1_epi32( 0x00ff0000 ) ); - __m256i x0 = _mm256_slli_epi32( x, 24 ); // x0 = x << 24 - x1 = _mm256_slli_epi32( x1, 8 ); // x1 = mask1(x) << 8 - x2 = _mm256_srli_epi32( x2, 8 ); // x2 = mask2(x) >> 8 - __m256i x3 = _mm256_srli_epi32( x, 24 ); // x3 = x >> 24 - return _mm256_or_si256( _mm256_or_si256( x0, x1 ), - _mm256_or_si256( x2, x3 ) ); -} - -inline __m256i mm256_byteswap_64x( __m256i x ) -{ - x = _mm256_or_si256( _mm256_srli_epi64( x, 32 ), _mm256_slli_epi64( x, 32 )); - - x = _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x, - _mm256_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ), - _mm256_slli_epi64( _mm256_and_si256( x, - _mm256_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 )); - - return _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x, - _mm256_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ), - _mm256_slli_epi64( _mm256_and_si256( x, - _mm256_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 )); + return _mm256_shuffle_epi8( x, _mm256_set_epi8( + 0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09, + 0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01, + 0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09, + 0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) ); } // Pack/Unpack two 128 bit vectors into/from one 256 bit vector // usefulness tbd +// __m128i hi, __m128i lo, returns __m256i #define mm256_pack_2x128( hi, lo ) \ _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) \ +// __m128i hi, __m128i lo, __m256i src #define mm256_unpack_2x128( hi, lo, src ) \ lo = _mm256_castsi256_si128( src ); \ - hi = _mm256_castsi256_si128( mm256_swap_128( src ) ); + hi = _mm256_castsi256_si128( mm256_swap_128( src ) ); +// hi = _mm256_extracti128_si256( src, 1 ); // Pseudo parallel AES // Probably noticeably slower than using pure 128 bit vectors -// More efficient if one key for both lanes. inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k ) { __m128i hi, lo, khi, klo; @@ -487,7 +598,6 @@ inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k ) mm256_unpack_2x128( khi, klo, k ); lo = _mm_aesenc_si128( lo, klo ); hi = _mm_aesenc_si128( hi, khi ); - return mm256_pack_2x128( hi, lo ); } @@ -498,7 +608,6 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x ) mm256_unpack_2x128( hi, lo, x ); lo = _mm_aesenc_si128( lo, mm_zero ); hi = _mm_aesenc_si128( hi, mm_zero ); - return mm256_pack_2x128( hi, lo ); } @@ -533,8 +642,6 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x ) // interleave 4 arrays of 32 bit elements for 128 bit processing // bit_len must be 256, 512 or 640 bits. -// Vector indexing doesn't work with 32 bit data. -// There's no vector indexing here!!! 
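Editor's illustration (not part of the patch) of the layout produced by the 4x32 interleave below, assuming 256 bit (eight word) inputs per lane:

   //   src0 = a0 a1 ... a7   src1 = b0 ... b7   src2 = c0 ... c7   src3 = d0 ... d7
   //   dst  = a0 b0 c0 d0  a1 b1 c1 d1  ...  a7 b7 c7 d7
   // so each __m128i in dst holds the same 32 bit word from all four lanes.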
inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1, const void *src2, const void *src3, int bit_len ) { @@ -591,8 +698,6 @@ inline void mm_interleave_4x32x( void *dst, void *src0, void *src1, } } -// doesn't work with 32 bit elements -// no vector indexing here? inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, int bit_len ) { @@ -632,7 +737,6 @@ inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, d3[4] = _mm_set_epi32( s[79], s[75], s[71], s[67] ); } - // deinterleave 4 arrays into individual buffers for scalarm processing // bit_len must be multiple of 32 inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2, @@ -656,7 +760,7 @@ inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2, #if defined (__AVX2__) // Interleave 4 source buffers containing 64 bit data into the destination -// buffer +// buffer. Only bit_len 256, 512, 640 & 1024 are supported. inline void mm256_interleave_4x64( void *dst, const void *src0, const void *src1, const void *src2, const void *src3, int bit_len ) { @@ -682,6 +786,17 @@ inline void mm256_interleave_4x64( void *dst, const void *src0, d[8] = _mm256_set_epi64x( s3[8], s2[8], s1[8], s0[8] ); d[9] = _mm256_set_epi64x( s3[9], s2[9], s1[9], s0[9] ); + + if ( bit_len <= 640 ) return; + + d[10] = _mm256_set_epi64x( s3[10], s2[10], s1[10], s0[10] ); + d[11] = _mm256_set_epi64x( s3[11], s2[11], s1[11], s0[11] ); + + d[12] = _mm256_set_epi64x( s3[12], s2[12], s1[12], s0[12] ); + d[13] = _mm256_set_epi64x( s3[13], s2[13], s1[13], s0[13] ); + d[14] = _mm256_set_epi64x( s3[14], s2[14], s1[14], s0[14] ); + d[15] = _mm256_set_epi64x( s3[15], s2[15], s1[15], s0[15] ); + // bit_len == 1024 } // Slower version @@ -705,7 +820,7 @@ inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1, } // Deinterleave 4 buffers of 64 bit data from the source buffer. -// bit_len must be 256, 512 or 640 bits. +// bit_len must be 256, 512, 640 or 1024 bits. // Requires overrun padding for 640 bit len. 
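A hypothetical usage sketch (editor's addition, not part of the patch) for the 4x64 interleave pair; the buffer names and the 512 bit length are assumptions:

   //   uint64_t h0[8], h1[8], h2[8], h3[8];               // 512 bits per lane
   //   uint64_t vhash[32] __attribute__ ((aligned (32)));  // 4 lanes interleaved
   //   mm256_interleave_4x64( vhash, h0, h1, h2, h3, 512 );
   //   ... 4 way hashing operates on vhash ...
   //   mm256_deinterleave_4x64( h0, h1, h2, h3, vhash, 512 );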
inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, int bit_len ) @@ -730,11 +845,26 @@ inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2, if ( bit_len <= 512 ) return; - // null change to overrun area - d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[36], s[32] ); - d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[37], s[33] ); - d2[2] = _mm256_set_epi64x( d2[2][3], d2[2][2], s[38], s[34] ); - d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] ); + if ( bit_len <= 640 ) + { + // null change to overrun area + d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[36], s[32] ); + d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[37], s[33] ); + d2[2] = _mm256_set_epi64x( d2[2][3], d2[2][2], s[38], s[34] ); + d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] ); + return; + } + + d0[2] = _mm256_set_epi64x( s[44], s[40], s[36], s[32] ); + d1[2] = _mm256_set_epi64x( s[45], s[41], s[37], s[33] ); + d2[2] = _mm256_set_epi64x( s[46], s[42], s[38], s[34] ); + d3[2] = _mm256_set_epi64x( s[47], s[43], s[39], s[35] ); + + d0[3] = _mm256_set_epi64x( s[60], s[56], s[52], s[48] ); + d1[3] = _mm256_set_epi64x( s[61], s[57], s[53], s[49] ); + d2[3] = _mm256_set_epi64x( s[62], s[58], s[54], s[50] ); + d3[3] = _mm256_set_epi64x( s[63], s[59], s[55], s[51] ); + // bit_len == 1024 } // Slower version @@ -785,9 +915,9 @@ inline void mm256_interleave_8x32( void *dst, const void *src0, s3[4], s2[4], s1[4], s0[4] ); d[ 5] = _mm256_set_epi32( s7[5], s6[5], s5[5], s4[5], s3[5], s2[5], s1[5], s0[5] ); - d [6] = _mm256_set_epi32( s7[6], s6[6], s5[6], s4[6], + d[ 6] = _mm256_set_epi32( s7[6], s6[6], s5[6], s4[6], s3[6], s2[6], s1[6], s0[6] ); - d [7] = _mm256_set_epi32( s7[7], s6[7], s5[7], s4[7], + d[ 7] = _mm256_set_epi32( s7[7], s6[7], s5[7], s4[7], s3[7], s2[7], s1[7], s0[7] ); if ( bit_len <= 256 ) return; @@ -904,22 +1034,22 @@ inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2, d = ((uint32_t*)d1) + 8; d1[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4), s[153], s[145], s[137], s[129] ); - d = ((uint32_t*)d1) + 8; + d = ((uint32_t*)d2) + 8; d2[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4), s[154], s[146], s[138], s[130]); - d = ((uint32_t*)d1) + 8; + d = ((uint32_t*)d3) + 8; d3[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4), s[155], s[147], s[139], s[131] ); - d = ((uint32_t*)d1) + 8; + d = ((uint32_t*)d4) + 8; d4[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4), s[156], s[148], s[140], s[132] ); - d = ((uint32_t*)d1) + 8; + d = ((uint32_t*)d5) + 8; d5[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4), s[157], s[149], s[141], s[133] ); - d = ((uint32_t*)d1) + 8; + d = ((uint32_t*)d6) + 8; d6[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4), s[158], s[150], s[142], s[134] ); - d = ((uint32_t*)d1) + 8; + d = ((uint32_t*)d7) + 8; d7[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4), s[159], s[151], s[143], s[135] ); } diff --git a/build-4way.sh b/build-4way.sh index fda0c32..35dabfa 100755 --- a/build-4way.sh +++ b/build-4way.sh @@ -1,10 +1,5 @@ #!/bin/bash -#if [ "$OS" = "Windows_NT" ]; then -# ./mingw64.sh -# exit 0 -#fi - # Linux build make distclean || echo clean @@ -12,14 +7,8 @@ make distclean || echo clean rm -f config.status ./autogen.sh || echo done -# Ubuntu 10.04 (gcc 4.4) -# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks 
-falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16" - -# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+) -#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores" - -CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl -#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl +#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl --with-crypto=$HOME/usr +CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl make -j 4 diff --git a/build-allarch.sh b/build-allarch.sh index 296247c..eb1c16e 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -3,7 +3,7 @@ make distclean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA -DFOUR_WAY" ./configure --with-curl +CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure --with-curl make -j 4 strip -s cpuminer.exe mv cpuminer.exe cpuminer-4way.exe @@ -13,7 +13,7 @@ mv cpuminer cpuminer-4way make clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" ./configure --with-curl +CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl make -j 4 strip -s cpuminer.exe mv cpuminer.exe cpuminer-aes-avx2.exe @@ -23,7 +23,7 @@ mv cpuminer cpuminer-aes-avx2 make clean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" ./configure --with-curl +CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl make -j 4 strip -s cpuminer.exe mv cpuminer.exe cpuminer-aes-avx.exe @@ -33,7 +33,7 @@ mv cpuminer cpuminer-aes-avx make clean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" ./configure --with-curl +CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl make -j 4 strip -s cpuminer.exe mv cpuminer.exe cpuminer-aes-sse42.exe @@ -43,7 +43,7 @@ mv cpuminer cpuminer-aes-sse42 make clean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" ./configure --with-curl +CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl make -j 4 strip -s cpuminer.exe mv cpuminer.exe cpuminer-sse42.exe @@ -53,7 +53,7 @@ mv cpuminer cpuminer-sse42 make clean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" ./configure --with-curl +CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl make -j 4 strip -s cpuminer.exe mv cpuminer.exe cpuminer-sse2.exe diff --git a/configure b/configure index 6795280..08d4cf4 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.7. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.8. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.7.7' -PACKAGE_STRING='cpuminer-opt 3.7.7' +PACKAGE_VERSION='3.7.8' +PACKAGE_STRING='cpuminer-opt 3.7.8' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.7.7 to adapt to many kinds of systems. 
+\`configure' configures cpuminer-opt 3.7.8 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1392,7 +1392,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.7.7:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.7.8:";; esac cat <<\_ACEOF @@ -1497,7 +1497,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.7.7 +cpuminer-opt configure 3.7.8 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.7.7, which was +It was created by cpuminer-opt $as_me 3.7.8, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2981,7 +2981,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.7.7' + VERSION='3.7.8' cat >>confdefs.h <<_ACEOF @@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.7.7, which was +This file was extended by cpuminer-opt $as_me 3.7.8, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6743,7 +6743,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.7.7 +cpuminer-opt config.status 3.7.8 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index a6c2c10..7dd0487 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.7.7]) +AC_INIT([cpuminer-opt], [3.7.8]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/miner.h b/miner.h index ddf9e11..8fc29f3 100644 --- a/miner.h +++ b/miner.h @@ -358,8 +358,8 @@ struct work { char *job_id; size_t xnonce2_len; unsigned char *xnonce2; - uint32_t nonces[4]; - bool nfound[4]; + uint32_t nonces[8]; + bool nfound[8]; }; struct stratum_job { diff --git a/winbuild-cross.sh b/winbuild-cross.sh new file mode 100755 index 0000000..762d6ee --- /dev/null +++ b/winbuild-cross.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +LOCAL_LIB="$HOME/usr/lib" + +export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" + +F="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32" + +sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac + +mkdir release +cp README.txt release/ +cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/ +cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/ +cp /usr/lib/gcc/x86_64-w64-mingw32/5.3-win32/libstdc++-6.dll release/ +cp /usr/lib/gcc/x86_64-w64-mingw32/5.3-win32/libgcc_s_seh-1.dll release/ +cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/ +cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ + +make distclean || echo clean +rm -f config.status +./autogen.sh || echo done +CFLAGS="-O3 -march=core-avx2 -msha -Wall -DFOUR_WAY" ./configure $F +make +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-4way-sha.exe + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure $F +make +mv cpuminer.exe 
release/cpuminer-4way.exe + +CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $F +make +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-avx-sha.exe + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $F +make +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-aes-avx2.exe + +#make clean || echo clean +#rm -f config.status +#CFLAGS="-O3 -march=znver1 -Wall" ./configure $F +#make -j +#strip -s cpuminer.exe +#mv cpuminer.exe release/cpuminer-aes-sha.exe + + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $F +make +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-aes-avx.exe + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $F +make +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-aes-sse42.exe + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=corei7 -Wall" ./configure $F +make +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-sse42.exe + +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=core2 -Wall" ./configure $F +make +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-sse2.exe +make clean || echo clean +