Jay D Dee
2017-12-30 19:19:46 -05:00
parent 79164c24b5
commit 2d2e54f001
66 changed files with 4321 additions and 1475 deletions


@@ -79,7 +79,6 @@ cpuminer_SOURCES = \
algo/heavy/sph_hefty1.c \
algo/heavy/heavy.c \
algo/heavy/bastion.c \
algo/hmq1725.c \
algo/hodl/aes.c \
algo/hodl/hodl-gate.c \
algo/hodl/hodl-wolf.c \
@@ -110,7 +109,7 @@ cpuminer_SOURCES = \
algo/lyra2/lyra2z330.c \
algo/lyra2/lyra2h.c \
algo/m7m.c \
algo/neoscrypt.c \
algo/neoscrypt/neoscrypt.c \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
@@ -159,16 +158,36 @@ cpuminer_SOURCES = \
algo/whirlpool/whirlpoolx.c \
algo/x11/x11-gate.c \
algo/x11/x11.c \
algo/x11/x11evo.c \
algo/x11/x11-4way.c \
algo/x11/x11gost-gate.c \
algo/x11/x11gost.c \
algo/x11/x11gost-4way.c \
algo/x11/c11-gate.c \
algo/x11/c11.c \
algo/x11/phi1612.c \
algo/x11/c11-4way.c \
algo/x11/x11evo.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
algo/x13/x13-4way.c \
algo/x13/x13sm3-gate.c \
algo/x13/x13sm3.c \
algo/x13/x13sm3-4way.c \
algo/x13/phi1612-gate.c \
algo/x13/phi1612.c \
algo/x13/phi1612-4way.c \
algo/x14/x14-gate.c \
algo/x14/x14.c \
algo/x14/x14-4way.c \
algo/x15/x15-gate.c \
algo/x15/x15.c \
algo/x15/x15-4way.c \
algo/x17/x17-gate.c \
algo/x17/x17.c \
algo/xevan.c \
algo/x17/x17-4way.c \
algo/x17/xevan-gate.c \
algo/x17/xevan.c \
algo/x17/xevan-4way.c \
algo/x17/hmq1725.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c\
algo/yescrypt/yescrypt-simd.c\


@@ -96,13 +96,16 @@ algorithms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu- and Fedora-based distributions, including Mint and
CentOS, are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
3. Stratum pool, cpuminer-opt only supports stratum mining. Some algos
may work wallet mining but there are no guarantees.
macOS (OS X) is not supported.
3. Stratum pool. Some algos may work with wallet mining using getwork.
Errata
------


@@ -17,17 +17,21 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Exe name Compile opts Arch name
Exe name Compile flags Arch name
cpuminer-sse2.exe -march=core2 Core2
cpuminer-sse42.exe -march=corei7 Nehalem
cpuminer-aes-sse42.exe -maes -msse4.2" Westmere
cpuminer-aes-avx.exe -march=corei7-avx" Sandybridge, Ivybridge
cpuminer-aes-avx2.exe "-march=core-avx2" Haswell, Broadwell, Skylake, Kabylake
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY"
cpuminer-sse2.exe "-march=core2" Core2
cpuminer-sse42.exe "-march=corei7" Nehalem
cpuminer-aes-sse42.exe "-maes -msse4.2" Westmere
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx-sha.exe "-march=corei7-avx -msha" Ryzen...
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY" same as avx2
cpuminer-4way-sha.exe "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha
4way requires a CPU with AES and AVX2. It is still under development and
only a few algos are supported. See change log in RELEASE_NOTES in source
package for supported algos.
There is no binary support available for SHA on AMD Ryzen CPUs.
Ryzen CPUs perform better with AVX than with AVX2, therefore an avx-sha
build is provided. Four way still uses AVX2.


@@ -27,8 +27,9 @@ Compile Instructions
Requirements:
Intel Core2 or newer, or AMD Steamroller or newer CPU.
64 bit Linux or Windows operating system.
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple is not supported.
Building on linux prerequisites:
@@ -164,6 +165,10 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.7.8
Partial 4way optimization for most X algos including c11, xevan, phi, hsr
v3.7.7
Fixed regression caused by 64 CPU support.
@@ -182,7 +187,7 @@ New algo keccakc for Creative coin with 4way optimizations
Rewrote some AVX/AVX2 code for more consistent implementation and some
optimizing.
Enhanced capabilities check to support 4way, mor eprecise reporting of
Enhanced capabilities check to support 4way, more precise reporting of
features (not all algos use SSE2), and better error messages when using
an incompatible pre-built version (Windows users).


@@ -211,7 +211,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
case ALGO_X11: register_x11_algo ( gate ); break;
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
case ALGO_X11GOST: register_sib_algo ( gate ); break;
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;


@@ -849,9 +849,9 @@ blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set_epi32( iv[i], iv[i], iv[i], iv[i] );
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set_epi32( salt[i], salt[i], salt[i], salt[i] );
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
@@ -941,10 +941,9 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
// memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
_mm_set1_epi32( 0x01000000 ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
@@ -955,10 +954,9 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
sc->T1 = SPH_C32(0xFFFFFFFF);
memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
u.buf[52>>2] = _mm_set1_epi32( 0x01000000 );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 );
}
out = (__m128i*)dst;
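For reference, a minimal standalone sketch of the broadcast idiom this hunk switches to (illustrative only, not part of the commit):

#include <stdint.h>
#include <emmintrin.h>   // SSE2

// _mm_set1_epi32(v) broadcasts one 32-bit value into all four lanes,
// producing the same result as _mm_set_epi32(v, v, v, v) but with no
// opportunity for one lane's constant to be mistyped.
static inline __m128i broadcast32( uint32_t v )
{
    return _mm_set1_epi32( (int)v );
}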


@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#if defined(HASH_4WAY) && defined(__AES__)
#define NIST5_4WAY
#endif


@@ -36,15 +36,15 @@ void sha256t_hash(void* output, const void* input, uint32_t len)
memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
SHA256_Update( &ctx_sha256, input + midlen, tail );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
#else
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
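The casts added above match OpenSSL's prototype, where SHA256_Final() writes through an unsigned char *. A minimal sketch of the pattern, assuming the standard <openssl/sha.h> API (the helper name is illustrative):

#include <stdint.h>
#include <stddef.h>
#include <openssl/sha.h>

// Hashing into a uint32_t[8] buffer is convenient for the word-level
// target tests used by the miner; the explicit cast silences the
// pointer-type mismatch without changing the memory layout.
static void sha256_words( uint32_t out[8], const void *data, size_t len )
{
    SHA256_CTX ctx;
    SHA256_Init( &ctx );
    SHA256_Update( &ctx, data, len );
    SHA256_Final( (unsigned char*)out, &ctx );
}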


@@ -267,9 +267,6 @@ c512(sph_shavite_big_context *sc, const void *msg)
#else
/*
* This function assumes that "msg" is aligned for 32-bit access.
*/
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
@@ -379,36 +376,36 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
@@ -461,36 +458,36 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
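The renamed helper is defined elsewhere in the tree; based on its name and usage, a plausible standalone reading of mm_rotr256hi_1x32( hi, lo, 1 ) is sketched below (an assumption, not the project's actual macro):

#include <emmintrin.h>

// Treat lo:hi as one 256-bit value, rotate it right by one 32-bit lane,
// and return the resulting high 128 bits: the low lane of the old
// 256-bit value wraps around into the top lane of the result.
static inline __m128i rotr256hi_1x32( __m128i hi, __m128i lo )
{
    return _mm_or_si128( _mm_srli_si128( hi, 4 ),
                         _mm_slli_si128( lo, 12 ) );
}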


@@ -6,12 +6,11 @@ int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT| AVX2_OPT | SHA_OPT;
gate->optimizations = FOUR_WAY_OPT | SHA_OPT;
#if defined (SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#else
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif

View File

@@ -10,8 +10,14 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64)));
static __thread jh512_4way_context ctx_mid;
/*
void init_tribus_4way_ctx()
{
init_echo( &tribus_4way_ctx, 512 );
}
*/
void tribus_hash_4way(void *state, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
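ctx_mid is the usual midstate optimization: Tribus starts with JH over an 80-byte header whose first 64 bytes are constant per job, so that prefix can be absorbed once per work unit and reused for every nonce. A hedged sketch of the idea (the function name and call site are illustrative, not from this diff):

#include "algo/jh/jh-hash-4way.h"

// Absorb the constant 64-byte header prefix once per new work; the hash
// function then copies this context and processes only the final 16
// bytes (which contain the nonce) for each candidate.
static __thread jh512_4way_context jh_mid;

void tribus_4way_midstate( const void *input )
{
    jh512_4way_init( &jh_mid );
    jh512_4way( &jh_mid, input, 64 );
}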


@@ -1,22 +1,11 @@
#include "tribus-gate.h"
/*
bool tribus_thread_init()
{
sph_jh512_init( &tribus_ctx.jh );
sph_keccak512_init( &tribus_ctx.keccak );
#ifdef NO_AES_NI
sph_echo512_init( &tribus_ctx.echo );
#else
init_echo( &tribus_ctx.echo, 512 );
#endif
return true;
}
*/
bool register_tribus_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x1ffff;
#if defined (TRIBUS_4WAY)
// init_tribus_4way_ctx();
gate->scanhash = (void*)&scanhash_tribus_4way;
gate->hash = (void*)&tribus_hash_4way;
#else


@@ -4,12 +4,14 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#if defined(HASH_4WAY) && defined(__AES__)
#define TRIBUS_4WAY
#endif
#if defined(TRIBUS_4WAY)
//void init_tribus_4way_ctx();
void tribus_hash_4way( void *state, const void *input );
int scanhash_tribus_4way( int thr_id, struct work *work, uint32_t max_nonce,


@@ -4,6 +4,7 @@ bool register_whirlpool_algo( algo_gate_t* gate )
{
#if defined (WHIRLPOOL_4WAY)
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_whirlpool_4way;
gate->hash = (void*)&whirlpool_hash_4way;
#else


@@ -4,9 +4,11 @@
#include "algo-gate-api.h"
#include <stdint.h>
/*
#if defined(FOUR_WAY) && defined(__AVX2__)
#define WHIRLPOOL_4WAY
#endif
*/
#if defined (WHIRLPOOL_4WAY)


@@ -3345,8 +3345,10 @@ do { \
#define READ_STATE MUL8(READ_STATE_W)
#define ROUND0 MUL8(ROUND0_W)
#define UPDATE_STATE MUL8(UPDATE_STATE_W)
#define BYTE(x, n) \
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
//#define BYTE(x, n) \
// _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF)
// A very complex, but structured, expression with a mix of scalar
// and vector operations to retrieve specific 64 bit constants from
@@ -3357,23 +3359,51 @@ do { \
// Extract 64 bit vector elements from "in" representing offsets. Unmask the
// low byte of each and scale for use as vector indexes.
// Pack the data in a vector and return it.
/*
#define t_row( inv, row ) \
_mm256_and_si256( \
_mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) )
// Extract vector element from "lane" of vector "in[row]" and use it to index
// scalar array of constants "table" and return referenced 64 bit entry.
#define t_lane( table, inv, row, lane ) \
table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ]
// table[ t_row( inv, row )[ lane ] ];
*/
// Build a vector from elements of non-contiguous 64 bit data extracted from
// scalar "table".
// reference scalar version 1480 kH/s
/*
// version 1, extract with gather
// 955 kH/s
#define t_lane( inv, row, lane ) \
BYTE( _mm256_extract_epi64( inv, lane ), row ) \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
_mm256_i32gather_epi64( table, _mm_set_epi32( t_lane( inv, row, 3 ), \
t_lane( inv, row, 2 ), t_lane( inv, row, 1 ), \
t_lane( inv, row, 0) ), 1 )
*/
/*
// version 2, extract with set
// 1100 kH/s
#define t_lane( table, inv, row, lane ) \
table[ BYTE( _mm256_extract_epi64( inv, lane ), row ) ] \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
*/
// version 3, vector indexing with set
// 1105 kH/s
#define t_lane( table, inv, row, lane ) \
table[ BYTE( inv[ lane ], row ) ] \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
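Written out as a function, the version-3 lookup reads as follows (an illustrative rendering of the macros above, with the vector's four 64-bit lanes viewed through memory; not project code):

#include <stdint.h>
#include <immintrin.h>

// For each of the four lanes, take byte 'row' of the lane's 64-bit word,
// use it to index the 256-entry constant table, and pack the four
// fetched 64-bit constants back into one AVX2 vector.
static inline __m256i t_vec_lookup( const uint64_t table[256],
                                    const uint64_t inv[4], int row )
{
    return _mm256_set_epi64x(
        (long long)table[ ( inv[3] >> (8 * row) ) & 0xFF ],
        (long long)table[ ( inv[2] >> (8 * row) ) & 0xFF ],
        (long long)table[ ( inv[1] >> (8 * row) ) & 0xFF ],
        (long long)table[ ( inv[0] >> (8 * row) ) & 0xFF ] );
}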
#if SPH_SMALL_FOOTPRINT_WHIRLPOOL

algo/x11/c11-4way.c Normal file

@@ -0,0 +1,261 @@
#include "cpuminer-config.h"
#include "c11-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} c11_4way_ctx_holder;
c11_4way_ctx_holder c11_4way_ctx;
void init_c11_4way_ctx()
{
blake512_4way_init( &c11_4way_ctx.blake );
sph_bmw512_init( &c11_4way_ctx.bmw );
init_groestl( &c11_4way_ctx.groestl, 64 );
skein512_4way_init( &c11_4way_ctx.skein );
jh512_4way_init( &c11_4way_ctx.jh );
keccak512_4way_init( &c11_4way_ctx.keccak );
init_luffa( &c11_4way_ctx.luffa, 512 );
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_4way_ctx.shavite );
init_sd( &c11_4way_ctx.simd, 512 );
init_echo( &c11_4way_ctx.echo, 512 );
}
void c11_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
c11_4way_ctx_holder ctx;
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 5 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// 6 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
c11_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
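mm256_interleave_4x64 and mm256_deinterleave_4x64 are defined elsewhere in the tree; a scalar model of the layout they are assumed to produce (illustrative only):

#include <stdint.h>

// Pack four independent streams lane by lane so that each 256-bit vector
// holds 64-bit word i of all four streams; bit_len is the per-stream
// length in bits (640 for an 80-byte block header). Deinterleaving is
// the inverse permutation.
static void interleave_4x64( uint64_t *dst, const uint64_t *s0,
                             const uint64_t *s1, const uint64_t *s2,
                             const uint64_t *s3, int bit_len )
{
    for ( int i = 0; i < bit_len / 64; i++ )
    {
        dst[ 4*i     ] = s0[i];
        dst[ 4*i + 1 ] = s1[i];
        dst[ 4*i + 2 ] = s2[i];
        dst[ 4*i + 3 ] = s3[i];
    }
}

Under this layout, 32-bit header word 19 (the nonce) sits in 64-bit word 9, so lane 0's nonce lands at 32-bit offset 9*8 + 1 = 73 of vdata, matching noncep0 above; lanes 1-3 follow at 75, 77 and 79.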

algo/x11/c11-gate.c Normal file

@@ -0,0 +1,18 @@
#include "c11-gate.h"
bool register_c11_algo( algo_gate_t* gate )
{
#if defined (C11_4WAY)
init_c11_4way_ctx();
gate->scanhash = (void*)&scanhash_c11_4way;
gate->hash = (void*)&c11_4way_hash;
#else
init_c11_ctx();
gate->scanhash = (void*)&scanhash_c11;
gate->hash = (void*)&c11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x11/c11-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef C11_GATE_H__
#define C11_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define C11_4WAY
#endif
bool register_c11_algo( algo_gate_t* gate );
#if defined(C11_4WAY)
void c11_4way_hash( void *state, const void *input );
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_c11_4way_ctx();
#endif
void c11_hash( void *state, const void *input );
int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_c11_ctx();
#endif


@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "c11-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -64,7 +64,7 @@ void init_c11_ctx()
#endif
}
void c11hash( void *output, const void *input )
void c11_hash( void *output, const void *input )
{
unsigned char hash[128] _ALIGN(64); // uint32_t hashA[16], hashB[16];
// uint32_t _ALIGN(64) hash[16];
@@ -157,7 +157,7 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
do
{
be32enc( &endiandata[19], nonce );
c11hash( hash, endiandata );
c11_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
{
pdata[19] = nonce;
@@ -171,13 +171,3 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_c11_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_c11_ctx();
gate->scanhash = (void*)&scanhash_c11;
gate->hash = (void*)&c11hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x11/x11-4way.c Normal file

@@ -0,0 +1,261 @@
#include "cpuminer-config.h"
#include "x11-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11_4way_ctx_holder;
x11_4way_ctx_holder x11_4way_ctx;
void init_x11_4way_ctx()
{
blake512_4way_init( &x11_4way_ctx.blake );
sph_bmw512_init( &x11_4way_ctx.bmw );
init_groestl( &x11_4way_ctx.groestl, 64 );
skein512_4way_init( &x11_4way_ctx.skein );
jh512_4way_init( &x11_4way_ctx.jh );
keccak512_4way_init( &x11_4way_ctx.keccak );
init_luffa( &x11_4way_ctx.luffa, 512 );
cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_4way_ctx.shavite );
init_sd( &x11_4way_ctx.simd, 512 );
init_echo( &x11_4way_ctx.echo, 512 );
}
void x11_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11_4way_ctx_holder ctx;
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
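All of these scanhash loops share the same difficulty pre-filter: a mask is chosen from the share target so that most non-qualifying hashes are rejected with a single AND on the high word before the full 256-bit fulltest() comparison. As a scalar function, the selection reads:

#include <stdint.h>

// Pick the widest mask such that any hash with (hash[7] & mask) == 0
// could still satisfy the target word Htarg; only those candidates are
// passed on to the full 256-bit target comparison.
static uint32_t prefilter_mask( uint32_t Htarg )
{
    const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
    const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                               0xFFFFF000, 0xFFFF0000, 0 };
    for ( int m = 0; m < 6; m++ )
        if ( Htarg <= htmax[m] )
            return masks[m];
    return 0;
}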


@@ -5,13 +5,13 @@ bool register_x11_algo( algo_gate_t* gate )
#if defined (X11_4WAY)
init_x11_4way_ctx();
gate->scanhash = (void*)&scanhash_x11_4way;
gate->hash = (void*)&x11_hash_4way;
gate->hash = (void*)&x11_4way_hash;
#else
init_x11_ctx();
gate->scanhash = (void*)&scanhash_x11;
gate->hash = (void*)&x11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};


@@ -4,19 +4,21 @@
#include "algo-gate-api.h"
#include <stdint.h>
//#if defined(HASH_4WAY) && !defined(NO_AES_NI)
// #define X11_4WAY
//#endif
#if defined(HASH_4WAY) && defined(__AES__)
#define X11_4WAY
#endif
bool register_x11_algo( algo_gate_t* gate );
#if defined(X11_4WAY)
void x11_hash_4way( void *state, const void *input );
void x11_4way_hash( void *state, const void *input );
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11_4way_ctx();
#endif
void x11_hash( void *state, const void *input );

algo/x11/x11gost-4way.c Normal file

@@ -0,0 +1,268 @@
#include "cpuminer-config.h"
#include "x11gost-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
sph_gost512_context gost;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11gost_4way_ctx_holder;
x11gost_4way_ctx_holder x11gost_4way_ctx;
void init_x11gost_4way_ctx()
{
blake512_4way_init( &x11gost_4way_ctx.blake );
sph_bmw512_init( &x11gost_4way_ctx.bmw );
init_groestl( &x11gost_4way_ctx.groestl, 64 );
skein512_4way_init( &x11gost_4way_ctx.skein );
jh512_4way_init( &x11gost_4way_ctx.jh );
keccak512_4way_init( &x11gost_4way_ctx.keccak );
sph_gost512_init( &x11gost_4way_ctx.gost );
init_luffa( &x11gost_4way_ctx.luffa, 512 );
cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11gost_4way_ctx.shavite );
init_sd( &x11gost_4way_ctx.simd, 512 );
init_echo( &x11gost_4way_ctx.echo, 512 );
}
void x11gost_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11gost_4way_ctx_holder ctx;
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash2, 64 );
sph_gost512_close( &ctx.gost, hash2 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11gost_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x11/x11gost-gate.c Normal file

@@ -0,0 +1,18 @@
#include "x11gost-gate.h"
bool register_x11gost_algo( algo_gate_t* gate )
{
#if defined (X11GOST_4WAY)
init_x11gost_4way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_4way;
gate->hash = (void*)&x11gost_4way_hash;
#else
init_x11gost_ctx();
gate->scanhash = (void*)&scanhash_x11gost;
gate->hash = (void*)&x11gost_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x11/x11gost-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X11GOST_GATE_H__
#define X11GOST_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X11GOST_4WAY
#endif
bool register_x11gost_algo( algo_gate_t* gate );
#if defined(X11GOST_4WAY)
void x11gost_4way_hash( void *state, const void *input );
int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11gost_4way_ctx();
#endif
void x11gost_hash( void *state, const void *input );
int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11gost_ctx();
#endif


@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x11gost-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -37,28 +37,28 @@ typedef struct {
hashState_echo echo;
hashState_groestl groestl;
#endif
} sib_ctx_holder;
} x11gost_ctx_holder;
sib_ctx_holder sib_ctx;
x11gost_ctx_holder x11gost_ctx;
void init_sib_ctx()
void init_x11gost_ctx()
{
sph_gost512_init(&sib_ctx.gost);
sph_shavite512_init(&sib_ctx.shavite);
init_luffa( &sib_ctx.luffa, 512 );
cubehashInit( &sib_ctx.cube, 512, 16, 32 );
init_sd( &sib_ctx.simd, 512 );
sph_gost512_init( &x11gost_ctx.gost );
sph_shavite512_init( &x11gost_ctx.shavite );
init_luffa( &x11gost_ctx.luffa, 512 );
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
init_sd( &x11gost_ctx.simd, 512 );
#ifdef NO_AES_NI
sph_groestl512_init( &sib_ctx.groestl );
sph_echo512_init( &sib_ctx.echo );
sph_groestl512_init( &x11gost_ctx.groestl );
sph_echo512_init( &x11gost_ctx.echo );
#else
init_echo( &sib_ctx.echo, 512 );
init_groestl( &sib_ctx.groestl, 64 );
init_echo( &x11gost_ctx.echo, 512 );
init_groestl( &x11gost_ctx.groestl, 64 );
#endif
}
void sibhash(void *output, const void *input)
void x11gost_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (64)));
#define hashA hash
@@ -69,8 +69,8 @@ void sibhash(void *output, const void *input)
sph_u64 hashctA;
sph_u64 hashctB;
sib_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sib_ctx, sizeof(sib_ctx) );
x11gost_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) );
DECL_BLK;
BLK_I;
@@ -135,8 +135,8 @@ void sibhash(void *output, const void *input)
memcpy(output, hashA, 32);
}
int scanhash_sib(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -156,7 +156,7 @@ int scanhash_sib(int thr_id, struct work *work,
do {
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
sibhash(hash, endiandata);
x11gost_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
@@ -172,12 +172,3 @@ int scanhash_sib(int thr_id, struct work *work,
return 0;
}
bool register_sib_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_sib_ctx();
gate->scanhash = (void*)&scanhash_sib;
gate->hash = (void*)&sibhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

algo/x13/phi1612-4way.c Normal file

@@ -0,0 +1,186 @@
#include "x13-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
cubehashParam cube;
sph_fugue512_context fugue;
sph_gost512_context gost;
hashState_echo echo;
} phi1612_4way_ctx_holder;
phi1612_4way_ctx_holder phi1612_4way_ctx __attribute__ ((aligned (64)));
void init_phi1612_4way_ctx()
{
skein512_4way_init( &phi1612_4way_ctx.skein );
jh512_4way_init( &phi1612_4way_ctx.jh );
cubehashInit( &phi1612_4way_ctx.cube, 512, 16, 32 );
sph_fugue512_init( &phi1612_4way_ctx.fugue );
sph_gost512_init( &phi1612_4way_ctx.gost );
init_echo( &phi1612_4way_ctx.echo, 512 );
};
void phi1612_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
phi1612_4way_ctx_holder ctx;
memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) );
// Skein parallel 4way
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// Gost
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash2, 64 );
sph_gost512_close( &ctx.gost, hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
// Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t _ALIGN(64) endiandata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
phi1612_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x13/phi1612-gate.c Normal file

@@ -0,0 +1,18 @@
#include "phi1612-gate.h"
bool register_phi1612_algo( algo_gate_t* gate )
{
#if defined(PHI1612_4WAY)
init_phi1612_4way_ctx();
gate->scanhash = (void*)&scanhash_phi1612_4way;
gate->hash = (void*)&phi1612_4way_hash;
#else
init_phi1612_ctx();
gate->scanhash = (void*)&scanhash_phi1612;
gate->hash = (void*)&phi1612_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x13/phi1612-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef PHI1612_GATE_H__
#define PHI1612_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define PHI1612_4WAY
#endif
bool register_phi1612_algo( algo_gate_t* gate );
#if defined(PHI1612_4WAY)
void phi1612_4way_hash( void *state, const void *input );
int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_phi1612_4way_ctx();
#endif
void phi1612_hash( void *state, const void *input );
int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_phi1612_ctx();
#endif


@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "phi1612-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -33,7 +33,7 @@ phi_ctx_holder phi_ctx;
static __thread sph_skein512_context phi_skein_mid
__attribute__ ((aligned (64)));
void init_phi_ctx()
void init_phi1612_ctx()
{
sph_skein512_init( &phi_ctx.skein );
sph_jh512_init( &phi_ctx.jh );
@@ -53,7 +53,7 @@ void phi_skein_midstate( const void* input )
sph_skein512( &phi_skein_mid, input, 64 );
}
void phi1612hash(void *output, const void *input)
void phi1612_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (64)));
phi_ctx_holder ctx __attribute__ ((aligned (64)));
@@ -112,7 +112,7 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
do {
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
phi1612hash(hash, endiandata);
phi1612_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
@@ -128,12 +128,3 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_phi1612_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_phi_ctx();
gate->scanhash = (void*)&scanhash_phi1612;
gate->hash = (void*)&phi1612hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

293
algo/x13/x13-4way.c Normal file

@@ -0,0 +1,293 @@
#include "x13-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
} x13_4way_ctx_holder;
x13_4way_ctx_holder x13_4way_ctx __attribute__ ((aligned (64)));
void init_x13_4way_ctx()
{
blake512_4way_init( &x13_4way_ctx.blake );
sph_bmw512_init( &x13_4way_ctx.bmw );
init_groestl( &x13_4way_ctx.groestl, 64 );
skein512_4way_init( &x13_4way_ctx.skein );
jh512_4way_init( &x13_4way_ctx.jh );
keccak512_4way_init( &x13_4way_ctx.keccak );
init_luffa( &x13_4way_ctx.luffa, 512 );
cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13_4way_ctx.shavite );
init_sd( &x13_4way_ctx.simd, 512 );
init_echo( &x13_4way_ctx.echo, 512 );
sph_hamsi512_init( &x13_4way_ctx.hamsi );
sph_fugue512_init( &x13_4way_ctx.fugue );
};
void x13_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x13_4way_ctx_holder ctx;
memcpy( &ctx, &x13_4way_ctx, sizeof(x13_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
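// Paired fast-reject tables: pick the smallest mask whose bits must all
// be zero for this target, so most candidates fail the single AND test
// below without paying for the full 256-bit fulltest.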
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x13_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
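The pattern above, repeated in the x13sm3/x14/x15/x17 variants below, alternates 4-way SIMD stages with serial per-lane stages, converting between layouts at each boundary. A minimal sketch of the hand-off, with argument order as used in this commit (exact prototypes live in the tree's avxdefs.h, assumed here):

#include <stdint.h>
#include "avxdefs.h"   // mm256_interleave_4x64 / mm256_deinterleave_4x64 (assumed)

static void layout_roundtrip( void )
{
    uint64_t h0[8], h1[8], h2[8], h3[8];  // serial form, one 512-bit hash per lane
    uint64_t vhash[8*4];                  // 4-way interleaved vector form

    mm256_interleave_4x64( vhash, h0, h1, h2, h3, 512 );   // lanes -> vector
    // ... 4-way stages (skein, jh, keccak) operate on vhash ...
    mm256_deinterleave_4x64( h0, h1, h2, h3, vhash, 512 ); // vector -> lanes
    // ... serial stages (luffa, cubehash, ...) run on each hN ...
}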

18
algo/x13/x13-gate.c Normal file

@@ -0,0 +1,18 @@
#include "x13-gate.h"
bool register_x13_algo( algo_gate_t* gate )
{
#if defined (X13_4WAY)
init_x13_4way_ctx();
gate->scanhash = (void*)&scanhash_x13_4way;
gate->hash = (void*)&x13_4way_hash;
#else
init_x13_ctx();
gate->scanhash = (void*)&scanhash_x13;
gate->hash = (void*)&x13hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x13/x13-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X13_GATE_H__
#define X13_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X13_4WAY
#endif
bool register_x13_algo( algo_gate_t* gate );
#if defined(X13_4WAY)
void x13_4way_hash( void *state, const void *input );
int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13_4way_ctx();
#endif
void x13hash( void *state, const void *input );
int scanhash_x13( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13_ctx();
#endif

algo/x13/x13.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x13-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -68,7 +68,7 @@ void init_x13_ctx()
sph_fugue512_init( &x13_ctx.fugue );
};
static void x13hash(void *output, const void *input)
void x13hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
@@ -249,15 +249,3 @@ int scanhash_x13(int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
bool register_x13_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x13_ctx();
gate->scanhash = (void*)&scanhash_x13;
gate->hash = (void*)&x13hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

328
algo/x13/x13sm3-4way.c Normal file

@@ -0,0 +1,328 @@
#include "x13sm3-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/sm3/sph_sm3.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sm3_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
} x13sm3_4way_ctx_holder;
x13sm3_4way_ctx_holder x13sm3_4way_ctx __attribute__ ((aligned (64)));
static __thread blake512_4way_context x13sm3_ctx_mid;
void init_x13sm3_4way_ctx()
{
blake512_4way_init( &x13sm3_4way_ctx.blake );
sph_bmw512_init( &x13sm3_4way_ctx.bmw );
init_groestl( &x13sm3_4way_ctx.groestl, 64 );
skein512_4way_init( &x13sm3_4way_ctx.skein );
jh512_4way_init( &x13sm3_4way_ctx.jh );
keccak512_4way_init( &x13sm3_4way_ctx.keccak );
init_luffa( &x13sm3_4way_ctx.luffa, 512 );
cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13sm3_4way_ctx.shavite );
init_sd( &x13sm3_4way_ctx.simd, 512 );
init_echo( &x13sm3_4way_ctx.echo, 512 );
sm3_init( &x13sm3_4way_ctx.sm3 );
sph_hamsi512_init( &x13sm3_4way_ctx.hamsi );
sph_fugue512_init( &x13sm3_4way_ctx.fugue );
};
void x13sm3_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x13sm3_4way_ctx_holder ctx;
memcpy( &ctx, &x13sm3_4way_ctx, sizeof(x13sm3_4way_ctx) );
// Blake
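// The saved midstate already covers the first 64 header bytes of each
// lane; in the 4-way interleaved input those occupy 64*4 = (64<<2) bytes,
// leaving only the 16-byte tail (words 16..19, including the nonce).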
memcpy( &ctx.blake, &x13sm3_ctx_mid, sizeof(x13sm3_ctx_mid) );
blake512_4way( &ctx.blake, input + (64<<2), 16 );
// blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// SM3
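// SM3 is a 256-bit hash but the following Hamsi stage consumes 64 bytes,
// so the output buffers are zeroed first and the digest is effectively
// zero-padded from 32 to 64 bytes.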
uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
memset( sm3_hash0, 0, sizeof sm3_hash0 );
uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
memset( sm3_hash1, 0, sizeof sm3_hash1 );
uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
memset( sm3_hash2, 0, sizeof sm3_hash2 );
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 );
sph_sm3( &ctx.sm3, hash0, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash0 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash1, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash1 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash2, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash2 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash3, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash3 );
// Hamsi
sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, sm3_hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, sm3_hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, sm3_hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
blake512_4way_init( &x13sm3_ctx_mid );
blake512_4way( &x13sm3_ctx_mid, vdata, 64 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x13sm3_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
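x13sm3 is the only variant here that precomputes a Blake midstate: the first 64 header bytes are nonce-independent, so they are absorbed once per work and each attempt hashes only the 16-byte tail per lane. A minimal sketch of the pattern, assuming the 4-way Blake API used above (function names are illustrative):

#include <string.h>
#include "algo/blake/blake-hash-4way.h"

static __thread blake512_4way_context blake_mid;

static void midstate_setup( const void *vdata )   // once per new work
{
    blake512_4way_init( &blake_mid );
    blake512_4way( &blake_mid, vdata, 64 );       // constant 64-byte prefix
}

static void midstate_hash( const void *vdata, void *vhash )  // per attempt
{
    blake512_4way_context ctx;
    memcpy( &ctx, &blake_mid, sizeof blake_mid );  // restore saved midstate
    // 4 lanes interleaved: skip 64*4 bytes, absorb the 16-byte tail
    blake512_4way( &ctx, (const char*)vdata + (64<<2), 16 );
    blake512_4way_close( &ctx, vhash );
}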

18
algo/x13/x13sm3-gate.c Normal file

@@ -0,0 +1,18 @@
#include "x13sm3-gate.h"
bool register_x13sm3_algo( algo_gate_t* gate )
{
#if defined (X13SM3_4WAY)
init_x13sm3_4way_ctx();
gate->scanhash = (void*)&scanhash_x13sm3_4way;
gate->hash = (void*)&x13sm3_4way_hash;
#else
init_x13sm3_ctx();
gate->scanhash = (void*)&scanhash_x13sm3;
gate->hash = (void*)&x13sm3_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x13/x13sm3-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X13SM3_GATE_H__
#define X13SM3_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X13SM3_4WAY
#endif
bool register_x13sm3_algo( algo_gate_t* gate );
#if defined(X13SM3_4WAY)
void x13sm3_4way_hash( void *state, const void *input );
int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13sm3_4way_ctx();
#endif
void x13sm3_hash( void *state, const void *input );
int scanhash_x13sm3( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13sm3_ctx();
#endif

algo/x13/x13sm3.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x13sm3-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -49,7 +49,7 @@ typedef struct {
hsr_ctx_holder hsr_ctx;
void init_hsr_ctx()
void init_x13sm3_ctx()
{
#ifdef NO_AES_NI
sph_groestl512_init(&hsr_ctx.groestl);
@@ -67,7 +67,7 @@ void init_hsr_ctx()
sph_fugue512_init(&hsr_ctx.fugue);
};
static void x13sm3hash(void *output, const void *input)
void x13sm3_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
@@ -213,7 +213,7 @@ int scanhash_x13sm3( int thr_id, struct work *work,
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
x13sm3hash(hash64, endiandata);
x13sm3_hash(hash64, endiandata);
#ifndef DEBUG_ALGO
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
@@ -240,13 +240,3 @@ int scanhash_x13sm3( int thr_id, struct work *work,
return 0;
}
bool register_x13sm3_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_hsr_ctx();
gate->scanhash = (void*)&scanhash_x13sm3;
gate->hash = (void*)&x13sm3hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

310
algo/x14/x14-4way.c Normal file

@@ -0,0 +1,310 @@
#include "x14-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
} x14_4way_ctx_holder;
x14_4way_ctx_holder x14_4way_ctx __attribute__ ((aligned (64)));
void init_x14_4way_ctx()
{
blake512_4way_init( &x14_4way_ctx.blake );
sph_bmw512_init( &x14_4way_ctx.bmw );
init_groestl( &x14_4way_ctx.groestl, 64 );
skein512_4way_init( &x14_4way_ctx.skein );
jh512_4way_init( &x14_4way_ctx.jh );
keccak512_4way_init( &x14_4way_ctx.keccak );
init_luffa( &x14_4way_ctx.luffa, 512 );
cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x14_4way_ctx.shavite );
init_sd( &x14_4way_ctx.simd, 512 );
init_echo( &x14_4way_ctx.echo, 512 );
sph_hamsi512_init( &x14_4way_ctx.hamsi );
sph_fugue512_init( &x14_4way_ctx.fugue );
sph_shabal512_init( &x14_4way_ctx.shabal );
};
void x14_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x14_4way_ctx_holder ctx;
memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal
sph_shabal512( &ctx.shabal, hash0, 64 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash1, 64 );
sph_shabal512_close( &ctx.shabal, hash1 );
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash2, 64 );
sph_shabal512_close( &ctx.shabal, hash2 );
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash3, 64 );
sph_shabal512_close( &ctx.shabal, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x14_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
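Every serial stage in these hashes clones a statically initialized context with memcpy rather than re-running the algorithm's init per lane: copying a ready state is cheaper than recomputing it. A minimal sketch of the pattern using the BMW API from above (names are illustrative):

#include <string.h>
#include "algo/bmw/sph_bmw.h"

static sph_bmw512_context bmw_ref;                // initialized once

static void bmw_ref_init( void ) { sph_bmw512_init( &bmw_ref ); }

static void bmw_one_lane( void *hash64 )          // 64-byte in/out buffer
{
    sph_bmw512_context c;
    memcpy( &c, &bmw_ref, sizeof c );             // cheaper than re-init
    sph_bmw512( &c, hash64, 64 );
    sph_bmw512_close( &c, hash64 );
}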

18
algo/x14/x14-gate.c Normal file

@@ -0,0 +1,18 @@
#include "x14-gate.h"
bool register_x14_algo( algo_gate_t* gate )
{
#if defined (X14_4WAY)
init_x14_4way_ctx();
gate->scanhash = (void*)&scanhash_x14_4way;
gate->hash = (void*)&x14_4way_hash;
#else
init_x14_ctx();
gate->scanhash = (void*)&scanhash_x14;
gate->hash = (void*)&x14hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x14/x14-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X14_GATE_H__
#define X14_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X14_4WAY
#endif
bool register_x14_algo( algo_gate_t* gate );
#if defined(X14_4WAY)
void x14_4way_hash( void *state, const void *input );
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x14_4way_ctx();
#endif
void x14hash( void *state, const void *input );
int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x14_ctx();
#endif

algo/x14/x14.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x14-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -72,7 +72,7 @@ void init_x14_ctx()
sph_shabal512_init(&x14_ctx.shabal);
};
static void x14hash(void *output, const void *input)
void x14hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
@@ -248,14 +248,3 @@ int scanhash_x14(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
bool register_x14_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x14_ctx();
gate->scanhash = (void*)&scanhash_x14;
gate->hash = (void*)&x14hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

329
algo/x15/x15-4way.c Normal file

@@ -0,0 +1,329 @@
#include "x15-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
} x15_4way_ctx_holder;
x15_4way_ctx_holder x15_4way_ctx __attribute__ ((aligned (64)));
void init_x15_4way_ctx()
{
blake512_4way_init( &x15_4way_ctx.blake );
sph_bmw512_init( &x15_4way_ctx.bmw );
init_groestl( &x15_4way_ctx.groestl, 64 );
skein512_4way_init( &x15_4way_ctx.skein );
jh512_4way_init( &x15_4way_ctx.jh );
keccak512_4way_init( &x15_4way_ctx.keccak );
init_luffa( &x15_4way_ctx.luffa, 512 );
cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x15_4way_ctx.shavite );
init_sd( &x15_4way_ctx.simd, 512 );
init_echo( &x15_4way_ctx.echo, 512 );
sph_hamsi512_init( &x15_4way_ctx.hamsi );
sph_fugue512_init( &x15_4way_ctx.fugue );
sph_shabal512_init( &x15_4way_ctx.shabal );
sph_whirlpool_init( &x15_4way_ctx.whirlpool );
};
void x15_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x15_4way_ctx_holder ctx;
memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal
sph_shabal512( &ctx.shabal, hash0, 64 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash1, 64 );
sph_shabal512_close( &ctx.shabal, hash1 );
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash2, 64 );
sph_shabal512_close( &ctx.shabal, hash2 );
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash3, 64 );
sph_shabal512_close( &ctx.shabal, hash3 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x15_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
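The four per-lane result checks repeat verbatim in every scanhash in this commit. A hypothetical refactoring sketch (check_lane is not in the tree; struct work, fulltest and work_set_target_ratio are assumed to come from miner.h as used elsewhere in the source):

#include <stdint.h>
#include <stdbool.h>
#include "miner.h"   // struct work, fulltest, work_set_target_ratio (assumed)

static inline void check_lane( struct work *work, uint32_t *lane_hash,
                               uint32_t mask, uint32_t lane_nonce,
                               int lane, int *num_found )
{
    if ( ( lane_hash[7] & mask ) == 0 && fulltest( lane_hash, work->target ) )
    {
        work->nfound[lane] = true;                // flag this lane's result
        work->nonces[lane] = lane_nonce;
        (*num_found)++;
        work_set_target_ratio( work, lane_hash );
    }
}

// usage inside the scan loop:
//    for ( int i = 0; i < 4; i++ )
//       check_lane( work, hash + 8*i, mask, n + i, i, &num_found );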

17
algo/x15/x15-gate.c Normal file

@@ -0,0 +1,17 @@
#include "x15-gate.h"
bool register_x15_algo( algo_gate_t* gate )
{
#if defined (X15_4WAY)
init_x15_4way_ctx();
gate->scanhash = (void*)&scanhash_x15_4way;
gate->hash = (void*)&x15_4way_hash;
#else
init_x15_ctx();
gate->scanhash = (void*)&scanhash_x15;
gate->hash = (void*)&x15hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

32
algo/x15/x15-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X15_GATE_H__
#define X15_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X15_4WAY
#endif
bool register_x15_algo( algo_gate_t* gate );
#if defined(X15_4WAY)
void x15_4way_hash( void *state, const void *input );
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x15_4way_ctx();
#endif
void x15hash( void *state, const void *input );
int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x15_ctx();
#endif

algo/x15/x15.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x15-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -74,7 +74,7 @@ void init_x15_ctx()
sph_whirlpool_init( &x15_ctx.whirlpool );
};
static void x15hash(void *output, const void *input)
void x15hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
@@ -260,13 +260,3 @@ int scanhash_x15(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
bool register_x15_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x15_ctx();
gate->scanhash = (void*)&scanhash_x15;
gate->hash = (void*)&x15hash;
return true;
};

364
algo/x17/x17-4way.c Normal file

@@ -0,0 +1,364 @@
#include "x17-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include <openssl/sha.h>
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
} x17_4way_ctx_holder;
x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64)));
void init_x17_4way_ctx()
{
blake512_4way_init( &x17_4way_ctx.blake );
sph_bmw512_init( &x17_4way_ctx.bmw );
init_groestl( &x17_4way_ctx.groestl, 64 );
skein512_4way_init( &x17_4way_ctx.skein );
jh512_4way_init( &x17_4way_ctx.jh );
keccak512_4way_init( &x17_4way_ctx.keccak );
init_luffa( &x17_4way_ctx.luffa, 512 );
cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x17_4way_ctx.shavite );
init_sd( &x17_4way_ctx.simd, 512 );
init_echo( &x17_4way_ctx.echo, 512 );
sph_hamsi512_init( &x17_4way_ctx.hamsi );
sph_fugue512_init( &x17_4way_ctx.fugue );
sph_shabal512_init( &x17_4way_ctx.shabal );
sph_whirlpool_init( &x17_4way_ctx.whirlpool );
SHA512_Init( &x17_4way_ctx.sha512 );
sph_haval256_5_init( &x17_4way_ctx.haval );
};
void x17_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x17_4way_ctx_holder ctx;
memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal
sph_shabal512( &ctx.shabal, hash0, 64 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash1, 64 );
sph_shabal512_close( &ctx.shabal, hash1 );
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash2, 64 );
sph_shabal512_close( &ctx.shabal, hash2 );
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash3, 64 );
sph_shabal512_close( &ctx.shabal, hash3 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 16 SHA512
SHA512_Update( &ctx.sha512, hash0, 64 );
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash1, 64 );
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash2, 64 );
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash3, 64 );
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
// 17 Haval
sph_haval256_5( &ctx.haval, (const void*)hash0, 64 );
sph_haval256_5_close( &ctx.haval, hash0 );
memcpy( &ctx.haval, &x17_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash1, 64 );
sph_haval256_5_close( &ctx.haval, hash1 );
memcpy( &ctx.haval, &x17_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash2, 64 );
sph_haval256_5_close( &ctx.haval, hash2 );
memcpy( &ctx.haval, &x17_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash3, 64 );
sph_haval256_5_close( &ctx.haval, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
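/* For reference, a plain-C sketch of what the (de)interleave helpers are
   assumed to do here: mm256_interleave_4x64 packs the i-th 64-bit word of
   each of the four lane buffers into four consecutive words of the vector
   buffer, and mm256_deinterleave_4x64 reverses it. Hypothetical scalar
   equivalent:

      void interleave_4x64_ref( uint64_t *v, const uint64_t *s0,
                                const uint64_t *s1, const uint64_t *s2,
                                const uint64_t *s3, int bit_len )
      {
         for ( int i = 0; i < bit_len/64; i++ )
         {
            v[ 4*i     ] = s0[i];   v[ 4*i + 1 ] = s1[i];
            v[ 4*i + 2 ] = s2[i];   v[ 4*i + 3 ] = s3[i];
         }
      }
*/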
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
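// The htmax/masks tables pair each target ceiling with the widest
// high-bit mask a solution must zero, so the loop below rejects most
// non-solutions with a single AND before calling fulltest().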
// big endian encode the 20 words of the block header; word 19 (the
// nonce) is rewritten into vdata for each lane in the search loop.
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x17_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x17/x17-gate.c Normal file

@@ -0,0 +1,17 @@
#include "x17-gate.h"
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_4WAY)
init_x17_4way_ctx();
gate->scanhash = (void*)&scanhash_x17_4way;
gate->hash = (void*)&x17_4way_hash;
#else
init_x17_ctx();
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

algo/x17/x17-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X17_GATE_H__
#define X17_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X17_4WAY
#endif
bool register_x17_algo( algo_gate_t* gate );
#if defined(X17_4WAY)
void x17_4way_hash( void *state, const void *input );
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x17_4way_ctx();
#endif
void x17_hash( void *state, const void *input );
int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x17_ctx();
#endif

algo/x17/x17.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x17-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -86,7 +86,7 @@ void init_x17_ctx()
sph_haval256_5_init(&x17_ctx.haval);
};
static void x17hash(void *output, const void *input)
void x17_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (64)));
#define hashB hash+64
@@ -248,7 +248,7 @@ int scanhash_x17(int thr_id, struct work *work,
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
x17hash(hash64, endiandata);
x17_hash(hash64, endiandata);
#ifndef DEBUG_ALGO
if (!(hash64[7] & mask))
{
@@ -281,7 +281,7 @@ int scanhash_x17(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
/*
bool register_x17_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
@@ -290,4 +290,4 @@ bool register_x17_algo( algo_gate_t* gate )
gate->hash = (void*)&x17hash;
return true;
};
*/

algo/x17/xevan-4way.c Normal file

@@ -0,0 +1,556 @@
#include "xevan-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include <openssl/sha.h>
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
} xevan_4way_ctx_holder;
xevan_4way_ctx_holder xevan_4way_ctx __attribute__ ((aligned (64)));
static __thread blake512_4way_context xevan_blake_4way_mid
__attribute__ ((aligned (64)));
void init_xevan_4way_ctx()
{
blake512_4way_init(&xevan_4way_ctx.blake);
sph_bmw512_init(&xevan_4way_ctx.bmw);
init_groestl( &xevan_4way_ctx.groestl, 64 );
skein512_4way_init(&xevan_4way_ctx.skein);
jh512_4way_init(&xevan_4way_ctx.jh);
keccak512_4way_init(&xevan_4way_ctx.keccak);
init_luffa( &xevan_4way_ctx.luffa, 512 );
cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &xevan_4way_ctx.shavite );
init_sd( &xevan_4way_ctx.simd, 512 );
init_echo( &xevan_4way_ctx.echo, 512 );
sph_hamsi512_init( &xevan_4way_ctx.hamsi );
sph_fugue512_init( &xevan_4way_ctx.fugue );
sph_shabal512_init( &xevan_4way_ctx.shabal );
sph_whirlpool_init( &xevan_4way_ctx.whirlpool );
SHA512_Init( &xevan_4way_ctx.sha512 );
sph_haval256_5_init( &xevan_4way_ctx.haval );
};
void xevan_4way_blake512_midstate( const void* input )
{
memcpy( &xevan_blake_4way_mid, &xevan_4way_ctx.blake,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &xevan_blake_4way_mid, input, 64 );
}
void xevan_4way_hash( void *output, const void *input )
{
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
const int dataLen = 128;
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
xevan_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
memcpy( &ctx.blake, &xevan_blake_4way_mid,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &ctx.blake, input + (midlen<<2), tail );
blake512_4way_close(&ctx.blake, vhash);
memset( &vhash[8<<2], 0, 64<<2 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_bmw512( &ctx.bmw, hash0, dataLen );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, dataLen );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, dataLen );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, dataLen );
sph_bmw512_close( &ctx.bmw, hash3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
sph_hamsi512( &ctx.hamsi, hash0, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash3 );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_shabal512( &ctx.shabal, hash0, dataLen );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash1, dataLen );
sph_shabal512_close( &ctx.shabal, hash1 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash2, dataLen );
sph_shabal512_close( &ctx.shabal, hash2 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash3, dataLen );
sph_shabal512_close( &ctx.shabal, hash3 );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
SHA512_Update( &ctx.sha512, hash0, dataLen );
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash1, dataLen );
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash2, dataLen );
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash3, dataLen );
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen );
sph_haval256_5_close( &ctx.haval, hash0 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen );
sph_haval256_5_close( &ctx.haval, hash1 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen );
sph_haval256_5_close( &ctx.haval, hash2 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen );
sph_haval256_5_close( &ctx.haval, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
blake512_4way( &ctx.blake, vhash, dataLen );
blake512_4way_close(&ctx.blake, vhash);
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_bmw512( &ctx.bmw, hash0, dataLen );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, dataLen );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, dataLen );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, dataLen );
sph_bmw512_close( &ctx.bmw, hash3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
sph_hamsi512( &ctx.hamsi, hash0, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash3 );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_shabal512( &ctx.shabal, hash0, dataLen );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash1, dataLen );
sph_shabal512_close( &ctx.shabal, hash1 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash2, dataLen );
sph_shabal512_close( &ctx.shabal, hash2 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash3, dataLen );
sph_shabal512_close( &ctx.shabal, hash3 );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
SHA512_Update( &ctx.sha512, hash0, dataLen );
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash1, dataLen );
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash2, dataLen );
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash3, dataLen );
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen );
sph_haval256_5_close( &ctx.haval, hash0 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen );
sph_haval256_5_close( &ctx.haval, hash1 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen );
sph_haval256_5_close( &ctx.haval, hash2 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen );
sph_haval256_5_close( &ctx.haval, hash3 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
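/* Flow note: Xevan runs the full 17-algo chain twice over fixed 128-byte
   (dataLen) blocks. Each 64-byte intermediate digest is left zero-padded
   to 128 bytes for the next stage, the 32-byte Haval digest is re-padded
   before the second pass, and only 32 bytes per lane are returned. */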
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
// uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
xevan_4way_blake512_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
xevan_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] <= Htarg ) && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] <= Htarg ) && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] <= Htarg ) && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] <= Htarg ) && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x17/xevan-gate.c Normal file

@@ -0,0 +1,24 @@
#include "xevan-gate.h"
void xevan_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
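/* Worked example: with job_diff = 1 and opt_diff_factor = 1 this sets a
   share target of 1/256, i.e. 256 times easier than the canonical diff-1
   target, presumably to match Xevan pools' difficulty convention. */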
bool register_xevan_algo( algo_gate_t* gate )
{
#if defined (XEVAN_4WAY)
init_xevan_4way_ctx();
gate->scanhash = (void*)&scanhash_xevan_4way;
gate->hash = (void*)&xevan_4way_hash;
#else
init_xevan_ctx();
gate->scanhash = (void*)&scanhash_xevan;
gate->hash = (void*)&xevan_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->set_target = (void*)&xevan_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

algo/x17/xevan-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef XEVAN_GATE_H__
#define XEVAN_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define XEVAN_4WAY
#endif
bool register_xevan_algo( algo_gate_t* gate );
#if defined(XEVAN_4WAY)
void xevan_4way_hash( void *state, const void *input );
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_xevan_4way_ctx();
#endif
void xevan_hash( void *state, const void *input );
int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_xevan_ctx();
#endif

algo/x17/xevan.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "xevan-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -286,19 +286,3 @@ int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *
return 0;
}
void xevan_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_xevan_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_xevan_ctx();
gate->scanhash = (void*)&scanhash_xevan;
gate->hash = (void*)&xevan_hash;
gate->set_target = (void*)&xevan_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};


@@ -1,935 +0,0 @@
/*-
* Copyright 2009 Colin Percival
* Copyright 2013,2014 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include "sha256_Y.h"
#include "sysendian.h"
#include "yescrypt-platform.h"
static __inline void blkcpy(uint64_t * dest, const uint64_t * src, size_t count)
{
do {
*dest++ = *src++; *dest++ = *src++;
*dest++ = *src++; *dest++ = *src++;
} while (count -= 4);
}
static __inline void blkxor(uint64_t * dest, const uint64_t * src, size_t count)
{
do {
*dest++ ^= *src++; *dest++ ^= *src++;
*dest++ ^= *src++; *dest++ ^= *src++;
} while (count -= 4);
}
typedef union {
uint32_t w[16];
uint64_t d[8];
} salsa20_blk_t;
static __inline void salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout)
{
#define COMBINE(out, in1, in2) \
Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32);
COMBINE(0, 0, 2)
COMBINE(1, 5, 7)
COMBINE(2, 2, 4)
COMBINE(3, 7, 1)
COMBINE(4, 4, 6)
COMBINE(5, 1, 3)
COMBINE(6, 6, 0)
COMBINE(7, 3, 5)
#undef COMBINE
}
static __inline void salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout)
{
#define COMBINE(out, in1, in2) \
Bout->w[out * 2] = (uint32_t) Bin->d[in1]; \
Bout->w[out * 2 + 1] = Bin->d[in2] >> 32;
COMBINE(0, 0, 6)
COMBINE(1, 5, 3)
COMBINE(2, 2, 0)
COMBINE(3, 7, 5)
COMBINE(4, 4, 2)
COMBINE(5, 1, 7)
COMBINE(6, 6, 4)
COMBINE(7, 3, 1)
#undef COMBINE
}
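/* salsa20_simd_shuffle and salsa20_simd_unshuffle are exact inverses:
   unshuffle(shuffle(B)) == B. The shuffled layout stores each 64-bit
   word as a pair of 32-bit halves drawn from different positions, e.g.
   d[0] = w[0] | ((uint64_t)w[5] << 32). */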
/**
* salsa20_8(B):
* Apply the salsa20/8 core to the provided block.
*/
static void salsa20_8(uint64_t B[8])
{
size_t i;
salsa20_blk_t X;
#define x X.w
salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X);
for (i = 0; i < 8; i += 2) {
#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
/* Operate on columns */
x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9);
x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18);
x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9);
x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18);
x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9);
x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18);
x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9);
x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18);
/* Operate on rows */
x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9);
x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18);
x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9);
x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18);
x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9);
x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18);
x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9);
x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18);
#undef R
}
#undef x
{
salsa20_blk_t Y;
salsa20_simd_shuffle(&X, &Y);
for (i = 0; i < 16; i += 4) {
((salsa20_blk_t *)B)->w[i] += Y.w[i];
((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1];
((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2];
((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3];
}
}
}
/**
* blockmix_salsa8(Bin, Bout, X, r):
* Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r
* bytes in length; the output Bout must also be the same size. The
* temporary space X must be 64 bytes.
*/
static void
blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r)
{
size_t i;
/* 1: X <-- B_{2r - 1} */
blkcpy(X, &Bin[(2 * r - 1) * 8], 8);
/* 2: for i = 0 to 2r - 1 do */
for (i = 0; i < 2 * r; i += 2) {
/* 3: X <-- H(X \xor B_i) */
blkxor(X, &Bin[i * 8], 8);
salsa20_8(X);
/* 4: Y_i <-- X */
/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
blkcpy(&Bout[i * 4], X, 8);
/* 3: X <-- H(X \xor B_i) */
blkxor(X, &Bin[i * 8 + 8], 8);
salsa20_8(X);
/* 4: Y_i <-- X */
/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
blkcpy(&Bout[i * 4 + r * 8], X, 8);
}
}
/* These are tunable */
#define S_BITS 8
#define S_SIMD 2
#define S_P 4
#define S_ROUNDS 6
/* Number of S-boxes. Not tunable, hard-coded in a few places. */
#define S_N 2
/* Derived values. Not tunable on their own. */
#define S_SIZE1 (1 << S_BITS)
#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8)
#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK)
#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD)
#define S_P_SIZE (S_P * S_SIMD)
#define S_MIN_R ((S_P * S_SIMD + 15) / 16)
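/* With the defaults above: S_SIZE1 = 256, S_MASK = 0xFF0,
   S_SIZE_ALL = 1024 64-bit words (8 KiB of S-boxes), S_P_SIZE = 8,
   and S_MIN_R = 1. */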
/**
* pwxform(B):
* Transform the provided block using the provided S-boxes.
*/
static void block_pwxform(uint64_t * B, const uint64_t * S)
{
uint64_t (*X)[S_SIMD] = (uint64_t (*)[S_SIMD])B;
const uint8_t *S0 = (const uint8_t *)S;
const uint8_t *S1 = (const uint8_t *)(S + S_SIZE1 * S_SIMD);
size_t i, j;
#if S_SIMD > 2
size_t k;
#endif
for (j = 0; j < S_P; j++) {
uint64_t *Xj = X[j];
uint64_t x0 = Xj[0];
#if S_SIMD > 1
uint64_t x1 = Xj[1];
#endif
for (i = 0; i < S_ROUNDS; i++) {
uint64_t x = x0 & S_MASK2;
const uint64_t *p0, *p1;
p0 = (const uint64_t *)(S0 + (uint32_t)x);
p1 = (const uint64_t *)(S1 + (x >> 32));
x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0;
x0 += p0[0];
x0 ^= p1[0];
#if S_SIMD > 1
x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1;
x1 += p0[1];
x1 ^= p1[1];
#endif
#if S_SIMD > 2
for (k = 2; k < S_SIMD; k++) {
x = Xj[k];
x = (uint64_t)(x >> 32) * (uint32_t)x;
x += p0[k];
x ^= p1[k];
Xj[k] = x;
}
#endif
}
Xj[0] = x0;
#if S_SIMD > 1
Xj[1] = x1;
#endif
}
}
/**
* blockmix_pwxform(Bin, Bout, S, r):
* Compute Bout = BlockMix_pwxform{salsa20/8, S, r}(Bin). The input Bin must
* be 128r bytes in length; the output Bout must also be the same size.
*
* S lacks the const qualifier to match blockmix_salsa8()'s prototype, since
* we need to refer to both functions via the same function pointer type.
*/
static void blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, uint64_t * S, size_t r)
{
size_t r1, r2, i;
/* Convert 128-byte blocks to (S_P_SIZE * 64-bit) blocks */
r1 = r * 128 / (S_P_SIZE * 8);
/* X <-- B_{r1 - 1} */
blkcpy(Bout, &Bin[(r1 - 1) * S_P_SIZE], S_P_SIZE);
/* X <-- X \xor B_i */
blkxor(Bout, Bin, S_P_SIZE);
/* X <-- H'(X) */
/* B'_i <-- X */
block_pwxform(Bout, S);
/* for i = 0 to r1 - 1 do */
for (i = 1; i < r1; i++) {
/* X <-- X \xor B_i */
blkcpy(&Bout[i * S_P_SIZE], &Bout[(i - 1) * S_P_SIZE],
S_P_SIZE);
blkxor(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], S_P_SIZE);
/* X <-- H'(X) */
/* B'_i <-- X */
block_pwxform(&Bout[i * S_P_SIZE], S);
}
/* Handle partial blocks */
if (i * S_P_SIZE < r * 16)
blkcpy(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE],
r * 16 - i * S_P_SIZE);
i = (r1 - 1) * S_P_SIZE / 8;
/* Convert 128-byte blocks to 64-byte blocks */
r2 = r * 2;
/* B'_i <-- H(B'_i) */
salsa20_8(&Bout[i * 8]);
i++;
for (; i < r2; i++) {
/* B'_i <-- H(B'_i \xor B'_{i-1}) */
blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8);
salsa20_8(&Bout[i * 8]);
}
}
/**
* integerify(B, r):
* Return the result of parsing B_{2r-1} as a little-endian integer.
*/
static __inline uint64_t
integerify(const uint64_t * B, size_t r)
{
/*
* Our 64-bit words are in host byte order, and word 6 holds the second 32-bit
* word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also
* in host byte order, as it should be.
*/
const uint64_t * X = &B[(2 * r - 1) * 8];
uint32_t lo = (uint32_t) X[0];
uint32_t hi = (uint32_t) (X[6] >> 32);
return ((uint64_t)hi << 32) + lo;
}
/**
* smix1(B, r, N, flags, V, NROM, shared, XY, S):
* Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage XY must be 256r + 64 bytes in length. The value N must be even and
* no smaller than 2.
*/
static void
smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags,
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
uint64_t * XY, uint64_t * S)
{
void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) =
(S ? blockmix_pwxform : blockmix_salsa8);
const uint64_t * VROM = shared->shared1.aligned;
uint32_t VROM_mask = shared->mask1;
size_t s = 16 * r;
uint64_t * X = V;
uint64_t * Y = &XY[s];
uint64_t * Z = S ? S : &XY[2 * s];
uint64_t n, i, j;
size_t k;
/* 1: X <-- B */
/* 3: V_i <-- X */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8];
for (k = 0; k < 16; k++)
tmp->w[k] = le32dec(&src->w[k]);
salsa20_simd_shuffle(tmp, dst);
}
/* 4: X <-- H(X) */
/* 3: V_i <-- X */
blockmix(X, Y, Z, r);
blkcpy(&V[s], Y, s);
X = XY;
if (NROM && (VROM_mask & 1)) {
if ((1 & VROM_mask) == 1) {
/* j <-- Integerify(X) mod NROM */
j = integerify(Y, r) & (NROM - 1);
/* X <-- H(X \xor VROM_j) */
blkxor(Y, &VROM[j * s], s);
}
blockmix(Y, X, Z, r);
/* 2: for i = 0 to N - 1 do */
for (n = 1, i = 2; i < N; i += 2) {
/* 3: V_i <-- X */
blkcpy(&V[i * s], X, s);
if ((i & (i - 1)) == 0)
n <<= 1;
/* j <-- Wrap(Integerify(X), i) */
j = integerify(X, r) & (n - 1);
j += i - n;
/* X <-- X \xor V_j */
blkxor(X, &V[j * s], s);
/* 4: X <-- H(X) */
blockmix(X, Y, Z, r);
/* 3: V_i <-- X */
blkcpy(&V[(i + 1) * s], Y, s);
j = integerify(Y, r);
if (((i + 1) & VROM_mask) == 1) {
/* j <-- Integerify(X) mod NROM */
j &= NROM - 1;
/* X <-- H(X \xor VROM_j) */
blkxor(Y, &VROM[j * s], s);
} else {
/* j <-- Wrap(Integerify(X), i) */
j &= n - 1;
j += i + 1 - n;
/* X <-- H(X \xor V_j) */
blkxor(Y, &V[j * s], s);
}
blockmix(Y, X, Z, r);
}
} else {
yescrypt_flags_t rw = flags & YESCRYPT_RW;
/* 4: X <-- H(X) */
blockmix(Y, X, Z, r);
/* 2: for i = 0 to N - 1 do */
for (n = 1, i = 2; i < N; i += 2) {
/* 3: V_i <-- X */
blkcpy(&V[i * s], X, s);
if (rw) {
if ((i & (i - 1)) == 0)
n <<= 1;
/* j <-- Wrap(Integerify(X), i) */
j = integerify(X, r) & (n - 1);
j += i - n;
/* X <-- X \xor V_j */
blkxor(X, &V[j * s], s);
}
/* 4: X <-- H(X) */
blockmix(X, Y, Z, r);
/* 3: V_i <-- X */
blkcpy(&V[(i + 1) * s], Y, s);
if (rw) {
/* j <-- Wrap(Integerify(X), i) */
j = integerify(Y, r) & (n - 1);
j += (i + 1) - n;
/* X <-- X \xor V_j */
blkxor(Y, &V[j * s], s);
}
/* 4: X <-- H(X) */
blockmix(Y, X, Z, r);
}
}
/* B' <-- X */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8];
for (k = 0; k < 16; k++)
le32enc(&tmp->w[k], src->w[k]);
salsa20_simd_unshuffle(tmp, dst);
}
}
/**
* smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S):
* Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage XY must be 256r + 64 bytes in length. The value N must be a
* power of 2 greater than 1. The value Nloop must be even.
*/
static void
smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop,
yescrypt_flags_t flags,
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
uint64_t * XY, uint64_t * S)
{
void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) =
(S ? blockmix_pwxform : blockmix_salsa8);
const uint64_t * VROM = shared->shared1.aligned;
uint32_t VROM_mask = shared->mask1 | 1;
size_t s = 16 * r;
yescrypt_flags_t rw = flags & YESCRYPT_RW;
uint64_t * X = XY;
uint64_t * Y = &XY[s];
uint64_t * Z = S ? S : &XY[2 * s];
uint64_t i, j;
size_t k;
if (Nloop == 0)
return;
/* X <-- B' */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8];
for (k = 0; k < 16; k++)
tmp->w[k] = le32dec(&src->w[k]);
salsa20_simd_shuffle(tmp, dst);
}
if (NROM) {
/* 6: for i = 0 to N - 1 do */
for (i = 0; i < Nloop; i += 2) {
/* 7: j <-- Integerify(X) mod N */
j = integerify(X, r) & (N - 1);
/* 8: X <-- H(X \xor V_j) */
blkxor(X, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], X, s);
blockmix(X, Y, Z, r);
j = integerify(Y, r);
if (((i + 1) & VROM_mask) == 1) {
/* j <-- Integerify(X) mod NROM */
j &= NROM - 1;
/* X <-- H(X \xor VROM_j) */
blkxor(Y, &VROM[j * s], s);
} else {
/* 7: j <-- Integerify(X) mod N */
j &= N - 1;
/* 8: X <-- H(X \xor V_j) */
blkxor(Y, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], Y, s);
}
blockmix(Y, X, Z, r);
}
} else {
/* 6: for i = 0 to N - 1 do */
i = Nloop / 2;
do {
/* 7: j <-- Integerify(X) mod N */
j = integerify(X, r) & (N - 1);
/* 8: X <-- H(X \xor V_j) */
blkxor(X, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], X, s);
blockmix(X, Y, Z, r);
/* 7: j <-- Integerify(X) mod N */
j = integerify(Y, r) & (N - 1);
/* 8: X <-- H(X \xor V_j) */
blkxor(Y, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], Y, s);
blockmix(Y, X, Z, r);
} while (--i);
}
/* 10: B' <-- X */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8];
for (k = 0; k < 16; k++)
le32enc(&tmp->w[k], src->w[k]);
salsa20_simd_unshuffle(tmp, dst);
}
}
/**
* p2floor(x):
* Largest power of 2 not greater than argument.
*/
static uint64_t
p2floor(uint64_t x)
{
uint64_t y;
while ((y = x & (x - 1)))
x = y;
return x;
}
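/* e.g. p2floor(12) == 8, p2floor(8) == 8, p2floor(1) == 1 */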
/**
* smix(B, r, N, p, t, flags, V, NROM, shared, XY, S):
* Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the
* temporary storage V must be 128rN bytes in length; the temporary storage
* XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is
* required with OpenMP-enabled builds). The value N must be a power of 2
* greater than 1.
*/
static void
smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t,
yescrypt_flags_t flags,
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
uint64_t * XY, uint64_t * S)
{
size_t s = 16 * r;
uint64_t Nchunk = N / p, Nloop_all, Nloop_rw;
uint32_t i;
Nloop_all = Nchunk;
if (flags & YESCRYPT_RW) {
if (t <= 1) {
if (t)
Nloop_all *= 2; /* 2/3 */
Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */
} else {
Nloop_all *= t - 1;
}
} else if (t) {
if (t == 1)
Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */
Nloop_all *= t;
}
Nloop_rw = 0;
if (flags & __YESCRYPT_INIT_SHARED)
Nloop_rw = Nloop_all;
else if (flags & YESCRYPT_RW)
Nloop_rw = Nloop_all / p;
Nchunk &= ~(uint64_t)1; /* round down to even */
Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */
Nloop_rw &= ~(uint64_t)1; /* round down to even */
#ifdef _OPENMP
#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw)
{
#pragma omp for
#endif
for (i = 0; i < p; i++) {
uint64_t Vchunk = i * Nchunk;
uint64_t * Bp = &B[i * s];
uint64_t * Vp = &V[Vchunk * s];
#ifdef _OPENMP
uint64_t * XYp = &XY[i * (2 * s + 8)];
#else
uint64_t * XYp = XY;
#endif
uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk);
uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S;
if (Sp)
smix1(Bp, 1, S_SIZE_ALL / 16,
flags & ~YESCRYPT_PWXFORM,
Sp, NROM, shared, XYp, NULL);
if (!(flags & __YESCRYPT_INIT_SHARED_2))
smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp);
smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp,
NROM, shared, XYp, Sp);
}
if (Nloop_all > Nloop_rw) {
#ifdef _OPENMP
#pragma omp for
#endif
for (i = 0; i < p; i++) {
uint64_t * Bp = &B[i * s];
#ifdef _OPENMP
uint64_t * XYp = &XY[i * (2 * s + 8)];
#else
uint64_t * XYp = XY;
#endif
uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S;
smix2(Bp, r, N, Nloop_all - Nloop_rw,
flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp);
}
}
#ifdef _OPENMP
}
#endif
}
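/* The first loop above runs each thread's read-write iterations (Nloop_rw)
   on its own chunk of V; the second finishes the remaining iterations
   read-only across all of V, which is what keeps YESCRYPT_PARALLEL_SMIX
   safe to run concurrently. */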
/**
* yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
* N, r, p, t, flags, buf, buflen):
* Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
* p, buflen), or a revision of scrypt as requested by flags and shared, and
* write the result into buf. The parameters r, p, and buflen must satisfy
* r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power
* of 2 greater than 1.
*
* t controls computation time while not affecting peak memory usage. shared
* and flags may request special modes as described in yescrypt.h. local is
* the thread-local data structure, allowing a memory allocation to be
* preserved and reused across calls, thereby reducing its overhead.
*
* Return 0 on success; or -1 on error.
*/
int
yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen,
uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags,
uint8_t * buf, size_t buflen)
{
yescrypt_region_t tmp;
uint64_t NROM;
size_t B_size, V_size, XY_size, need;
uint64_t * B, * V, * XY, * S;
uint64_t sha256[4];
/*
* YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose,
* so don't let it have side-effects. Without this adjustment, it'd
* enable the SHA-256 password pre-hashing and output post-hashing,
* because any deviation from classic scrypt implies those.
*/
if (p == 1)
flags &= ~YESCRYPT_PARALLEL_SMIX;
/* Sanity-check parameters */
if (flags & ~YESCRYPT_KNOWN_FLAGS) {
errno = EINVAL;
return -1;
}
#if SIZE_MAX > UINT32_MAX
if (buflen > (((uint64_t)(1) << 32) - 1) * 32) {
errno = EFBIG;
return -1;
}
#endif
if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) {
errno = EFBIG;
return -1;
}
if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) {
errno = EINVAL;
return -1;
}
if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) {
errno = EINVAL;
return -1;
}
#if S_MIN_R > 1
if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) {
errno = EINVAL;
return -1;
}
#endif
if ((p > SIZE_MAX / ((size_t)256 * r + 64)) ||
#if SIZE_MAX / 256 <= UINT32_MAX
(r > SIZE_MAX / 256) ||
#endif
(N > SIZE_MAX / 128 / r)) {
errno = ENOMEM;
return -1;
}
if (N > UINT64_MAX / ((uint64_t)t + 1)) {
errno = EFBIG;
return -1;
}
#ifdef _OPENMP
if (!(flags & YESCRYPT_PARALLEL_SMIX) &&
(N > SIZE_MAX / 128 / (r * p))) {
errno = ENOMEM;
return -1;
}
#endif
if ((flags & YESCRYPT_PWXFORM) &&
#ifndef _OPENMP
(flags & YESCRYPT_PARALLEL_SMIX) &&
#endif
p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) {
errno = ENOMEM;
return -1;
}
NROM = 0;
if (shared->shared1.aligned) {
NROM = shared->shared1.aligned_size / ((size_t)128 * r);
if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) ||
!(flags & YESCRYPT_RW)) {
errno = EINVAL;
return -1;
}
}
/* Allocate memory */
V = NULL;
V_size = (size_t)128 * r * N;
#ifdef _OPENMP
if (!(flags & YESCRYPT_PARALLEL_SMIX))
V_size *= p;
#endif
need = V_size;
if (flags & __YESCRYPT_INIT_SHARED) {
if (local->aligned_size < need) {
if (local->base || local->aligned ||
local->base_size || local->aligned_size) {
errno = EINVAL;
return -1;
}
if (!alloc_region(local, need))
return -1;
}
V = (uint64_t *)local->aligned;
need = 0;
}
B_size = (size_t)128 * r * p;
need += B_size;
if (need < B_size) {
errno = ENOMEM;
return -1;
}
XY_size = (size_t)256 * r + 64;
#ifdef _OPENMP
XY_size *= p;
#endif
need += XY_size;
if (need < XY_size) {
errno = ENOMEM;
return -1;
}
if (flags & YESCRYPT_PWXFORM) {
size_t S_size = S_SIZE_ALL * sizeof(*S);
#ifdef _OPENMP
S_size *= p;
#else
if (flags & YESCRYPT_PARALLEL_SMIX)
S_size *= p;
#endif
need += S_size;
if (need < S_size) {
errno = ENOMEM;
return -1;
}
}
if (flags & __YESCRYPT_INIT_SHARED) {
if (!alloc_region(&tmp, need))
return -1;
B = (uint64_t *)tmp.aligned;
XY = (uint64_t *)((uint8_t *)B + B_size);
} else {
init_region(&tmp);
if (local->aligned_size < need) {
if (free_region(local))
return -1;
if (!alloc_region(local, need))
return -1;
}
B = (uint64_t *)local->aligned;
V = (uint64_t *)((uint8_t *)B + B_size);
XY = (uint64_t *)((uint8_t *)V + V_size);
}
S = NULL;
if (flags & YESCRYPT_PWXFORM)
S = (uint64_t *)((uint8_t *)XY + XY_size);
if (t || flags) {
SHA256_CTX_Y ctx;
SHA256_Init_Y(&ctx);
SHA256_Update_Y(&ctx, passwd, passwdlen);
SHA256_Final_Y((uint8_t *)sha256, &ctx);
passwd = (uint8_t *)sha256;
passwdlen = sizeof(sha256);
}
/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1,
(uint8_t *)B, B_size);
if (t || flags)
blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0]));
if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) {
smix(B, r, N, p, t, flags, V, NROM, shared, XY, S);
} else {
uint32_t i;
/* 2: for i = 0 to p - 1 do */
#ifdef _OPENMP
#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S)
#endif
for (i = 0; i < p; i++) {
/* 3: B_i <-- MF(B_i, N) */
#ifdef _OPENMP
smix(&B[(size_t)16 * r * i], r, N, 1, t, flags,
&V[(size_t)16 * r * i * N],
NROM, shared,
&XY[((size_t)32 * r + 8) * i],
S ? &S[S_SIZE_ALL * i] : S);
#else
smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V,
NROM, shared, XY, S);
#endif
}
}
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen);
/*
* Except when computing classic scrypt, allow all computation so far
* to be performed on the client. The final steps below match those of
* SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so
* far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of
* SCRAM's use of SHA-1) would be usable with yescrypt hashes.
*/
if ((t || flags) && buflen == sizeof(sha256)) {
/* Compute ClientKey */
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init(&ctx, buf, buflen);
HMAC_SHA256_Update(&ctx, salt, saltlen);
HMAC_SHA256_Final((uint8_t *)sha256, &ctx);
}
/* Compute StoredKey */
{
SHA256_CTX_Y ctx;
SHA256_Init_Y(&ctx);
SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256));
SHA256_Final_Y(buf, &ctx);
}
}
if (free_region(&tmp))
return -1;
/* Success! */
return 0;
}
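/* A minimal usage sketch (hypothetical, not part of this file; it assumes
   the yescrypt_init_local()/yescrypt_free_local() helpers and YESCRYPT_RW
   flag declared in yescrypt.h):

      yescrypt_shared_t shared = { 0 };   // no ROM
      yescrypt_local_t local;
      uint8_t dk[32];
      if ( yescrypt_init_local( &local ) == 0 &&
           yescrypt_kdf( &shared, &local,
                         (const uint8_t*)"password", 8,
                         (const uint8_t*)"salt", 4,
                         4096, 16, 1, 0, YESCRYPT_RW,
                         dk, sizeof(dk) ) == 0 )
      {
          // dk now holds the 32-byte derived key
      }
      yescrypt_free_local( &local );
*/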

algo/yescrypt/yescrypt.c

@@ -426,7 +426,7 @@ int64_t yescryptr16_get_max64()
bool register_yescrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;
@@ -440,7 +440,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT;
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;

avxdefs.h

@@ -1,71 +1,96 @@
#ifndef AVXDEFS_H__
#define AVXDEFS_H__
// Some tools to help using AVX and AVX2.
// At this time SSE2 is sufficient for all 128 bit code in this file
// but could change without notice.
// 256 bit requires AVX2.
// AVX512 has more powerful 256 bit instructions but with AVX512 available
// there is little reason to use them.
// Proper alignment of data is required, 16 bytes for 128 bit vectors and
// 32 bytes for 256 bit vectors. 64 byte alignment is recommended for
// best cache alignment.
//
// There exist duplicates of some functions. In general the first defined
// is preferred as it is more efficient, but it is also more restrictive
// and may not always be applicable. The less efficient versions are more
// flexible.
#include <inttypes.h>
#include <immintrin.h>
#include <memory.h>
#include <stdbool.h>
//
// 128 bit utilities and shortcuts
//
// Pseudo constants, there are no real vector constants.
// These can't be used for compile time initialization.
// Constant zero
#define mm_zero    _mm_setzero_si128()
// Constant 1
#define mm_one_128 _mm_set_epi64x( 0ULL, 1ULL )
#define mm_one_64 _mm_set1_epi64x( 1ULL )
#define mm_one_32 _mm_set1_epi32( 1UL )
#define mm_one_16 _mm_set1_epi16( 1U )
// Constant minus 1
#define mm_neg1    _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
//
// Basic operations without equivalent SIMD intrinsic
// Bitwise not (~x)
#define mm_not( x )  _mm_xor_si128( (x), mm_neg1 )
// Unary negation (-a)
#define mm_negate_64( a ) _mm_sub_epi64( mm_zero, a )
#define mm_negate_32( a ) _mm_sub_epi32( mm_zero, a )
#define mm_negate_16( a ) _mm_sub_epi16( mm_zero, a )
//
// Bit operations, functional but not very efficient
// Bit operations
// Return x with bit n set/clear in all elements
#define mm_bitset_128( x, n ) \
_mm_or_si128( _mm_slli_si128( _mm_set_epi64x( 0ULL, 1ULL ), n ) )
#define mm_bitclr_128( x, n ) \
_mm_and_si128( x, mm_not( _mm_slli_si128( \
_mm_set_epi64x( 0ULL, 1ULL ), n ) ) )
#define mm_bitset_64( x, n ) \
_mm_or_si128( _mm_slli_epi64( _mm_set1_epi64x( 1ULL ), n ) )
#define mm_bitclr_64( x, n ) \
_mm_and_si128( x, mm_not( _mm_slli_epi64( _mm_set1_epi64x( 1ULL ), n ) ) )
#define mm_bitset_32( x, n ) \
_mm_or_si128( _mm_slli_epi32( _mm_set1_epi32( 1UL ), n ) )
#define mm_bitclr_32( x, n ) \
_mm_and_si128( x, mm_not( _mm_slli_epi32( _mm_set1_epi32( 1UL ), n ) ) )
#define mm_bitset_16( x, n ) \
_mm_or_si128( _mm_slli_epi16( _mm_set1_epi16( 1U ), n ) )
#define mm_bitclr_16( x, n ) \
_mm_and_si128( x, mm_not( _mm_slli_epi16( _mm_set1_epi16( 1U ), n ) ) )
// return vector of bool
#define mm_bittest_128( x, n ) \
_mm_and_si256( _mm_srli_si128( x, n ), _mm_set_epi64x( 0ULL, 1ULL ) )
// Return bit n in position, all other bits zeroed.
#define mm_bitextract_64( x, n ) \
   _mm_and_si128( _mm_set1_epi64x( 1ULL << (n) ), x )
#define mm_bitextract_32( x, n ) \
   _mm_and_si128( _mm_set1_epi32( 1UL << (n) ), x )
#define mm_bitextract_16( x, n ) \
   _mm_and_si128( _mm_set1_epi16( 1U << (n) ), x )
// Return bit n as bool
#define mm_bittest_64( x, n ) \
_mm_and_si256( _mm_srli_epi64( x, n ), _mm_set1_epi64x( 1ULL ) )
_mm_and_si128( mm_one_64, _mm_srli_epi64( x, n ) )
#define mm_bittest_32( x, n ) \
_mm_and_si256( _mm_srli_epi32( x, n ), _mm_set1_epi32( 1UL ) )
_mm_and_si128( mm_one_32, _mm_srli_epi32( x, n ) )
#define mm_bittest_16( x, n ) \
_mm_and_si256( _mm_srli_epi16( x, n ), _mm_set1_epi16( 1U ) )
_mm_and_si128( mm_one_16, _mm_srli_epi16( x, n ) )
// Return x with bit n set/cleared in all elements
#define mm_bitset_64( x, n ) \
_mm_or_si128( _mm_slli_epi64( mm_one_64, n ), x )
#define mm_bitclr_64( x, n ) \
_mm_andnot_si128( _mm_slli_epi64( mm_one_64, n ), x )
#define mm_bitset_32( x, n ) \
_mm_or_si128( _mm_slli_epi32( mm_one_32, n ), x )
#define mm_bitclr_32( x, n ) \
_mm_andnot_si128( _mm_slli_epi32( mm_one_32, n ), x )
#define mm_bitset_16( x, n ) \
_mm_or_si128( _mm_slli_epi16( mm_one_16, n ), x )
#define mm_bitclr_16( x, n ) \
_mm_andnot_si128( _mm_slli_epi16( mm_one_16, n ), x )
// Return x with bit n toggled
#define mm_bitflip_64( x, n ) \
_mm_xor_si128( _mm_slli_epi64( mm_one_64, n ), x )
#define mm_bitflip_32( x, n ) \
_mm_xor_si128( _mm_slli_epi32( mm_one_32, n ), x )
#define mm_bitflip_16( x, n ) \
_mm_xor_si128( _mm_slli_epi16( mm_one_16, n ), x )
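// A usage sketch of the bit operations (hypothetical helper), values noted
// per 64 bit lane:
static inline __m128i mm_bitops_demo()
{
   __m128i x = mm_zero;
   x = mm_bitset_64( x, 5 );            // each lane: 0x20
   __m128i t = mm_bittest_64( x, 5 );   // each lane: 1, bit was set
   x = mm_bitflip_64( x, 5 );           // each lane: 0 again
   return _mm_or_si128( x, t );
}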
//
// Memory functions
@@ -86,13 +111,33 @@ inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
for ( int i = 0; i < n; i ++ ) dst[i] = src[i];
}
// Scalar 64 bit copy, n = bytes/8
inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n )
// Compare data in memory, return true if different
inline bool memcmp_128( __m128i src1, __m128i src2, int n )
{
for ( int i = 0; i < n; i++ )
dst[i] = src[i];
if ( src1[i] != src2[i] ) return true;
return false;
}
// A couple of 64 bit scalar functions
// n = bytes/8
inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n )
{
for ( int i = 0; i < n; i++ ) dst[i] = src[i];
}
inline void memset_zero_64( uint64_t *src, int n )
{
for ( int i = 0; i < n; i++ ) src[i] = 0;
}
inline void memset_64( uint64_t *dst, uint64_t a, int n )
{
for ( int i = 0; i < n; i++ ) dst[i] = a;
}
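// For example, copying an 80 byte block header with the 64 bit helpers,
// n = bytes/8 (illustrative sketch, hypothetical buffers):
static inline void mm_scalar_demo()
{
   uint64_t hdr[10] = {0};          // 80 bytes
   uint64_t work[10];
   memcpy_64( work, hdr, 10 );      // copy the header
   memset_zero_64( work, 10 );      // clear it again
}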
//
// Pointer cast
@@ -108,149 +153,136 @@ inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n )
// returns p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
//
// Bit rotations
// XOP is an obsolete AMD feature that has native rotation.
// _mm_roti_epi64( w, c)
// Never implemented by Intel and since removed from Zen by AMD.
// Rotate bits in vector elements
#define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \
_mm_slli_epi64( w, 64-c ) )
_mm_slli_epi64( w, 64-(c) ) )
#define mm_rotl_64( w, c ) _mm_or_si128( _mm_slli_epi64( w, c ), \
_mm_srli_epi64( w, 64-c ) )
_mm_srli_epi64( w, 64-(c) ) )
#define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \
_mm_slli_epi32( w, 32-c ) )
_mm_slli_epi32( w, 32-(c) ) )
#define mm_rotl_32( w, c ) _mm_or_si128( _mm_slli_epi32( w, c ), \
_mm_srli_epi32( w, 32-c ) )
_mm_srli_epi32( w, 32-(c) ) )
#define mm_rotr_16( w, c ) _mm_or_si128( _mm_srli_epi16( w, c ), \
_mm_slli_epi16( w, 16-c ) )
_mm_slli_epi16( w, 16-(c) ) )
#define mm_rotl_16( w, c ) _mm_or_si128( _mm_slli_epi16( w, c ), \
_mm_srli_epi16( w, 16-c ) )
_mm_srli_epi16( w, 16-(c) ) )
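// Each rotate is the usual or of two shifts; a quick sketch (hypothetical
// helper) showing mm_rotr_64 matches the scalar rotation it implements:
static inline __m128i mm_rot_demo()
{
   uint64_t a = 0x0123456789abcdefULL;
   __m128i v = _mm_set1_epi64x( a );
   return mm_rotr_64( v, 8 );       // each lane == (a >> 8) | (a << 56)
}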
//
// Shuffle vector elements
// Rotate elements in vector
// Swap upper and lower 64 bits of 128 bit source vector
#define mm_swap_64(s) _mm_shuffle_epi32( s, 0x4e )
// Optimized shuffle
// Rotate 128 bit vector by one 32 bit element.
// Swap hi/lo 64 bits in 128 bit vector
#define mm_swap_64( w ) _mm_shuffle_epi32( w, 0x4e )
// rotate 128 bit vector by 32 bits
#define mm_rotr_1x32( w ) _mm_shuffle_epi32( w, 0x39 )
#define mm_rotl_1x32( w ) _mm_shuffle_epi32( w, 0x93 )
// Shuffle elements across two 128 bit vectors
// Swap hi/lo 32 bits in each 64 bit element
#define mm_swap64_32( x ) _mm_shuffle_epi32( x, 0xb1 )
// Swap 128 bit source vectors in place.
// Less efficient but more versatile. Use only for odd number rotations.
// Use shuffle above when possible.
// Rotate vector by n bytes.
#define mm_rotr128_x8( w, n ) \
_mm_or_si128( _mm_srli_si128( w, n ), _mm_slli_si128( w, 16-(n) ) )
#define mm_rotl128_x8( w, n ) \
_mm_or_si128( _mm_slli_si128( w, n ), _mm_srli_si128( w, 16-(n) ) )
// Rotate vector by c elements, use only for odd number rotations
#define mm_rotr128_x32( w, c ) mm_rotr128_x8( w, (c)<<2 )
#define mm_rotl128_x32( w, c ) mm_rotl128_x8( w, (c)<<2 )
#define mm_rotr128_x16( w, c ) mm_rotr128_x8( w, (c)<<1 )
#define mm_rotl128_x16( w, c ) mm_rotl128_x8( w, (c)<<1 )
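// A sketch of the element rotations (hypothetical helper), lanes written
// low to high:
static inline __m128i mm_rot_elements_demo()
{
   __m128i w = _mm_set_epi32( 3, 2, 1, 0 );   // lanes {0,1,2,3}
   w = mm_rotr_1x32( w );                     // lanes {1,2,3,0}
   w = mm_swap_64( w );                       // lanes {3,0,1,2}
   return w;
}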
//
// Rotate elements across two 128 bit vectors as one 256 bit vector {hi,lo}
// Swap 128 bit source vectors in place, aka rotate 256 bits by 128 bits.
// void mm128_swap128( __m128i, __m128i )
#define mm_swap_128(hi, lo) hi = _mm_xor_si128(hi, lo); \
lo = _mm_xor_si128(hi, lo); \
hi = _mm_xor_si128(hi, lo);
// Rotate two 128 bit vectors in place as one 256 vector by 1 element
#define mm_rotl256_1x64( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
s0 = t; \
} while(0)
#define mm_rotr256_1x64( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
s0 = t; \
} while(0)
#define mm_rotl256_1x32( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
s0 = t; \
} while(0)
#define mm_rotr256_1x32( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
s0 = t; \
} while(0)
// Older slower
#define mm_rotl256_1x64x( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_or_si128( \
_mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
s1 = _mm_or_si128( \
_mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
s0 = t; \
} while(0)
#define mm_rotr256_1x64x( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ) ; \
s1 = mm_swap_64( s1 ); \
t = _mm_or_si128( \
_mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
s1 = _mm_or_si128( \
_mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
s0 = t; \
} while(0)
// need a better name, not rot, poke? step?
// Return s0 with elements shifted right/left and low/high element from
// s1 shifted into the vacated high/low element of s0.
// Partially rotate elements in two 128 bit vectors as one 256 bit vector
// and return the rotated s0.
// Similar to mm_rotr256_1x32 but only a partial rotation as s1 is not
// completed. It's faster than a full rotation.
inline __m128i mm_rotr256_32( __m128i s0, __m128i s1, int n )
{
return _mm_or_si128( _mm_srli_si128( s0, n<<2 ),
_mm_slli_si128( s1, 16 - (n<<2) ) );
#define mm_swap_128(hi, lo) \
{ \
hi = _mm_xor_si128(hi, lo); \
lo = _mm_xor_si128(hi, lo); \
hi = _mm_xor_si128(hi, lo); \
}
inline __m128i mm_rotl256_32( __m128i s0, __m128i s1, int n )
// Rotate two 128 bit vectors in place as one 256 vector by 1 element
#define mm_rotl256_1x64( hi, lo ) \
do { \
__m128i t; \
hi = mm_swap_64( hi ); \
lo = mm_swap_64( lo ); \
t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
hi = t; \
} while(0)
#define mm_rotr256_1x64( hi, lo ) \
do { \
__m128i t; \
hi = mm_swap_64( hi ); \
lo = mm_swap_64( lo ); \
t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
hi = t; \
} while(0)
#define mm_rotl256_1x32( hi, lo ) \
do { \
__m128i t; \
hi = mm_swap_64( hi ); \
lo = mm_swap_64( lo ); \
t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
hi = t; \
} while(0)
#define mm_rotr256_1x32( hi, lo ) \
do { \
__m128i t; \
hi = mm_swap_64( hi ); \
lo = mm_swap_64( lo ); \
t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
hi = t; \
} while(0)
// Return hi 128 bits with elements shifted one lane with vacated lane filled
// with data rotated from lo.
// Partially rotate elements in two 128 bit vectors as one 256 bit vector
// and return the rotated high 128 bits.
// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not
// completed. It's faster than a full rotation.
inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n )
{
return _mm_or_si128( _mm_slli_si128( s0, n<<2 ),
_mm_srli_si128( s1, 16 - (n<<2) ) );
return _mm_or_si128( _mm_srli_si128( hi, n<<2 ),
_mm_slli_si128( lo, 16 - (n<<2) ) );
}
inline __m128i mm_rotl256hi_1x32( __m128i hi, __m128i lo, int n )
{
return _mm_or_si128( _mm_slli_si128( hi, n<<2 ),
_mm_srli_si128( lo, 16 - (n<<2) ) );
}
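// A sketch of the partial rotation with n = 1: hi's elements move down one
// lane and lo's low element fills the vacated high lane (hypothetical
// helper name).
static inline __m128i mm_rotr256hi_demo()
{
   __m128i hi = _mm_set_epi32( 7, 6, 5, 4 );
   __m128i lo = _mm_set_epi32( 3, 2, 1, 0 );
   return mm_rotr256hi_1x32( hi, lo, 1 );     // lanes {5,6,7,0} low to high
}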
//
// Swap bytes in vector elements
inline __m128i mm_byteswap_32( __m128i x )
{
return _mm_shuffle_epi8( x, _mm_set_epi8(
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
inline __m128i mm_byteswap_64( __m128i x )
{
return _mm_shuffle_epi8( x, _mm_set_epi8(
@@ -258,96 +290,95 @@ inline __m128i mm_byteswap_64( __m128i x )
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
}
// older slower
inline __m128i mm_byteswap_32x( __m128i x )
inline __m128i mm_byteswap_32( __m128i x )
{
__m128i x1 = _mm_and_si128( x, _mm_set1_epi32( 0x0000ff00 ) );
__m128i x2 = _mm_and_si128( x, _mm_set1_epi32( 0x00ff0000 ) );
__m128i x0 = _mm_slli_epi32( x, 24 ); // x0 = x << 24
x1 = _mm_slli_epi32( x1, 8 ); // x1 = mask(x) << 8
x2 = _mm_srli_epi32( x2, 8 ); // x2 = mask(x) >> 8
__m128i x3 = _mm_srli_epi32( x, 24 ); // x3 = x >> 24
return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) );
return _mm_shuffle_epi8( x, _mm_set_epi8(
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
inline __m128i mm_byteswap_64x( __m128i x )
inline __m128i mm_byteswap_16( __m128i x )
{
x = _mm_or_si128( _mm_srli_epi64( x, 32 ), _mm_slli_epi64( x, 32 ));
x = _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ),
_mm_slli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 ));
return _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ),
_mm_slli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 ));
return _mm_shuffle_epi8( x, _mm_set_epi8(
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}
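// Note _mm_shuffle_epi8 requires SSSE3. Typical use is endian conversion,
// e.g. byte swapping four big endian 32 bit words at once (illustrative
// sketch):
static inline __m128i mm_byteswap_demo()
{
   uint32_t data[4] __attribute__ ((aligned (16))) =
      { 0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10 };
   __m128i be = casti_m128i( data, 0 );   // load 16 bytes
   return mm_byteswap_32( be );           // lane 0 becomes 0x04030201, etc.
}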
/////////////////////////////////////////////////////////////////////
#if defined (__AVX2__)
//
// 256 bit utilities and Shortcuts
//
// Pseudo constants, there are no real vector constants.
// These can't be used for compile time initialization
// Constant zero
#define mm256_zero _mm256_setzero_si256()
// Constant 1
#define mm256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL )
#define mm256_one_64 _mm256_set1_epi64x( 1ULL )
#define mm256_one_32 _mm256_set1_epi32( 1UL )
#define mm256_one_16 _mm256_set1_epi16( 1U )
// Constant minus 1
#define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFF )
#define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
//
// Basic operations without SIMD equivalent
// Bitwise not ( ~x )
#define mm256_not( x ) _mm256_xor_si256( (x), mm256_neg1 ) \
#define mm256_not( x )       _mm256_xor_si256( (x), mm256_neg1 )
// Unary negation ( -a )
#define mm256_negate_64( a ) _mm256_sub_epi64( mm256_zero, a )
#define mm256_negate_32( a ) _mm256_sub_epi32( mm256_zero, a )
#define mm256_negate_16( a ) _mm256_sub_epi16( mm256_zero, a )
//
// Bit operations
// Return x with bit n set/clear in all elements
#define mm256_bitset_128( x, n ) \
_mm256_or_si256( _mm256_slli_si256( _mm256_set_m128i( 1U, 1U ), n ) )
#define mm256_bitclr_128( x, n ) \
_mm256_and_si256( x, mm256_not( \
_mm256_slli_si256( _mm256_set_m128i( 1U, 1U ), n ) ) )
#define mm256_bitset_64( x, n ) \
_mm256_or_si256( x, _mm256_set1_epi64x( 1ULL << n ) )
#define mm256_bitclr_64( x, n ) \
_mm256_and_si256( x, mm256_not( _mm256_set1_epi64x( 1ULL << n ) ) )
#define mm256_bitset_32( x, n ) \
_mm256_or_si256( x, _mm256_set1_epi32( 1UL << n ) )
#define mm256_bitclr_32( x, n ) \
_mm256_and_si256( x, mm256_not( _mm256_set1_epi32( 1UL << n ) ) )
#define mm256_bitset_16( x, n ) \
_mm256_or_si256( x, _mm256_set1_epi16( 1U << n ) )
#define mm256_bitclr_16( x, n ) \
_mm256_and_si256( x, mm256_not( _mm256_set1_epi16( 1U << n ) ) )
// return vector of bool
#define mm256_bittest_128( x, n ) \
_mm256_and_si256( _mm256_srli_si256( x, n ), \
_mm256_set_m128i( _mm_set_epi64x( 0ULL, 1ULL ) ) )
// return bit n in position, all other bits cleared
#define mm256_bitextract_64( x, n ) \
   _mm256_and_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
#define mm256_bitextract_32( x, n ) \
   _mm256_and_si256( _mm256_set1_epi32( 1UL << (n) ), x )
#define mm256_bitextract_16( x, n ) \
   _mm256_and_si256( _mm256_set1_epi16( 1U << (n) ), x )
// Return bit n as bool (bit 0)
#define mm256_bittest_64( x, n ) \
_mm256_and_si256( _mm256_srli_epi64( x, n ), \
_mm256_set1_epi64x( 1ULL << n ) )
_mm256_and_si256( mm256_one_64, _mm256_srli_epi64( x, n ) )
#define mm256_bittest_32( x, n ) \
_mm256_and_si256( _mm256_srli_epi32( x, n ), \
_mm256_set1_epi32( 1UL << n ) )
_mm256_and_si256( mm256_one_32, _mm256_srli_epi32( x, n ) )
#define mm256_bittest_16( x, n ) \
_mm256_and_si256( _mm256_srli_epi16( x, n ), \
_mm256_set1_epi16( 1U << n ) )
_mm256_and_si256( mm256_one_16, _mm256_srli_epi16( x, n ) )
// Return x with bit n set/cleared in all elements
#define mm256_bitset_64( x, n ) \
_mm256_or_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
#define mm256_bitclr_64( x, n ) \
_mm256_andnot_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
#define mm256_bitset_32( x, n ) \
_mm256_or_si256( _mm256_set1_epi32( 1UL << (n) ), x )
#define mm256_bitclr_32( x, n ) \
_mm256_andnot_si256( _mm256_set1_epi32( 1UL << (n) ), x )
#define mm256_bitset_16( x, n ) \
_mm256_or_si256( _mm256_set1_epi16( 1U << (n) ), x )
#define mm256_bitclr_16( x, n ) \
_mm256_andnot_si256( _mm256_set1_epi16( 1U << (n) ), x )
// Return x with bit n toggled
#define mm256_bitflip_64( x, n ) \
   _mm256_xor_si256( _mm256_slli_epi64( mm256_one_64, n ), x )
#define mm256_bitflip_32( x, n ) \
   _mm256_xor_si256( _mm256_slli_epi32( mm256_one_32, n ), x )
#define mm256_bitflip_16( x, n ) \
   _mm256_xor_si256( _mm256_slli_epi16( mm256_one_16, n ), x )
//
// Memory functions
@@ -368,6 +399,14 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
for ( int i = 0; i < n; i ++ ) dst[i] = src[i];
}
// Compare data in memory, return true if different
inline bool memcmp_256( __m256i src1, __m256i src2, int n )
{
for ( int i = 0; i < n; i++ )
if ( src1[i] != src2[i] ) return true;
return false;
}
//
// Pointer casting
@@ -383,39 +422,128 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
// returns p[i]
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
//
// Bit rotations
//
// Rotate bits in vector elements
// w = packed data, c = number of bits to rotate
// Rotate bits in 64 bit elements
// w = packed 64 bit data, c = number of bits to rotate
#define mm256_rotr_64( w, c ) \
_mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )
_mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64-(c)) )
#define mm256_rotl_64( w, c ) \
_mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) )
// Rotate bits in 32 bit elements
_mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64-(c)) )
#define mm256_rotr_32( w, c ) \
_mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32 - c) )
_mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32-(c)) )
#define mm256_rotl_32( w, c ) \
_mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32 - c) )
_mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32-(c)) )
#define mm256_rotr_16( w, c ) \
   _mm256_or_si256( _mm256_srli_epi16(w, c), _mm256_slli_epi16(w, 16-(c)) )
#define mm256_rotl_16( w, c ) \
   _mm256_or_si256( _mm256_slli_epi16(w, c), _mm256_srli_epi16(w, 16-(c)) )
//
// Rotate elements in vector
// There is no full vector permute for elements less than 64 bits and no
// full 256 bit shift, so a little more work is needed.
// Swap 128 bit elements (aka rotate by two 64 bit or four 32 bit elements)
// Identical functionality but "f" is AVX and "x" is AVX2, likely faster.
#define mm256_swap_128( w ) _mm256_permute2x128_si256( w, w, 1 )
//#define mm256_swap_128( w ) _mm256_permute2f128_si256( w, w, 1 )
// Optimized 64 bit permutations
// Swap 128, aka rotate 2x64, 4x32, 8x16, 16x8
#define mm256_swap_128( w ) _mm256_permute4x64_epi64( w, 0x4e )
//#define mm256_swap_128( w ) _mm256_permute2x128_si256( w, w, 1 )
// Rotate vector by one 64 bit element (aka two 32 bit elements)
//__m256i mm256_rotl256_1x64( _mm256i, int )
// Rotate 256 bit vector by one 64 bit element, aka 2x32, 4x16, 8x8
#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 )
#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 )
// Rotate by one 32 bit element (aka two 16 bit elements)
#define mm256_rotl256_1x32( w ) _mm256_shuffle_epi32( w, 0x93 )
#define mm256_rotr256_1x32( w ) _mm256_shuffle_epi32( w, 0x39 )
// Swap hi/lo 64 bits in each 128 bit element
#define mm256_swap128_64( x ) _mm256_shuffle_epi32( x, 0x4e )
// Rotate 128 bit elements by 32 bits
#define mm256_rotr128_1x32( x ) _mm256_shuffle_epi32( x, 0x39 )
#define mm256_rotl128_1x32( x ) _mm256_shuffle_epi32( x, 0x93 )
// Swap hi/lo 32 bits in each 64 bit element
#define mm256_swap64_32( x ) _mm256_shuffle_epi32( x, 0xb1 )
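// A sketch of the optimized permutations (hypothetical helper), lanes
// written low to high:
static inline __m256i mm256_permute_demo()
{
   __m256i v = _mm256_set_epi64x( 3, 2, 1, 0 );   // lanes {0,1,2,3}
   v = mm256_rotr256_1x64( v );                   // lanes {1,2,3,0}
   v = mm256_swap_128( v );                       // lanes {3,0,1,2}
   return v;
}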
// Less efficient but more versatile. Use only for rotations that are not
// integral multiples of 64 bits. Use the permutations above when possible.
// Rotate 256 bit vector by c bytes.
#define mm256_rotr256_x8( w, c ) \
   _mm256_or_si256( _mm256_srli_si256( w, c ), \
                    mm256_swap_128( _mm256_slli_si256( w, 32-(c) ) ) )
#define mm256_rotl256_x8( w, c ) \
   _mm256_or_si256( _mm256_slli_si256( w, c ), \
                    mm256_swap_128( _mm256_srli_si256( w, 32-(c) ) ) )
// Rotate 256 bit vector by c elements, use only for odd value rotations
#define mm256_rotr256_x32( w, c ) mm256_rotr256_x8( w, (c)<<2 )
#define mm256_rotl256_x32( w, c ) mm256_rotl256_x8( w, (c)<<2 )
#define mm256_rotr256_x16( w, c ) mm256_rotr256_x8( w, (c)<<1 )
#define mm256_rotl256_x16( w, c ) mm256_rotl256_x8( w, (c)<<1 )
//
// Rotate two 256 bit vectors as one 512 bit vector
// Fast but limited to 128 bit granularity
#define mm256_swap512_256(a, b) _mm256_permute2x128_si256( a, b, 0x1032 )
#define mm256_rotr512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x0321 )
#define mm256_rotl512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x2103 )
// Much slower, for 64 and 32 bit granularity
#define mm256_rotr512_1x64(a, b) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_si256(a,8), _mm256_slli_si256(b,24) ); \
b = _mm256_or_si256( _mm256_srli_si256(b,8), _mm256_slli_si256(a,24) ); \
a = t; \
} while (0)
#define mm256_rotl512_1x64(a, b) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_si256(a,8), _mm256_srli_si256(b,24) ); \
b = _mm256_or_si256( _mm256_slli_si256(b,8), _mm256_srli_si256(a,24) ); \
a = t; \
} while (0)
#define mm256_rotr512_1x32(a, b) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_si256(a,4), _mm256_slli_si256(b,28) ); \
b = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a,28) ); \
a = t; \
} while (0)
#define mm256_rotl512_1x32(a, b) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_si256(a,4), _mm256_srli_si256(b,28) ); \
b = _mm256_or_si256( _mm256_slli_si256(b,4), _mm256_srli_si256(a,28) ); \
a = t; \
} while (0)
// Byte granularity but even a bit slower
#define mm256_rotr512_x8( a, b, n ) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_epi64( a, n ), \
_mm256_slli_epi64( b, ( 32 - (n) ) ) ); \
b = _mm256_or_si256( _mm256_srli_epi64( b, n ), \
_mm256_slli_epi64( a, ( 32 - (n) ) ) ); \
a = t; \
} while (0)
#define mm256_rotl512_x8( a, b, n ) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_epi64( a, n ), \
_mm256_srli_epi64( b, ( 32 - (n) ) ) ); \
b = _mm256_or_si256( _mm256_slli_epi64( b, n ), \
_mm256_srli_epi64( a, ( 32 - (n) ) ) ); \
a = t; \
} while (0)
//
// Swap bytes in vector elements
@@ -438,47 +566,30 @@ inline __m256i mm256_byteswap_32( __m256i x )
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
// older, slower
inline __m256i mm256_byteswap_32x( __m256i x )
inline __m256i mm256_byteswap_16( __m256i x )
{
__m256i x1 = _mm256_and_si256( x, _mm256_set1_epi32( 0x0000ff00 ) );
__m256i x2 = _mm256_and_si256( x, _mm256_set1_epi32( 0x00ff0000 ) );
__m256i x0 = _mm256_slli_epi32( x, 24 ); // x0 = x << 24
x1 = _mm256_slli_epi32( x1, 8 ); // x1 = mask1(x) << 8
x2 = _mm256_srli_epi32( x2, 8 ); // x2 = mask2(x) >> 8
__m256i x3 = _mm256_srli_epi32( x, 24 ); // x3 = x >> 24
return _mm256_or_si256( _mm256_or_si256( x0, x1 ),
_mm256_or_si256( x2, x3 ) );
}
inline __m256i mm256_byteswap_64x( __m256i x )
{
x = _mm256_or_si256( _mm256_srli_epi64( x, 32 ), _mm256_slli_epi64( x, 32 ));
x = _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ),
_mm256_slli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 ));
return _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ),
_mm256_slli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 ));
return _mm256_shuffle_epi8( x, _mm256_set_epi8(
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01,
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}
// Pack/Unpack two 128 bit vectors into/from one 256 bit vector
// usefulness tbd
// __m128i hi, __m128i lo, returns __m256i
#define mm256_pack_2x128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
// __m128i hi, __m128i lo, __m256i src
#define mm256_unpack_2x128( hi, lo, src ) \
lo = _mm256_castsi256_si128( src ); \
hi = _mm256_castsi256_si128( mm256_swap_128( src ) );
// hi = _mm256_extracti128_si256( src, 1 );
// Pseudo parallel AES
// Probably noticeably slower than using pure 128 bit vectors
// More efficient if one key for both lanes.
inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k )
{
__m128i hi, lo, khi, klo;
@@ -487,7 +598,6 @@ inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k )
mm256_unpack_2x128( khi, klo, k );
lo = _mm_aesenc_si128( lo, klo );
hi = _mm_aesenc_si128( hi, khi );
return mm256_pack_2x128( hi, lo );
}
@@ -498,7 +608,6 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x )
mm256_unpack_2x128( hi, lo, x );
lo = _mm_aesenc_si128( lo, mm_zero );
hi = _mm_aesenc_si128( hi, mm_zero );
return mm256_pack_2x128( hi, lo );
}
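// A usage sketch assuming AES-NI with AVX2: two independent 128 bit AES
// states advance one round in a single 256 bit vector. The helper name and
// key values are placeholders.
static inline __m256i mm256_aes_demo()
{
   __m128i k0 = _mm_set1_epi32( 1 );
   __m128i k1 = _mm_set1_epi32( 2 );
   __m256i state = mm256_pack_2x128( _mm_setzero_si128(),
                                     _mm_setzero_si128() );
   __m256i rkeys = mm256_pack_2x128( k1, k0 );
   return mm256_aesenc_2x128( state, rkeys );   // one AES round per lane
}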
@@ -533,8 +642,6 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x )
// interleave 4 arrays of 32 bit elements for 128 bit processing
// bit_len must be 256, 512 or 640 bits.
// Vector indexing doesn't work with 32 bit data.
// There's no vector indexing here!!!
inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1,
const void *src2, const void *src3, int bit_len )
{
@@ -591,8 +698,6 @@ inline void mm_interleave_4x32x( void *dst, void *src0, void *src1,
}
}
// doesn't work with 32 bit elements
// no vector indexing here?
inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
@@ -632,7 +737,6 @@ inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
d3[4] = _mm_set_epi32( s[79], s[75], s[71], s[67] );
}
// deinterleave 4 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32
inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
@@ -656,7 +760,7 @@ inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
#if defined (__AVX2__)
// Interleave 4 source buffers containing 64 bit data into the destination
// buffer
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
inline void mm256_interleave_4x64( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3, int bit_len )
{
@@ -682,6 +786,17 @@ inline void mm256_interleave_4x64( void *dst, const void *src0,
d[8] = _mm256_set_epi64x( s3[8], s2[8], s1[8], s0[8] );
d[9] = _mm256_set_epi64x( s3[9], s2[9], s1[9], s0[9] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi64x( s3[10], s2[10], s1[10], s0[10] );
d[11] = _mm256_set_epi64x( s3[11], s2[11], s1[11], s0[11] );
d[12] = _mm256_set_epi64x( s3[12], s2[12], s1[12], s0[12] );
d[13] = _mm256_set_epi64x( s3[13], s2[13], s1[13], s0[13] );
d[14] = _mm256_set_epi64x( s3[14], s2[14], s1[14], s0[14] );
d[15] = _mm256_set_epi64x( s3[15], s2[15], s1[15], s0[15] );
// bit_len == 1024
}
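// A usage sketch of 4-way interleaving with bit_len 256 (hypothetical
// helper): four scalar states are transposed so lane i of each vector
// element holds stream i.
static inline void mm256_interleave_demo()
{
   uint64_t s0[4] __attribute__ ((aligned (32))) = {0};
   uint64_t s1[4] __attribute__ ((aligned (32))) = {0};
   uint64_t s2[4] __attribute__ ((aligned (32))) = {0};
   uint64_t s3[4] __attribute__ ((aligned (32))) = {0};
   __m256i v[4];                 // v[i] = { s3[i], s2[i], s1[i], s0[i] }
   mm256_interleave_4x64( v, s0, s1, s2, s3, 256 );
   // ... run 4-way compression on v, then mm256_deinterleave_4x64 to
   // extract the four results.
}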
// Slower version
@@ -705,7 +820,7 @@ inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1,
}
// Deinterleave 4 buffers of 64 bit data from the source buffer.
// bit_len must be 256, 512 or 640 bits.
// bit_len must be 256, 512, 640 or 1024 bits.
// Requires overrun padding for 640 bit len.
inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
@@ -730,11 +845,26 @@ inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
if ( bit_len <= 512 ) return;
// null change to overrun area
d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[36], s[32] );
d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[37], s[33] );
d2[2] = _mm256_set_epi64x( d2[2][3], d2[2][2], s[38], s[34] );
d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] );
if ( bit_len <= 640 )
{
// null change to overrun area
d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[36], s[32] );
d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[37], s[33] );
d2[2] = _mm256_set_epi64x( d2[2][3], d2[2][2], s[38], s[34] );
d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] );
return;
}
d0[2] = _mm256_set_epi64x( s[44], s[40], s[36], s[32] );
d1[2] = _mm256_set_epi64x( s[45], s[41], s[37], s[33] );
d2[2] = _mm256_set_epi64x( s[46], s[42], s[38], s[34] );
d3[2] = _mm256_set_epi64x( s[47], s[43], s[39], s[35] );
d0[3] = _mm256_set_epi64x( s[60], s[56], s[52], s[48] );
d1[3] = _mm256_set_epi64x( s[61], s[57], s[53], s[49] );
d2[3] = _mm256_set_epi64x( s[62], s[58], s[54], s[50] );
d3[3] = _mm256_set_epi64x( s[63], s[59], s[55], s[51] );
// bit_len == 1024
}
// Slower version
@@ -785,9 +915,9 @@ inline void mm256_interleave_8x32( void *dst, const void *src0,
s3[4], s2[4], s1[4], s0[4] );
d[ 5] = _mm256_set_epi32( s7[5], s6[5], s5[5], s4[5],
s3[5], s2[5], s1[5], s0[5] );
d [6] = _mm256_set_epi32( s7[6], s6[6], s5[6], s4[6],
d[ 6] = _mm256_set_epi32( s7[6], s6[6], s5[6], s4[6],
s3[6], s2[6], s1[6], s0[6] );
d [7] = _mm256_set_epi32( s7[7], s6[7], s5[7], s4[7],
d[ 7] = _mm256_set_epi32( s7[7], s6[7], s5[7], s4[7],
s3[7], s2[7], s1[7], s0[7] );
if ( bit_len <= 256 ) return;
@@ -904,22 +1034,22 @@ inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2,
d = ((uint32_t*)d1) + 8;
d1[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[153], s[145], s[137], s[129] );
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d2) + 8;
d2[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[154], s[146], s[138], s[130]);
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d3) + 8;
d3[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[155], s[147], s[139], s[131] );
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d4) + 8;
d4[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[156], s[148], s[140], s[132] );
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d5) + 8;
d5[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[157], s[149], s[141], s[133] );
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d6) + 8;
d6[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[158], s[150], s[142], s[134] );
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d7) + 8;
d7[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[159], s[151], s[143], s[135] );
}


@@ -1,10 +1,5 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
make distclean || echo clean
@@ -12,14 +7,8 @@ make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl
make -j 4


@@ -3,7 +3,7 @@
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA -DFOUR_WAY" ./configure --with-curl
CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-4way.exe
@@ -13,7 +13,7 @@ mv cpuminer cpuminer-4way
make clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx2.exe
@@ -23,7 +23,7 @@ mv cpuminer cpuminer-aes-avx2
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx.exe
@@ -33,7 +33,7 @@ mv cpuminer cpuminer-aes-avx
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe
@@ -43,7 +43,7 @@ mv cpuminer cpuminer-aes-sse42
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse42.exe
@@ -53,7 +53,7 @@ mv cpuminer cpuminer-sse42
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse2.exe

configure

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.7.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.8.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.7.7'
PACKAGE_STRING='cpuminer-opt 3.7.7'
PACKAGE_VERSION='3.7.8'
PACKAGE_STRING='cpuminer-opt 3.7.8'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.7.7 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.7.8 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.7.7:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.7.8:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.7.7
cpuminer-opt configure 3.7.8
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.7.7, which was
It was created by cpuminer-opt $as_me 3.7.8, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.7.7'
VERSION='3.7.8'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.7.7, which was
This file was extended by cpuminer-opt $as_me 3.7.8, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.7.7
cpuminer-opt config.status 3.7.8
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"


@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.7.7])
AC_INIT([cpuminer-opt], [3.7.8])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM


@@ -358,8 +358,8 @@ struct work {
char *job_id;
size_t xnonce2_len;
unsigned char *xnonce2;
uint32_t nonces[4];
bool nfound[4];
uint32_t nonces[8];
bool nfound[8];
};
struct stratum_job {

winbuild-cross.sh

@@ -0,0 +1,82 @@
#!/bin/bash
LOCAL_LIB="$HOME/usr/lib"
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
F="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
mkdir release
cp README.txt release/
cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
cp /usr/lib/gcc/x86_64-w64-mingw32/5.3-win32/libstdc++-6.dll release/
cp /usr/lib/gcc/x86_64-w64-mingw32/5.3-win32/libgcc_s_seh-1.dll release/
cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -msha -Wall -DFOUR_WAY" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-4way-sha.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-4way.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx-sha.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-avx2.exe
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=znver1 -Wall" ./configure $F
#make -j
#strip -s cpuminer.exe
#mv cpuminer.exe release/cpuminer-aes-sha.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-avx.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-sse42.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse42.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core2 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse2.exe
make clean || echo clean