Compare commits

...

2 Commits

Author      SHA1        Message   Date
Jay D Dee   bee78eac76  v3.7.9    2018-01-08 22:04:43 -05:00
Jay D Dee   2d2e54f001  v3.7.8    2017-12-30 19:19:46 -05:00
94 changed files with 6860 additions and 1696 deletions


@@ -38,7 +38,6 @@ cpuminer_SOURCES = \
algo/argon2/ar2/cores.c \
algo/argon2/ar2/ar2-scrypt-jane.c \
algo/argon2/ar2/blake2b.c \
algo/axiom.c \
algo/blake/sph_blake.c \
algo/blake/blake-hash-4way.c \
algo/blake/blake-gate.c \
@@ -56,6 +55,7 @@ cpuminer_SOURCES = \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
algo/bmw/sph_bmw.c \
algo/bmw/bmw-hash-4way.c \
algo/bmw/bmw256.c \
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
@@ -63,10 +63,8 @@ cpuminer_SOURCES = \
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/sse2/cubehash_sse2.c\
algo/drop.c \
algo/echo/sph_echo.c \
algo/echo/aes_ni/hash.c\
algo/fresh.c \
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
@@ -79,7 +77,6 @@ cpuminer_SOURCES = \
algo/heavy/sph_hefty1.c \
algo/heavy/heavy.c \
algo/heavy/bastion.c \
algo/hmq1725.c \
algo/hodl/aes.c \
algo/hodl/hodl-gate.c \
algo/hodl/hodl-wolf.c \
@@ -110,13 +107,12 @@ cpuminer_SOURCES = \
algo/lyra2/lyra2z330.c \
algo/lyra2/lyra2h.c \
algo/m7m.c \
algo/neoscrypt.c \
algo/neoscrypt/neoscrypt.c \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
algo/nist5/zr5.c \
algo/pluck.c \
algo/polytimos/polytimos-gate.c \
algo/polytimos/polytimos.c \
algo/quark/quark.c \
algo/qubit/qubit.c \
algo/qubit/deep.c \
@@ -128,6 +124,7 @@ cpuminer_SOURCES = \
algo/sha/sha2.c \
algo/sha/sha256t.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite.c \
@@ -142,15 +139,10 @@ cpuminer_SOURCES = \
algo/skein/skein2.c \
algo/skein/skein2-4way.c \
algo/skein/skein2-gate.c \
algo/skunk.c \
algo/sm3/sm3.c \
algo/tiger/sph_tiger.c \
algo/timetravel.c \
algo/timetravel10.c \
algo/tribus/tribus-gate.c \
algo/tribus/tribus.c \
algo/tribus/tribus-4way.c \
algo/veltor.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
@@ -159,21 +151,54 @@ cpuminer_SOURCES = \
algo/whirlpool/whirlpoolx.c \
algo/x11/x11-gate.c \
algo/x11/x11.c \
algo/x11/x11evo.c \
algo/x11/x11-4way.c \
algo/x11/x11gost-gate.c \
algo/x11/x11gost.c \
algo/x11/x11gost-4way.c \
algo/x11/c11-gate.c \
algo/x11/c11.c \
algo/x11/phi1612.c \
algo/x11/c11-4way.c \
algo/x11/tribus-gate.c \
algo/x11/tribus.c \
algo/x11/tribus-4way.c \
algo/x11/fresh.c \
algo/x11/x11evo.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
algo/x13/x13-4way.c \
algo/x13/x13sm3-gate.c \
algo/x13/x13sm3.c \
algo/x13/x13sm3-4way.c \
algo/x13/phi1612-gate.c \
algo/x13/phi1612.c \
algo/x13/phi1612-4way.c \
algo/x13/skunk-gate.c \
algo/x13/skunk-4way.c \
algo/x13/skunk.c \
algo/x13/drop.c \
algo/x14/x14-gate.c \
algo/x14/x14.c \
algo/x14/x14-4way.c \
algo/x14/veltor-gate.c \
algo/x14/veltor.c \
algo/x14/veltor-4way.c \
algo/x14/polytimos-gate.c \
algo/x14/polytimos.c \
algo/x14/polytimos-4way.c \
algo/x14/axiom.c \
algo/x15/x15-gate.c \
algo/x15/x15.c \
algo/x15/x15-4way.c \
algo/x17/x17-gate.c \
algo/x17/x17.c \
algo/xevan.c \
algo/x17/x17-4way.c \
algo/x17/xevan-gate.c \
algo/x17/xevan.c \
algo/x17/xevan-4way.c \
algo/x17/hmq1725.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c\
algo/yescrypt/yescrypt-simd.c\
algo/zr5.c
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-simd.c
disable_flags =


@@ -68,7 +68,7 @@ Supported Algorithms
timetravel10 Bitcore
tribus Denarius (DNR)
vanilla blake256r8vnl (VCash)
veltor
veltor (VLT)
whirlpool
whirlpoolx
x11 Dash
@@ -81,6 +81,7 @@ Supported Algorithms
x17
xevan Bitsend
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr16 Yenten (YTN)
zr5 Ziftr
@@ -96,13 +97,16 @@ algorithms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
CentOS are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
3. Stratum pool, cpuminer-opt only supports stratum mining. Some algos
may work wallet mining but there are no guarantees.
MacOS, OSx is not supported.
3. Stratum pool. Some algos may work wallet mining using getwork.
Errata
------


@@ -17,17 +17,21 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Exe name Compile opts Arch name
Exe name Compile flags Arch name
cpuminer-sse2.exe -march=core2 Core2
cpuminer-sse42.exe -march=corei7 Nehalem
cpuminer-aes-sse42.exe -maes -msse4.2" Westmere
cpuminer-aes-avx.exe -march=corei7-avx" Sandybridge, Ivybridge
cpuminer-aes-avx2.exe "-march=core-avx2" Haswell, Broadwell, Skylake, Kabylake
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY"
cpuminer-sse2.exe "-march=core2" Core2
cpuminer-sse42.exe "-march=corei7" Nehalem
cpuminer-aes-sse42.exe "-maes -msse4.2" Westmere
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx-sha "-march=corei7-avx -msha" Ryzen...
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY" same as avx2
cpuminer-4way-sha.exe "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha
4way requires a CPU with AES and AVX2. It is still under development and
only a few algos are supported. See change log in RELEASE_NOTES in source
package for supported algos.
There is no binary support available for SHA on AMD Ryzen CPUs.
Ryzen CPUs perform better with AVX than AVX2, therefore an avx-sha build
is provided. Four way still uses AVX2.
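The compile flags in the table correspond to feature macros the compiler
defines, which is how a source build selects its code path. A minimal
sketch of probing those macros; this is illustrative only, not code from
the miner:

#include <stdio.h>

/* gcc/clang define these macros when the matching -march/-m flags
   from the table above are in effect. */
int main(void)
{
#if defined(__AVX2__) && defined(__SHA__)
    puts("avx2-sha path");
#elif defined(__AVX2__)
    puts("avx2 path");
#elif defined(__AVX__)
    puts("avx path");
#elif defined(__AES__) && defined(__SSE4_2__)
    puts("aes-sse42 path");
#elif defined(__SSE4_2__)
    puts("sse42 path");
#else
    puts("sse2 baseline");
#endif
    return 0;
}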


@@ -27,8 +27,9 @@ Compile Instructions
Requirements:
Intel Core2 or newer, or AMD Steamroller or newer CPU.
64 bit Linux or Windows operating system.
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple is not supported.
Building on linux prerequisites:
@@ -164,6 +165,17 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.7.9
Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.
Additional 4way optimizations for X algos.
New algo yescryptr8 for BitZeny, not to be confused with the original
yescrypt used by Globalboost-Y.
v3.7.8
Partial 4way optimization for most X algos including c11, xevan, phi, hsr
v3.7.7
Fixed regression caused by 64 CPU support.
@@ -182,7 +194,7 @@ New algo keccakc for Creative coin with 4way optimizations
Rewrote some AVX/AVX2 code for more consistent implementation and some
optimizing.
Enhanced capabilities check to support 4way, mor eprecise reporting of
Enhanced capabilities check to support 4way, more precise reporting of
features (not all algos use SSE2), and better error messages when using
an incompatible pre-built version (Windows users).


@@ -211,7 +211,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
case ALGO_X11: register_x11_algo ( gate ); break;
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
case ALGO_X11GOST: register_sib_algo ( gate ); break;
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;
@@ -219,6 +219,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X17: register_x17_algo ( gate ); break;
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
default:
@@ -278,6 +279,7 @@ const char* const algo_alias_map[][2] =
{
// alias proper
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },
{ "blake256r8vnl", "vanilla" },
{ "blake256r14", "blake" },
@@ -300,10 +302,9 @@ const char* const algo_alias_map[][2] =
// { "sia", "blake2b" },
{ "sib", "x11gost" },
{ "timetravel8", "timetravel" },
{ "yes", "yescrypt" },
{ "ziftr", "zr5" },
{ "yenten", "yescryptr16" },
{ "yescryptr8", "yescrypt" },
{ "yescryptr8k", "yescrypt" },
{ "zcoin", "lyra2z" },
{ "zoin", "lyra2z330" },
{ NULL, NULL }
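The alias map is a NULL-terminated array of { alias, proper } string
pairs. A hedged sketch of how such a table can be resolved; the helper
name resolve_algo_alias is hypothetical, the miner's actual lookup
routine is not shown in this hunk:

#include <stddef.h>
#include <string.h>

/* Walk a { alias, proper } table terminated by { NULL, NULL } and
   return the canonical name, or the input itself if nothing matches. */
static const char *resolve_algo_alias( const char *const map[][2],
                                       const char *name )
{
    for ( size_t i = 0; map[i][0] != NULL; i++ )
        if ( strcmp( name, map[i][0] ) == 0 )
            return map[i][1];
    return name;
}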

View File

@@ -36,7 +36,6 @@
#include <string.h>
#include <limits.h>
//#include "sph_blake.h"
#include "blake-hash-4way.h"
#ifdef __cplusplus
@@ -98,18 +97,6 @@ static const unsigned sigma[16][16] = {
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
/*
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
14 10 4 8 9 15 13 6 1 12 0 2 11 7 5 3
11 8 12 0 5 2 15 13 10 14 3 6 7 1 9 4
7 9 3 1 13 12 11 14 2 6 5 10 4 0 15 8
9 0 5 7 2 4 10 15 14 1 11 12 6 8 3 13
2 12 6 10 0 11 8 3 4 13 7 5 15 14 1 9
12 5 1 15 14 13 4 10 0 7 6 3 9 2 8 11
13 11 7 14 12 1 3 9 5 0 15 4 8 6 2 10
6 15 14 9 11 3 0 8 12 2 13 7 1 4 10 5
10 2 8 4 7 6 1 5 15 11 9 14 3 12 13 0
*/
#endif
#define Z00 0
@@ -849,9 +836,9 @@ blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set_epi32( iv[i], iv[i], iv[i], iv[i] );
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set_epi32( salt[i], salt[i], salt[i], salt[i] );
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
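The edits above are purely cosmetic: _mm_set1_epi32(v) is the broadcast
form of _mm_set_epi32(v,v,v,v). A standalone check, assuming SSE2:

#include <emmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    __m128i a = _mm_set_epi32( 0x11223344, 0x11223344,
                               0x11223344, 0x11223344 );
    __m128i b = _mm_set1_epi32( 0x11223344 );  // same broadcast
    printf( "%s\n", memcmp( &a, &b, sizeof a ) == 0 ? "equal"
                                                    : "different" );
    return 0;
}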
@@ -914,57 +901,49 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
// unsigned z = 0x80 >> n;
// unsigned zz = ((ub & -z) | z) & 0xFF;
// u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
}
else
sc->T0 -= 512 - bit_len;
// if ( ptr <= 48 )
if ( ptr <= 52 )
{
memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
// memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
_mm_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_32( sc->H[k] );
// out[k] = sc->H[k];
}
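The close routine above is standard BLAKE-256 finalization applied to
four lanes at once: a 0x80 byte after the message, a 0x01 marker bit in
byte 55 for 256-bit output, and the total bit count stored big-endian in
bytes 56-63. A scalar sketch of that single-block layout; the helper
name is hypothetical and it assumes the message fits in one block:

#include <stdint.h>
#include <string.h>

/* Build the final 64-byte block for BLAKE-256 (one lane, msg_bytes <= 52):
   message, 0x80, zero fill, 0x01 output-size marker, 64-bit BE bit count. */
static void blake256_pad_final_block( uint8_t blk[64], size_t msg_bytes,
                                      uint64_t total_bits )
{
    memset( blk + msg_bytes, 0, 64 - msg_bytes );
    blk[msg_bytes] = 0x80;
    blk[55] |= 0x01;                      // 256-bit output marker
    for ( int i = 0; i < 8; i++ )         // big-endian bit count
        blk[56 + i] = (uint8_t)( total_bits >> (56 - 8*i) );
}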
#if defined (__AVX2__)
@@ -977,9 +956,9 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set_epi64x( iv[i], iv[i], iv[i], iv[i] );
sc->H[i] = _mm256_set1_epi64x( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set_epi64x( salt[i], salt[i], salt[i], salt[i] );
sc->S[i] = _mm256_set1_epi64x( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
@@ -1051,12 +1030,12 @@ blake64_4way_close( blake_4way_big_context *sc,
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len;
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
}
else
@@ -1068,10 +1047,7 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 ) );
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
@@ -1084,15 +1060,11 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 );
u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(

algo/bmw/bmw-hash-4way.c (new file, 969 lines)

@@ -0,0 +1,969 @@
/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
/*
* BMW implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include "bmw-hash-4way.h"
#if defined(__AVX2__)
#ifdef __cplusplus
extern "C"{
#endif
//#include "sph_bmw.h"
//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
#define SPH_SMALL_FOOTPRINT_BMW 1
//#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
//#undef SPH_ROTL64
//#define SPH_ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))
//#define SPH_ROTL64(x,n) mm256_rotl_64(x,n)
static const sph_u32 IV256[] = {
SPH_C32(0x40414243), SPH_C32(0x44454647),
SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
SPH_C32(0x50515253), SPH_C32(0x54555657),
SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F),
SPH_C32(0x60616263), SPH_C32(0x64656667),
SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F),
SPH_C32(0x70717273), SPH_C32(0x74757677),
SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
};
#if SPH_64
static const sph_u64 IV512[] = {
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
};
#endif
#define XCAT(x, y) XCAT_(x, y)
#define XCAT_(x, y) x ## y
#define LPAR (
/*
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \
^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
#define ss4(x) (((x) >> 1) ^ (x))
#define ss5(x) (((x) >> 2) ^ (x))
#define rs1(x) SPH_ROTL32(x, 3)
#define rs2(x) SPH_ROTL32(x, 7)
#define rs3(x) SPH_ROTL32(x, 13)
#define rs4(x) SPH_ROTL32(x, 16)
#define rs5(x) SPH_ROTL32(x, 19)
#define rs6(x) SPH_ROTL32(x, 23)
#define rs7(x) SPH_ROTL32(x, 27)
#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
#define expand1s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand1s(qf, mf, hf, i16) \
expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand1s_(qf, mf, hf, i16, ix, iy) \
expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
#define expand2s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand2s(qf, mf, hf, i16) \
expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand2s_(qf, mf, hf, i16, ix, iy) \
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
*/
#if SPH_64
#define sb0(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
_mm256_slli_epi64( (x), 3) ), \
_mm256_xor_si256( mm256_rotl_64( (x), 4), \
mm256_rotl_64( (x), 37) ) )
#define sb1(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
_mm256_slli_epi64( (x), 2) ), \
_mm256_xor_si256( mm256_rotl_64( (x), 13), \
mm256_rotl_64( (x), 43) ) )
#define sb2(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
_mm256_slli_epi64( (x), 1) ), \
_mm256_xor_si256( mm256_rotl_64( (x), 19), \
mm256_rotl_64( (x), 53) ) )
#define sb3(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
_mm256_slli_epi64( (x), 2) ), \
_mm256_xor_si256( mm256_rotl_64( (x), 28), \
mm256_rotl_64( (x), 59) ) )
#define sb4(x) \
_mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) )
#define sb5(x) \
_mm256_xor_si256( (x), _mm256_srli_epi64( (x), 2 ) )
#define rb1(x) mm256_rotl_64( x, 5 )
#define rb2(x) mm256_rotl_64( x, 11 )
#define rb3(x) mm256_rotl_64( x, 27 )
#define rb4(x) mm256_rotl_64( x, 32 )
#define rb5(x) mm256_rotl_64( x, 37 )
#define rb6(x) mm256_rotl_64( x, 43 )
#define rb7(x) mm256_rotl_64( x, 53 )
#define rol_off( M, j, off ) \
mm256_rotl_64( M[ ( (j) + (off) ) & 15 ] , \
( ( (j) + (off) ) & 15 ) + 1 )
#define add_elt_b( M, H, j ) \
_mm256_xor_si256( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_add_epi64( rol_off( M, j, 0 ), \
rol_off( M, j, 3 ) ), \
rol_off( M, j, 10 ) ), \
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 15 ] )
#define expand1b( qt, M, H, i ) \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)-16 ] ), \
sb2( qt[ (i)-15 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)-14 ] ), \
sb0( qt[ (i)-13 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)-12 ] ), \
sb2( qt[ (i)-11 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)-10 ] ), \
sb0( qt[ (i)- 9 ] ) ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)- 8 ] ), \
sb2( qt[ (i)- 7 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)- 6 ] ), \
sb0( qt[ (i)- 5 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)- 4 ] ), \
sb2( qt[ (i)- 3 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)- 2 ] ), \
sb0( qt[ (i)- 1 ] ) ) ) ) ), \
add_elt_b( M, H, (i)-16 ) )
#define expand2b( qt, M, H, i) \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ) ), \
_mm256_add_epi64( qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ) ), \
_mm256_add_epi64( qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ) ), \
_mm256_add_epi64( qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ) ), \
_mm256_add_epi64( sb4( qt[ (i)- 2 ] ), \
sb5( qt[ (i)- 1 ] ) ) ) ) ), \
add_elt_b( M, H, (i)-16 ) )
#endif
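Each vector macro above applies the scalar BMW-512 s/r functions to all
four 64-bit lanes at once. For reference, scalar renderings of sb0 and
rb1 (helper names are hypothetical):

#include <stdint.h>

static inline uint64_t rotl64( uint64_t x, unsigned n )
{
    return ( x << n ) | ( x >> (64 - n) );
}

/* Scalar equivalent of sb0: shifts and rotations XORed together. */
static inline uint64_t s0_scalar( uint64_t x )
{
    return ( x >> 1 ) ^ ( x << 3 ) ^ rotl64( x, 4 ) ^ rotl64( x, 37 );
}

/* Scalar equivalent of rb1: a plain rotation. */
static inline uint64_t r1_scalar( uint64_t x )
{
    return rotl64( x, 5 );
}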
/*
#define MAKE_W( i0, op01, i1, op12, i2, op23, i3, op34, i4) \
((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
*/
/*
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13)
#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14)
#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15)
#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13)
#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14)
#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15)
#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14)
#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15)
#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9)
#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10)
#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11)
#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12)
#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13)
#if SPH_SMALL_FOOTPRINT_BMW
#define MAKE_Qas do { \
unsigned u; \
sph_u32 Ws[16]; \
Ws[ 0] = Ws0; \
Ws[ 1] = Ws1; \
Ws[ 2] = Ws2; \
Ws[ 3] = Ws3; \
Ws[ 4] = Ws4; \
Ws[ 5] = Ws5; \
Ws[ 6] = Ws6; \
Ws[ 7] = Ws7; \
Ws[ 8] = Ws8; \
Ws[ 9] = Ws9; \
Ws[10] = Ws10; \
Ws[11] = Ws11; \
Ws[12] = Ws12; \
Ws[13] = Ws13; \
Ws[14] = Ws14; \
Ws[15] = Ws15; \
for (u = 0; u < 15; u += 5) { \
qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
} \
qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
} while (0)
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#else
#define MAKE_Qas do { \
qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
} while (0)
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#endif
#define MAKE_Qs do { \
MAKE_Qas; \
MAKE_Qbs; \
} while (0)
#define Qs(j) (qt[j])
*/
#if SPH_64
#define Wb0 \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[13], H[13] ) ), \
_mm256_xor_si256( M[14], H[14] ) )
#define Wb1 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \
_mm256_xor_si256( M[14], H[14] ) ), \
_mm256_xor_si256( M[15], H[15] ) )
#define Wb2 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[12], H[12] ) ), \
_mm256_xor_si256( M[15], H[15] ) )
#define Wb3 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[13], H[13] ) )
#define Wb4 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \
_mm256_xor_si256( M[14], H[14] ) )
#define Wb5 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[12], H[12] ) ), \
_mm256_xor_si256( M[15], H[15] ) )
#define Wb6 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \
_mm256_xor_si256( M[13], H[13] ) )
#define Wb7 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[12], H[12] ) ), \
_mm256_xor_si256( M[14], H[14] ) )
#define Wb8 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[13], H[13] ) ), \
_mm256_xor_si256( M[15], H[15] ) )
#define Wb9 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[14], H[14] ) )
#define Wb10 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[15], H[15] ) )
#define Wb11 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) )
#define Wb12 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[10], H[10] ) )
#define Wb13 \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[11], H[11] ) )
#define Wb14 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \
_mm256_xor_si256( M[12], H[12] ) )
#define Wb15 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[ 4], H[4] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[13], H[13] ) )
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
{
__m256i qt[32], xl, xh;
qt[ 0] = sb0( Wb0 ) + H[ 1];
qt[ 1] = sb1( Wb1 ) + H[ 2];
qt[ 2] = sb2( Wb2 ) + H[ 3];
qt[ 3] = sb3( Wb3 ) + H[ 4];
qt[ 4] = sb4( Wb4 ) + H[ 5];
qt[ 5] = sb0( Wb5 ) + H[ 6];
qt[ 6] = sb1( Wb6 ) + H[ 7];
qt[ 7] = sb2( Wb7 ) + H[ 8];
qt[ 8] = sb3( Wb8 ) + H[ 9];
qt[ 9] = sb4( Wb9 ) + H[10];
qt[10] = sb0( Wb10) + H[11];
qt[11] = sb1( Wb11) + H[12];
qt[12] = sb2( Wb12) + H[13];
qt[13] = sb3( Wb13) + H[14];
qt[14] = sb4( Wb14) + H[15];
qt[15] = sb0( Wb15) + H[ 0];
qt[16] = expand1b( qt, M, H, 16 );
qt[17] = expand1b( qt, M, H, 17 );
qt[18] = expand2b( qt, M, H, 18 );
qt[19] = expand2b( qt, M, H, 19 );
qt[20] = expand2b( qt, M, H, 20 );
qt[21] = expand2b( qt, M, H, 21 );
qt[22] = expand2b( qt, M, H, 22 );
qt[23] = expand2b( qt, M, H, 23 );
qt[24] = expand2b( qt, M, H, 24 );
qt[25] = expand2b( qt, M, H, 25 );
qt[26] = expand2b( qt, M, H, 26 );
qt[27] = expand2b( qt, M, H, 27 );
qt[28] = expand2b( qt, M, H, 28 );
qt[29] = expand2b( qt, M, H, 29 );
qt[30] = expand2b( qt, M, H, 30 );
qt[31] = expand2b( qt, M, H, 31 );
xl = _mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
_mm256_xor_si256( qt[18], qt[19] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ),
_mm256_xor_si256( qt[22], qt[23] ) ) );
xh = _mm256_xor_si256( xl,
_mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
_mm256_xor_si256( qt[26], qt[27] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
_mm256_xor_si256( qt[30], qt[31] ) )));
dH[ 0] = _mm256_add_epi64(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
_mm256_srli_epi64( qt[16], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
dH[ 1] = _mm256_add_epi64(
_mm256_xor_si256( M[1],
_mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
_mm256_slli_epi64( qt[17], 8 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
dH[ 2] = _mm256_add_epi64(
_mm256_xor_si256( M[2],
_mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
_mm256_slli_epi64( qt[18], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
dH[ 3] = _mm256_add_epi64(
_mm256_xor_si256( M[3],
_mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
_mm256_slli_epi64( qt[19], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
dH[ 4] = _mm256_add_epi64(
_mm256_xor_si256( M[4],
_mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
_mm256_slli_epi64( qt[20], 0 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
dH[ 5] = _mm256_add_epi64(
_mm256_xor_si256( M[5],
_mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
_mm256_srli_epi64( qt[21], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
dH[ 6] = _mm256_add_epi64(
_mm256_xor_si256( M[6],
_mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
_mm256_slli_epi64( qt[22], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
dH[ 7] = _mm256_add_epi64(
_mm256_xor_si256( M[7],
_mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
_mm256_slli_epi64( qt[23], 2 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[4], 9 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[5], 10 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
dH[10] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[6], 11 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
_mm256_xor_si256( qt[17], qt[10] ) ) );
dH[11] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[7], 12 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
_mm256_xor_si256( qt[18], qt[11] ) ) );
dH[12] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[0], 13 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
_mm256_xor_si256( qt[19], qt[12] ) ) );
dH[13] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[1], 14 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
_mm256_xor_si256( qt[20], qt[13] ) ) );
dH[14] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[2], 15 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
_mm256_xor_si256( qt[21], qt[14] ) ) );
dH[15] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[3], 16 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
_mm256_xor_si256( qt[22], qt[15] ) ) );
}
#endif // 64
//#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_ROTL32, M, Qs, dH)
/*
static void
compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
{
#define M(x) sph_dec32le_aligned(data + 4 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
FOLDs;
#undef M
#undef H
#undef dH
}
static const sph_u32 final_s[16] = {
SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
SPH_C32(0xaaaaaaaf)
};
static void
bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
{
memcpy(sc->H, iv, sizeof sc->H);
sc->ptr = 0;
#if SPH_64
sc->bit_count = 0;
#else
sc->bit_count_high = 0;
sc->bit_count_low = 0;
#endif
}
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
sph_u32 htmp[16];
sph_u32 *h1, *h2;
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->bit_count += (sph_u64)len << 3;
#else
tmp = sc->bit_count_low;
sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
if (sc->bit_count_low < tmp)
sc->bit_count_high ++;
sc->bit_count_high += len >> 29;
#endif
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
data = (const unsigned char *)data + clen;
len -= clen;
ptr += clen;
if (ptr == sizeof sc->buf) {
sph_u32 *ht;
compress_small(buf, h1, h2);
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if (h1 != sc->H)
memcpy(sc->H, h1, sizeof sc->H);
}
static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
unsigned char *buf, *out;
size_t ptr, u, v;
unsigned z;
sph_u32 h1[16], h2[16], *h;
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ptr ++] = ((ub & -z) | z) & 0xFF;
h = sc->H;
if (ptr > (sizeof sc->buf) - 8) {
memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
compress_small(buf, h, h1);
ptr = 0;
h = h1;
}
memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
#if SPH_64
sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
SPH_T64(sc->bit_count + n));
#else
sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
sc->bit_count_low + n);
sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
SPH_T32(sc->bit_count_high));
#endif
compress_small(buf, h, h2);
for (u = 0; u < 16; u ++)
sph_enc32le_aligned(buf + 4 * u, h2[u]);
compress_small(buf, final_s, h1);
out = dst;
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
sph_enc32le(out + 4 * u, h1[v]);
}
*/
#if SPH_64
static const __m256i final_b[16] =
{
{ 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
{ 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
{ 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
{ 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
{ 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
{ 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
{ 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
{ 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 },
{ 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 },
{ 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 },
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
{ 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab },
{ 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac },
{ 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad },
{ 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae },
{ 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf }
};
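final_b holds the BMW finalization constants 0xaaaaaaaaaaaaaaa0 + i
broadcast across the four lanes. The same table could be built at run
time; a sketch assuming AVX2, with a hypothetical helper name:

#include <immintrin.h>
#include <stdint.h>

/* Reproduce final_b[]: lane-broadcast 0xaaaaaaaaaaaaaaa0 + i, i = 0..15. */
static void make_final_b( __m256i out[16] )
{
    for ( int i = 0; i < 16; i++ )
        out[i] = _mm256_set1_epi64x( (int64_t)(0xaaaaaaaaaaaaaaa0ULL + i) );
}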
static void
bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
{
for ( int i = 0; i < 16; i++ )
sc->H[i] = _mm256_set1_epi64x( iv[i] );
sc->ptr = 0;
sc->bit_count = 0;
}
static void
bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
__m256i htmp[16];
__m256i *h1, *h2;
size_t ptr;
const int buf_size = 128; // bytes of one lane, compatible with len
sc->bit_count += (sph_u64)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
vdata = vdata + (clen>>3);
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m256i *ht;
compress_big( buf, h1, h2 );
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if ( h1 != sc->H )
memcpy_256( sc->H, h1, 16 );
}
static void
bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w64)
{
__m256i *buf;
__m256i h1[16], h2[16], *h;
size_t ptr, u, v;
unsigned z;
const int buf_size = 128; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ ptr>>3 ] = _mm256_set1_epi64x( z );
ptr += 8;
h = sc->H;
if ( ptr > (buf_size - 8) )
{
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
compress_big( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( sc->bit_count + n );
compress_big( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_big( buf, final_b, h1 );
for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++)
casti_m256i(dst,u) = h1[v];
}
#endif
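/* Note: the bmw256_4way wrappers below are stubs in this revision. Their
   bodies are commented out, so only the bmw512_4way path is functional. */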
void
bmw256_4way_init(void *cc)
{
// bmw32_4way_init(cc, IV256);
}
void
bmw256_4way(void *cc, const void *data, size_t len)
{
// bmw32_4way(cc, data, len);
}
void
bmw256_4way_close(void *cc, void *dst)
{
// bmw256_4way_addbits_and_close(cc, 0, 0, dst);
}
void
bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
// bmw32_4way_close(cc, ub, n, dst, 8);
}
#if SPH_64
void
bmw512_4way_init(void *cc)
{
bmw64_4way_init(cc, IV512);
}
void
bmw512_4way(void *cc, const void *data, size_t len)
{
bmw64_4way(cc, data, len);
}
void
bmw512_4way_close(void *cc, void *dst)
{
bmw512_4way_addbits_and_close(cc, 0, 0, dst);
}
void
bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
bmw64_4way_close(cc, ub, n, dst, 8);
}
#endif
#ifdef __cplusplus
}
#endif
#endif

algo/bmw/bmw-hash-4way.h (new file, 154 lines)

@@ -0,0 +1,154 @@
/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
/**
* BMW interface. BMW (aka "Blue Midnight Wish") is a family of
* functions which differ by their output size; this implementation
* defines BMW for output sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_bmw.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef BMW_HASH_H__
#define BMW_HASH_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#ifdef __AVX2__
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BMW-224.
*/
#define SPH_SIZE_bmw224 224
/**
* Output size (in bits) for BMW-256.
*/
#define SPH_SIZE_bmw256 256
#if SPH_64
/**
* Output size (in bits) for BMW-384.
*/
#define SPH_SIZE_bmw384 384
/**
* Output size (in bits) for BMW-512.
*/
#define SPH_SIZE_bmw512 512
#endif
/**
* This structure is a context for BMW-224 and BMW-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
sph_u32 H[16];
#if SPH_64
sph_u64 bit_count;
#else
sph_u32 bit_count_high, bit_count_low;
#endif
#endif
} bmw_4way_small_context;
typedef bmw_4way_small_context bmw256_4way_context;
#if SPH_64
/**
* This structure is a context for BMW-384 and BMW-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
__m256i buf[16];
__m256i H[16];
// unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
// sph_u64 H[16];
sph_u64 bit_count;
#endif
} bmw_4way_big_context;
typedef bmw_4way_big_context bmw512_4way_context;
#endif
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#if SPH_64
void bmw512_4way_init(void *cc);
void bmw512_4way(void *cc, const void *data, size_t len);
void bmw512_4way_close(void *cc, void *dst);
void bmw512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
#endif


@@ -96,34 +96,18 @@ extern "C"{
do { \
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_not( x2 ) ) ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_not( x1 ), x2 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_not( x3 ) ) ); \
x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
x2 = _mm256_xor_si256( x2, tmp ); \
} while (0)
/*
#define Sb(x0, x1, x2, x3, c) do { \
x3 = ~x3; \
x0 ^= (c) & ~x2; \
tmp = (c) ^ (x0 & x1); \
x0 ^= x2 & x3; \
x3 ^= ~x1 & x2; \
x1 ^= x0 & x2; \
x2 ^= x0 & ~x3; \
x0 ^= x1 | x3; \
x3 ^= x1 & x2; \
x1 ^= tmp & x0; \
x2 ^= tmp; \
} while (0)
*/
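The rewrite above uses _mm256_andnot_si256( a, b ), which computes
(~a) & b, so and( cc, not( x2 ) ) collapses to andnot( x2, cc ) in a
single instruction; note the operand order. A standalone check,
assuming AVX2:

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    __m256i a = _mm256_set1_epi64x( 0x00ff00ff00ff00ffLL );
    __m256i b = _mm256_set1_epi64x( 0x0123456789abcdefLL );

    __m256i fast   = _mm256_andnot_si256( a, b );     // (~a) & b
    __m256i manual = _mm256_and_si256(
                        _mm256_xor_si256( a, _mm256_set1_epi64x( -1LL ) ),
                        b );
    printf( "%s\n", memcmp( &fast, &manual, sizeof fast ) == 0
                    ? "equal" : "different" );
    return 0;
}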
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
do { \
x4 = _mm256_xor_si256( x4, x1 ); \
@@ -136,20 +120,6 @@ do { \
x3 = _mm256_xor_si256( x3, x4 ); \
} while (0)
/*
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \
x4 ^= x1; \
x5 ^= x2; \
x6 ^= x3 ^ x0; \
x7 ^= x0; \
x0 ^= x5; \
x1 ^= x6; \
x2 ^= x7 ^ x4; \
x3 ^= x4; \
} while (0)
*/
#if SPH_JH_64
static const sph_u64 C[] = {


@@ -4,13 +4,10 @@
#include <memory.h>
#include <mm_malloc.h>
//#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
//#include "avxdefs.h"
// same size, only difference is the name, lyra2 is done serially
__thread uint64_t* lyra2z_4way_matrix;
bool lyra2z_4way_thread_init()
@@ -26,12 +23,8 @@ void lyra2z_4way_midstate( const void* input )
blake256_4way( &l2z_4way_blake_mid, input, 64 );
}
// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
void lyra2z_4way_hash( void *state, const void *input )
{
// uint32_t _ALIGN(64) hash[16];
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -39,27 +32,21 @@ void lyra2z_4way_hash( void *state, const void *input )
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
// memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
// blake256_4way( &ctx_blake, input + (64*4), 16 );
// blake256_4way_close( &ctx_blake, vhash );
blake256_4way_init( &ctx_blake );
blake256_4way( &ctx_blake, input, 80 );
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
// memcpy(state, hash, 32);
}
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -67,7 +54,6 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
// uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -90,7 +76,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
// lyra2z_4way_midstate( vdata );
lyra2z_4way_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
@@ -104,42 +90,33 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
printf("found 0\n");
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
/*
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
*/
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
printf("found 2\n");
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
/*
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
printf("found 3\n");
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
*/
n += 2;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
@@ -149,21 +126,3 @@ printf("found 3\n");
#endif
/*
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
*/


@@ -9,18 +9,15 @@ void lyra2z_set_target( struct work* work, double job_diff )
bool register_lyra2z_algo( algo_gate_t* gate )
{
#ifdef LYRA2Z_4WAY
four_way_not_tested();
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
gate->hash = (void*)&lyra2z_4way_hash;
#else
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z_set_target;
return true;


@@ -2,7 +2,7 @@
bool register_nist5_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
#if defined (NIST5_4WAY)
gate->scanhash = (void*)&scanhash_nist5_4way;
gate->hash = (void*)&nist5hash_4way;


@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#if defined(HASH_4WAY) && defined(__AES__)
#define NIST5_4WAY
#endif


@@ -1,12 +0,0 @@
#ifndef __POLYTIMOS_GATE_H__
#define __POLYTIMOS_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
void polytimos_hash( void *state, const void *input );
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_polytimos_context();
#endif


@@ -1,31 +1,20 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/shavite/sph_shavite.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
#endif
typedef struct
{
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
#ifdef NO_AES_NI
sph_echo512_context echo;
#else


@@ -1,23 +1,16 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/shavite/sph_shavite.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
#endif
typedef struct


@@ -36,15 +36,15 @@ void sha256t_hash(void* output, const void* input, uint32_t len)
memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
SHA256_Update( &ctx_sha256, input + midlen, tail );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
#else
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
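The casts above satisfy OpenSSL's prototype
int SHA256_Final( unsigned char *md, SHA256_CTX *c ), since hashA is
evidently not declared as unsigned char. The algo itself is just SHA-256
applied three times; a self-contained sketch using the same OpenSSL
calls (sha256_triple is a hypothetical helper, without the midstate
caching done above):

#include <openssl/sha.h>
#include <string.h>

/* sha256t: out = sha256( sha256( sha256( in ) ) ). */
static void sha256_triple( unsigned char out[32], const void *in,
                           size_t len )
{
    SHA256_CTX ctx;
    unsigned char h[32];

    SHA256_Init( &ctx );
    SHA256_Update( &ctx, in, len );
    SHA256_Final( h, &ctx );

    for ( int i = 0; i < 2; i++ )
    {
        SHA256_Init( &ctx );
        SHA256_Update( &ctx, h, 32 );
        SHA256_Final( h, &ctx );
    }
    memcpy( out, h, 32 );
}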


@@ -0,0 +1,618 @@
/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
/*
* Shabal implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#ifdef __AVX2__
#include "shabal-hash-4way.h"
#ifdef __cplusplus
extern "C"{
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/*
* Part of this code was automatically generated (the part between
* the "BEGIN" and "END" markers).
*/
#define sM 16
#define C32 SPH_C32
#define T32 SPH_T32
#define O1 13
#define O2 9
#define O3 6
/*
* We copy the state into local variables, so that the compiler knows
* that it can optimize them at will.
*/
/* BEGIN -- automatically generated code. */
#define DECL_STATE \
__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
C8, C9, CA, CB, CC, CD, CE, CF; \
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 Wlow, Whigh;
#define READ_STATE(state) do { \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
B3 = (state)->B[3]; \
B4 = (state)->B[4]; \
B5 = (state)->B[5]; \
B6 = (state)->B[6]; \
B7 = (state)->B[7]; \
B8 = (state)->B[8]; \
B9 = (state)->B[9]; \
BA = (state)->B[10]; \
BB = (state)->B[11]; \
BC = (state)->B[12]; \
BD = (state)->B[13]; \
BE = (state)->B[14]; \
BF = (state)->B[15]; \
C0 = (state)->C[0]; \
C1 = (state)->C[1]; \
C2 = (state)->C[2]; \
C3 = (state)->C[3]; \
C4 = (state)->C[4]; \
C5 = (state)->C[5]; \
C6 = (state)->C[6]; \
C7 = (state)->C[7]; \
C8 = (state)->C[8]; \
C9 = (state)->C[9]; \
CA = (state)->C[10]; \
CB = (state)->C[11]; \
CC = (state)->C[12]; \
CD = (state)->C[13]; \
CE = (state)->C[14]; \
CF = (state)->C[15]; \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
} while (0)
#define WRITE_STATE(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
(state)->B[3] = B3; \
(state)->B[4] = B4; \
(state)->B[5] = B5; \
(state)->B[6] = B6; \
(state)->B[7] = B7; \
(state)->B[8] = B8; \
(state)->B[9] = B9; \
(state)->B[10] = BA; \
(state)->B[11] = BB; \
(state)->B[12] = BC; \
(state)->B[13] = BD; \
(state)->B[14] = BE; \
(state)->B[15] = BF; \
(state)->C[0] = C0; \
(state)->C[1] = C1; \
(state)->C[2] = C2; \
(state)->C[3] = C3; \
(state)->C[4] = C4; \
(state)->C[5] = C5; \
(state)->C[6] = C6; \
(state)->C[7] = C7; \
(state)->C[8] = C8; \
(state)->C[9] = C9; \
(state)->C[10] = CA; \
(state)->C[11] = CB; \
(state)->C[12] = CC; \
(state)->C[13] = CD; \
(state)->C[14] = CE; \
(state)->C[15] = CF; \
(state)->Wlow = Wlow; \
(state)->Whigh = Whigh; \
} while (0)
#define DECODE_BLOCK \
do { \
M0 = buf[ 0]; \
M1 = buf[ 1]; \
M2 = buf[ 2]; \
M3 = buf[ 3]; \
M4 = buf[ 4]; \
M5 = buf[ 5]; \
M6 = buf[ 6]; \
M7 = buf[ 7]; \
M8 = buf[ 8]; \
M9 = buf[ 9]; \
MA = buf[10]; \
MB = buf[11]; \
MC = buf[12]; \
MD = buf[13]; \
ME = buf[14]; \
MF = buf[15]; \
} while (0)
#define INPUT_BLOCK_ADD \
do { \
B0 = _mm_add_epi32( B0, M0 );\
B1 = _mm_add_epi32( B1, M1 );\
B2 = _mm_add_epi32( B2, M2 );\
B3 = _mm_add_epi32( B3, M3 );\
B4 = _mm_add_epi32( B4, M4 );\
B5 = _mm_add_epi32( B5, M5 );\
B6 = _mm_add_epi32( B6, M6 );\
B7 = _mm_add_epi32( B7, M7 );\
B8 = _mm_add_epi32( B8, M8 );\
B9 = _mm_add_epi32( B9, M9 );\
BA = _mm_add_epi32( BA, MA );\
BB = _mm_add_epi32( BB, MB );\
BC = _mm_add_epi32( BC, MC );\
BD = _mm_add_epi32( BD, MD );\
BE = _mm_add_epi32( BE, ME );\
BF = _mm_add_epi32( BF, MF );\
} while (0)
#define INPUT_BLOCK_SUB \
do { \
C0 = _mm_sub_epi32( C0, M0 ); \
C1 = _mm_sub_epi32( C1, M1 ); \
C2 = _mm_sub_epi32( C2, M2 ); \
C3 = _mm_sub_epi32( C3, M3 ); \
C4 = _mm_sub_epi32( C4, M4 ); \
C5 = _mm_sub_epi32( C5, M5 ); \
C6 = _mm_sub_epi32( C6, M6 ); \
C7 = _mm_sub_epi32( C7, M7 ); \
C8 = _mm_sub_epi32( C8, M8 ); \
C9 = _mm_sub_epi32( C9, M9 ); \
CA = _mm_sub_epi32( CA, MA ); \
CB = _mm_sub_epi32( CB, MB ); \
CC = _mm_sub_epi32( CC, MC ); \
CD = _mm_sub_epi32( CD, MD ); \
CE = _mm_sub_epi32( CE, ME ); \
CF = _mm_sub_epi32( CF, MF ); \
} while (0)
#define XOR_W \
do { \
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
} while (0)
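/* Folds the 64 bit block counter into the state: Wlow and Whigh are
   broadcast across all four lanes and XORed into A00/A01. */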
/*
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
(v1) = (v2); \
(v2) = tmp; \
} while (0)
*/
#define SWAP_BC \
do { \
mm_swap_128( B0, C0 ); \
mm_swap_128( B1, C1 ); \
mm_swap_128( B2, C2 ); \
mm_swap_128( B3, C3 ); \
mm_swap_128( B4, C4 ); \
mm_swap_128( B5, C5 ); \
mm_swap_128( B6, C6 ); \
mm_swap_128( B7, C7 ); \
mm_swap_128( B8, C8 ); \
mm_swap_128( B9, C9 ); \
mm_swap_128( BA, CA ); \
mm_swap_128( BB, CB ); \
mm_swap_128( BC, CC ); \
mm_swap_128( BD, CD ); \
mm_swap_128( BE, CE ); \
mm_swap_128( BF, CF ); \
} while (0)
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
_mm_andnot_si128( xb3, xb2 ), \
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
_mm_mullo_epi32( mm_rotl_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
) ), _mm_set1_epi32(3UL) ) ) ) ); \
xb0 = mm_not( _mm_xor_si128( xa0, mm_rotl_32( xb0, 1 ) ) ); \
} while (0)
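/*
 * SSE translation of Shabal's scalar element update:
 *   A0 = 3*(A0 ^ 5*ROTL32(A1,15) ^ C) ^ M ^ B1 ^ (B2 & ~B3)
 *   B0 = ~(ROTL32(B0,1) ^ A0)
 * The *3 and *5 are done with _mm_mullo_epi32 (SSE4.1); mm_rotl_32 and
 * mm_not are vector helpers from avxdefs.h.
 */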
#define PERM_STEP_0 do { \
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1 do { \
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2 do { \
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P \
do { \
B0 = mm_rotr_32( B0, 15 ); \
B1 = mm_rotr_32( B1, 15 ); \
B2 = mm_rotr_32( B2, 15 ); \
B3 = mm_rotr_32( B3, 15 ); \
B4 = mm_rotr_32( B4, 15 ); \
B5 = mm_rotr_32( B5, 15 ); \
B6 = mm_rotr_32( B6, 15 ); \
B7 = mm_rotr_32( B7, 15 ); \
B8 = mm_rotr_32( B8, 15 ); \
B9 = mm_rotr_32( B9, 15 ); \
BA = mm_rotr_32( BA, 15 ); \
BB = mm_rotr_32( BB, 15 ); \
BC = mm_rotr_32( BC, 15 ); \
BD = mm_rotr_32( BD, 15 ); \
BE = mm_rotr_32( BE, 15 ); \
BF = mm_rotr_32( BF, 15 ); \
PERM_STEP_0; \
PERM_STEP_1; \
PERM_STEP_2; \
A0B = _mm_add_epi32( A0B, C6 ); \
A0A = _mm_add_epi32( A0A, C5 ); \
A09 = _mm_add_epi32( A09, C4 ); \
A08 = _mm_add_epi32( A08, C3 ); \
A07 = _mm_add_epi32( A07, C2 ); \
A06 = _mm_add_epi32( A06, C1 ); \
A05 = _mm_add_epi32( A05, C0 ); \
A04 = _mm_add_epi32( A04, CF ); \
A03 = _mm_add_epi32( A03, CE ); \
A02 = _mm_add_epi32( A02, CD ); \
A01 = _mm_add_epi32( A01, CC ); \
A00 = _mm_add_epi32( A00, CB ); \
A0B = _mm_add_epi32( A0B, CA ); \
A0A = _mm_add_epi32( A0A, C9 ); \
A09 = _mm_add_epi32( A09, C8 ); \
A08 = _mm_add_epi32( A08, C7 ); \
A07 = _mm_add_epi32( A07, C6 ); \
A06 = _mm_add_epi32( A06, C5 ); \
A05 = _mm_add_epi32( A05, C4 ); \
A04 = _mm_add_epi32( A04, C3 ); \
A03 = _mm_add_epi32( A03, C2 ); \
A02 = _mm_add_epi32( A02, C1 ); \
A01 = _mm_add_epi32( A01, C0 ); \
A00 = _mm_add_epi32( A00, CF ); \
A0B = _mm_add_epi32( A0B, CE ); \
A0A = _mm_add_epi32( A0A, CD ); \
A09 = _mm_add_epi32( A09, CC ); \
A08 = _mm_add_epi32( A08, CB ); \
A07 = _mm_add_epi32( A07, CA ); \
A06 = _mm_add_epi32( A06, C9 ); \
A05 = _mm_add_epi32( A05, C8 ); \
A04 = _mm_add_epi32( A04, C7 ); \
A03 = _mm_add_epi32( A03, C6 ); \
A02 = _mm_add_epi32( A02, C5 ); \
A01 = _mm_add_epi32( A01, C4 ); \
A00 = _mm_add_epi32( A00, C3 ); \
} while (0)
#define INCR_W do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
} while (0)
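/* The 64 bit block counter is kept as two 32 bit words so XOR_W can
   broadcast each half separately. */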
static const sph_u32 A_init_256[] = {
C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
};
static const sph_u32 B_init_256[] = {
C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
};
static const sph_u32 C_init_256[] = {
C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
};
static const sph_u32 A_init_512[] = {
C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
};
static const sph_u32 B_init_512[] = {
C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
};
static const sph_u32 C_init_512[] = {
C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
};
static void
shabal_4way_init( void *cc, unsigned size )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
int i;
if ( size == 512 )
{
for ( i = 0; i < 12; i++ )
sc->A[i] = _mm_set1_epi32( A_init_512[i] );
for ( i = 0; i < 16; i++ )
{
sc->B[i] = _mm_set1_epi32( B_init_512[i] );
sc->C[i] = _mm_set1_epi32( C_init_512[i] );
}
}
else
{
for ( i = 0; i < 12; i++ )
sc->A[i] = _mm_set1_epi32( A_init_256[i] );
for ( i = 0; i < 16; i++ )
{
sc->B[i] = _mm_set1_epi32( B_init_256[i] );
sc->C[i] = _mm_set1_epi32( C_init_256[i] );
}
}
sc->Wlow = 1;
sc->Whigh = 0;
sc->ptr = 0;
}
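/* The IV tables above are presumably the precomputed state left by
   Shabal's prefix blocks (as in the sph_shabal reference), which is why
   the block counter resumes at W = 1 rather than 0. */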
static void
shabal_4way_core( void *cc, const unsigned char *data, size_t len )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
__m128i *buf;
__m128i *vdata = (__m128i*)data;
const int buf_size = 64;
size_t ptr;
DECL_STATE
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr ) )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += clen>>2;
len -= clen;
if ( ptr == buf_size )
{
DECODE_BLOCK;
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
INPUT_BLOCK_SUB;
SWAP_BC;
INCR_W;
ptr = 0;
}
}
WRITE_STATE(sc);
sc->ptr = ptr;
}
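/* Note: data must already be interleaved 4x32, one __m128i per 32 bit
   word, while len and ptr count bytes of a single 64 byte lane block. */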
static void
shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
__m128i *buf;
const int buf_size = 64;
size_t ptr;
int i;
unsigned z, zz;
DECL_STATE
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
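/* zz is the final padding byte: the n leftover input bits from ub
   followed by a single 1 bit (z = 0x80 >> n), broadcast to all lanes. */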
buf[ptr>>2] = _mm_set1_epi32( zz );
memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
READ_STATE(sc);
DECODE_BLOCK;
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
for ( i = 0; i < 3; i ++ )
{
SWAP_BC;
XOR_W;
APPLY_P;
}
__m128i *d = (__m128i*)dst;
if ( size_words == 16 ) // 512
{
d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
}
else // 256
{
d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
}
}
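/* Finalization: process the padded block, then run three extra
   permutation rounds without incrementing W, and read the digest from
   the B words (the upper eight of them for Shabal-256). */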
void
shabal256_4way_init( void *cc )
{
shabal_4way_init(cc, 256);
}
void
shabal256_4way( void *cc, const void *data, size_t len )
{
shabal_4way_core( cc, data, len );
}
void
shabal256_4way_close( void *cc, void *dst )
{
shabal_4way_close(cc, 0, 0, dst, 8);
}
void
shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
{
shabal_4way_close(cc, ub, n, dst, 8);
}
void
shabal512_4way_init(void *cc)
{
shabal_4way_init(cc, 512);
}
void
shabal512_4way(void *cc, const void *data, size_t len)
{
shabal_4way_core(cc, data, len);
}
void
shabal512_4way_close(void *cc, void *dst)
{
shabal_4way_close(cc, 0, 0, dst, 16);
}
void
shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_4way_close(cc, ub, n, dst, 16);
}
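/* Hypothetical usage sketch, assuming a 4x32 interleave helper along
   the lines of those in avxdefs.h (names here are illustrative only):

     shabal512_4way_context ctx;
     uint32_t vin[20*4]  __attribute__ ((aligned (64)));
     uint32_t vout[16*4] __attribute__ ((aligned (64)));
     interleave_4x32( vin, in0, in1, in2, in3, 640 );  // 4 x 80 bytes
     shabal512_4way_init( &ctx );
     shabal512_4way( &ctx, vin, 80 );
     shabal512_4way_close( &ctx, vout );  // 4 digests, interleaved 4x32
*/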
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,82 @@
/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
/**
* Shabal interface. Shabal is a family of functions which differ by
* their output size; this 4-way implementation defines Shabal for
* output sizes 256 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file shabal-hash-4way.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SHABAL_HASH_4WAY_H__
#define SHABAL_HASH_4WAY_H__ 1
#ifdef __AVX2__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#ifdef __cplusplus
extern "C"{
#endif
#define SPH_SIZE_shabal256 256
#define SPH_SIZE_shabal512 512
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
size_t ptr;
} shabal_4way_context;
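/* buf holds one 64 byte block per lane, one __m128i per 32 bit word;
   ptr counts the buffered bytes of a single lane. */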
typedef shabal_4way_context shabal256_4way_context;
typedef shabal_4way_context shabal512_4way_context;
void shabal256_4way_init( void *cc );
void shabal256_4way( void *cc, const void *data, size_t len );
void shabal256_4way_close( void *cc, void *dst );
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_4way_init( void *cc );
void shabal512_4way( void *cc, const void *data, size_t len );
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#ifdef __cplusplus
}
#endif
#endif
#endif

View File

@@ -267,9 +267,6 @@ c512(sph_shavite_big_context *sc, const void *msg)
#else
/*
* This function assumes that "msg" is aligned for 32-bit access.
*/
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
@@ -379,36 +376,36 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
@@ -461,36 +458,36 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );

View File

@@ -6,12 +6,11 @@ int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT| AVX2_OPT | SHA_OPT;
gate->optimizations = FOUR_WAY_OPT | SHA_OPT;
#if defined (SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#else
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif

View File

@@ -4,6 +4,7 @@ bool register_whirlpool_algo( algo_gate_t* gate )
{
#if defined (WHIRLPOOL_4WAY)
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_whirlpool_4way;
gate->hash = (void*)&whirlpool_hash_4way;
#else

View File

@@ -4,9 +4,11 @@
#include "algo-gate-api.h"
#include <stdint.h>
/*
#if defined(FOUR_WAY) && defined(__AVX2__)
#define WHIRLPOOL_4WAY
#endif
*/
#if defined (WHIRLPOOL_4WAY)

View File

@@ -3345,8 +3345,10 @@ do { \
#define READ_STATE MUL8(READ_STATE_W)
#define ROUND0 MUL8(ROUND0_W)
#define UPDATE_STATE MUL8(UPDATE_STATE_W)
#define BYTE(x, n) \
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
//#define BYTE(x, n) \
// _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF)
// A very complex, but structured, expression with a mix of scalar
// and vector operations to retrieve specific 64 bit constants from
@@ -3357,23 +3359,51 @@ do { \
// Extract 64 bit vector elements from "in" representing offsets. Mask off the
// low byte of each and scale for use as vector indexes.
// Pack the data in a vector and return it.
/*
#define t_row( inv, row ) \
_mm256_and_si256( \
_mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) )
// Extract vector element from "lane" of vector "in[row]" and use it to index
// scalar array of constants "table" and return referenced 64 bit entry.
#define t_lane( table, inv, row, lane ) \
table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ]
// table[ t_row( inv, row )[ lane ] ];
*/
// Build a vector from elements of non-contiguous 64 bit data extracted from
// scalar "table".
// reference scalar version 1480 kH/s
/*
// version 1, extract with gather
// 955 kH/s
#define t_lane( inv, row, lane ) \
BYTE( _mm256_extract_epi64( inv, lane ), row ) \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
_mm256_i32gather_epi64( table, _mm_set_epi32( t_lane( inv, row, 3 ), \
t_lane( inv, row, 2 ), t_lane( inv, row, 1 ), \
t_lane( inv, row, 0) ), 1 )
*/
/*
// version 2, extract with set
// 1100 kH/s
#define t_lane( table, inv, row, lane ) \
table[ BYTE( _mm256_extract_epi64( inv, lane ), row ) ] \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
*/
// version 3, vector indexing with set
// 1105 kH/s
#define t_lane( table, inv, row, lane ) \
table[ BYTE( inv[ lane ], row ) ] \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
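// Version 3 reads the vector element through memory ( inv[lane] )
// instead of the cross-lane extracts of version 2 or the gather of
// version 1, which appears to explain the kH/s spread noted above.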
#if SPH_SMALL_FOOTPRINT_WHIRLPOOL

algo/x11/c11-4way.c Normal file
View File

@@ -0,0 +1,252 @@
#include "cpuminer-config.h"
#include "c11-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} c11_4way_ctx_holder;
c11_4way_ctx_holder c11_4way_ctx;
void init_c11_4way_ctx()
{
blake512_4way_init( &c11_4way_ctx.blake );
bmw512_4way_init( &c11_4way_ctx.bmw );
init_groestl( &c11_4way_ctx.groestl, 64 );
skein512_4way_init( &c11_4way_ctx.skein );
jh512_4way_init( &c11_4way_ctx.jh );
keccak512_4way_init( &c11_4way_ctx.keccak );
init_luffa( &c11_4way_ctx.luffa, 512 );
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_4way_ctx.shavite );
init_sd( &c11_4way_ctx.simd, 512 );
init_echo( &c11_4way_ctx.echo, 512 );
}
void c11_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
c11_4way_ctx_holder ctx;
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
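// Lanes are deinterleaved here because the Groestl implementation below
// processes one stream at a time.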
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 5 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// 6 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
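// With 4x64 interleaving every 64 bit header word occupies 4 adjacent
// slots, so the nonce (32 bit word 19) of lane k lands at 32 bit index
// 9*8 + 2*k + 1: hence 73, 75, 77, 79.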
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
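// Early-out filter: pick the loosest mask consistent with Htarg, reject
// candidates with a single AND, and run the full fulltest() comparison
// only for survivors.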
// big endian encode all 20 header words; the nonce (word 19) is
// rewritten each pass below
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
c11_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x11/c11-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "c11-gate.h"
bool register_c11_algo( algo_gate_t* gate )
{
#if defined (C11_4WAY)
init_c11_4way_ctx();
gate->scanhash = (void*)&scanhash_c11_4way;
gate->hash = (void*)&c11_4way_hash;
#else
init_c11_ctx();
gate->scanhash = (void*)&scanhash_c11;
gate->hash = (void*)&c11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x11/c11-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef C11_GATE_H__
#define C11_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define C11_4WAY
#endif
bool register_c11_algo( algo_gate_t* gate );
#if defined(C11_4WAY)
void c11_4way_hash( void *state, const void *input );
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_c11_4way_ctx();
#endif
void c11_hash( void *state, const void *input );
int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_c11_ctx();
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "c11-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -64,7 +64,7 @@ void init_c11_ctx()
#endif
}
void c11hash( void *output, const void *input )
void c11_hash( void *output, const void *input )
{
unsigned char hash[128] _ALIGN(64); // uint32_t hashA[16], hashB[16];
// uint32_t _ALIGN(64) hash[16];
@@ -157,7 +157,7 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
do
{
be32enc( &endiandata[19], nonce );
c11hash( hash, endiandata );
c11_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
{
pdata[19] = nonce;
@@ -171,13 +171,3 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_c11_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_c11_ctx();
gate->scanhash = (void*)&scanhash_c11;
gate->hash = (void*)&c11hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -10,8 +10,14 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64)));
static __thread jh512_4way_context ctx_mid;
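// Per-thread JH midstate, presumably computed once over the constant
// first 64 bytes of the 80 byte header so each nonce pass only hashes
// the remaining 16 bytes.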
/*
void init_tribus_4way_ctx()
{
init_echo( &tribus_4way_ctx, 512 );
}
*/
void tribus_hash_4way(void *state, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));

View File

@@ -1,22 +1,11 @@
#include "tribus-gate.h"
/*
bool tribus_thread_init()
{
sph_jh512_init( &tribus_ctx.jh );
sph_keccak512_init( &tribus_ctx.keccak );
#ifdef NO_AES_NI
sph_echo512_init( &tribus_ctx.echo );
#else
init_echo( &tribus_ctx.echo, 512 );
#endif
return true;
}
*/
bool register_tribus_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x1ffff;
#if defined (TRIBUS_4WAY)
// init_tribus_4way_ctx();
gate->scanhash = (void*)&scanhash_tribus_4way;
gate->hash = (void*)&tribus_hash_4way;
#else

View File

@@ -4,12 +4,14 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#if defined(HASH_4WAY) && defined(__AES__)
#define TRIBUS_4WAY
#endif
#if defined(TRIBUS_4WAY)
//void init_tribus_4way_ctx();
void tribus_hash_4way( void *state, const void *input );
int scanhash_tribus_4way( int thr_id, struct work *work, uint32_t max_nonce,

algo/x11/x11-4way.c Normal file
View File

@@ -0,0 +1,252 @@
#include "cpuminer-config.h"
#include "x11-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11_4way_ctx_holder;
x11_4way_ctx_holder x11_4way_ctx;
void init_x11_4way_ctx()
{
blake512_4way_init( &x11_4way_ctx.blake );
bmw512_4way_init( &x11_4way_ctx.bmw );
init_groestl( &x11_4way_ctx.groestl, 64 );
skein512_4way_init( &x11_4way_ctx.skein );
jh512_4way_init( &x11_4way_ctx.jh );
keccak512_4way_init( &x11_4way_ctx.keccak );
init_luffa( &x11_4way_ctx.luffa, 512 );
cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_4way_ctx.shavite );
init_sd( &x11_4way_ctx.simd, 512 );
init_echo( &x11_4way_ctx.echo, 512 );
}
void x11_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11_4way_ctx_holder ctx;
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode all 20 header words; the nonce (word 19) is
// rewritten each pass below
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -5,13 +5,13 @@ bool register_x11_algo( algo_gate_t* gate )
#if defined (X11_4WAY)
init_x11_4way_ctx();
gate->scanhash = (void*)&scanhash_x11_4way;
gate->hash = (void*)&x11_hash_4way;
gate->hash = (void*)&x11_4way_hash;
#else
init_x11_ctx();
gate->scanhash = (void*)&scanhash_x11;
gate->hash = (void*)&x11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -4,19 +4,21 @@
#include "algo-gate-api.h"
#include <stdint.h>
//#if defined(HASH_4WAY) && !defined(NO_AES_NI)
// #define X11_4WAY
//#endif
#if defined(HASH_4WAY) && defined(__AES__)
#define X11_4WAY
#endif
bool register_x11_algo( algo_gate_t* gate );
#if defined(X11_4WAY)
void x11_hash_4way( void *state, const void *input );
void x11_4way_hash( void *state, const void *input );
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11_4way_ctx();
#endif
void x11_hash( void *state, const void *input );

algo/x11/x11gost-4way.c Normal file
View File

@@ -0,0 +1,259 @@
#include "cpuminer-config.h"
#include "x11gost-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
sph_gost512_context gost;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11gost_4way_ctx_holder;
x11gost_4way_ctx_holder x11gost_4way_ctx;
void init_x11gost_4way_ctx()
{
blake512_4way_init( &x11gost_4way_ctx.blake );
bmw512_4way_init( &x11gost_4way_ctx.bmw );
init_groestl( &x11gost_4way_ctx.groestl, 64 );
skein512_4way_init( &x11gost_4way_ctx.skein );
jh512_4way_init( &x11gost_4way_ctx.jh );
keccak512_4way_init( &x11gost_4way_ctx.keccak );
sph_gost512_init( &x11gost_4way_ctx.gost );
init_luffa( &x11gost_4way_ctx.luffa, 512 );
cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11gost_4way_ctx.shavite );
init_sd( &x11gost_4way_ctx.simd, 512 );
init_echo( &x11gost_4way_ctx.echo, 512 );
}
void x11gost_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11gost_4way_ctx_holder ctx;
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash2, 64 );
sph_gost512_close( &ctx.gost, hash2 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode all 20 header words; the nonce (word 19) is
// rewritten each pass below
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11gost_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x11/x11gost-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "x11gost-gate.h"
bool register_x11gost_algo( algo_gate_t* gate )
{
#if defined (X11GOST_4WAY)
init_x11gost_4way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_4way;
gate->hash = (void*)&x11gost_4way_hash;
#else
init_x11gost_ctx();
gate->scanhash = (void*)&scanhash_x11gost;
gate->hash = (void*)&x11gost_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x11/x11gost-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef X11GOST_GATE_H__
#define X11GOST_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X11GOST_4WAY
#endif
bool register_x11gost_algo( algo_gate_t* gate );
#if defined(X11GOST_4WAY)
void x11gost_4way_hash( void *state, const void *input );
int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11gost_4way_ctx();
#endif
void x11gost_hash( void *state, const void *input );
int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11gost_ctx();
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x11gost-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -37,28 +37,28 @@ typedef struct {
hashState_echo echo;
hashState_groestl groestl;
#endif
} sib_ctx_holder;
} x11gost_ctx_holder;
sib_ctx_holder sib_ctx;
x11gost_ctx_holder x11gost_ctx;
void init_sib_ctx()
void init_x11gost_ctx()
{
sph_gost512_init(&sib_ctx.gost);
sph_shavite512_init(&sib_ctx.shavite);
init_luffa( &sib_ctx.luffa, 512 );
cubehashInit( &sib_ctx.cube, 512, 16, 32 );
init_sd( &sib_ctx.simd, 512 );
sph_gost512_init( &x11gost_ctx.gost );
sph_shavite512_init( &x11gost_ctx.shavite );
init_luffa( &x11gost_ctx.luffa, 512 );
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
init_sd( &x11gost_ctx.simd, 512 );
#ifdef NO_AES_NI
sph_groestl512_init( &sib_ctx.groestl );
sph_echo512_init( &sib_ctx.echo );
sph_groestl512_init( &x11gost_ctx.groestl );
sph_echo512_init( &x11gost_ctx.echo );
#else
init_echo( &sib_ctx.echo, 512 );
init_groestl( &sib_ctx.groestl, 64 );
init_echo( &x11gost_ctx.echo, 512 );
init_groestl( &x11gost_ctx.groestl, 64 );
#endif
}
void sibhash(void *output, const void *input)
void x11gost_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (64)));
#define hashA hash
@@ -69,8 +69,8 @@ void sibhash(void *output, const void *input)
sph_u64 hashctA;
sph_u64 hashctB;
sib_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sib_ctx, sizeof(sib_ctx) );
x11gost_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) );
DECL_BLK;
BLK_I;
@@ -135,8 +135,8 @@ void sibhash(void *output, const void *input)
memcpy(output, hashA, 32);
}
int scanhash_sib(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -156,7 +156,7 @@ int scanhash_sib(int thr_id, struct work *work,
do {
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
sibhash(hash, endiandata);
x11gost_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
@@ -172,12 +172,3 @@ int scanhash_sib(int thr_id, struct work *work,
return 0;
}
bool register_sib_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_sib_ctx();
gate->scanhash = (void*)&scanhash_sib;
gate->hash = (void*)&sibhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

algo/x13/phi1612-4way.c Normal file
View File

@@ -0,0 +1,186 @@
#include "x13-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
cubehashParam cube;
sph_fugue512_context fugue;
sph_gost512_context gost;
hashState_echo echo;
} phi1612_4way_ctx_holder;
phi1612_4way_ctx_holder phi1612_4way_ctx __attribute__ ((aligned (64)));
void init_phi1612_4way_ctx()
{
skein512_4way_init( &phi1612_4way_ctx.skein );
jh512_4way_init( &phi1612_4way_ctx.jh );
cubehashInit( &phi1612_4way_ctx.cube, 512, 16, 32 );
sph_fugue512_init( &phi1612_4way_ctx.fugue );
sph_gost512_init( &phi1612_4way_ctx.gost );
init_echo( &phi1612_4way_ctx.echo, 512 );
};
void phi1612_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
phi1612_4way_ctx_holder ctx;
memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) );
// Skein parallel 4way
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// Gost
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash2, 64 );
sph_gost512_close( &ctx.gost, hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
// Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t _ALIGN(64) endiandata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
phi1612_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
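The nonce-pointer arithmetic above (vdata + 73, commented as 9*8 + 1, with the other lanes two words apart) follows directly from the 4x64 interleave. A minimal self-contained sketch, assuming the usual layout where 64-bit word w of lane l lands at vector index w*4 + l, and x86 little-endian ordering of the 32-bit halves:

#include <stdint.h>
#include <stdio.h>

/* The 80-byte header is 20 uint32_t words and the nonce is word 19.
   After mm256_interleave_4x64 the buffer holds 64-bit words with the
   four lanes adjacent, so 64-bit word w of lane l sits at w*4 + l.
   On little-endian x86 the high 32-bit half (header word 19) of the
   64-bit word at index v is the uint32_t at index v*2 + 1.           */
static int nonce_index_32( int lane )
{
    int w64 = 19 / 2;          /* 64-bit header word holding the nonce */
    int v64 = w64 * 4 + lane;  /* its position after 4-way interleave  */
    return v64 * 2 + 1;        /* 32-bit index of the high half        */
}

int main()
{
    for ( int lane = 0; lane < 4; lane++ )
        printf( "lane %d -> vdata + %d\n", lane, nonce_index_32( lane ) );
    return 0;   /* prints 73, 75, 77, 79: noncep0..noncep3 above */
}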

18
algo/x13/phi1612-gate.c Normal file

@@ -0,0 +1,18 @@
#include "phi1612-gate.h"
bool register_phi1612_algo( algo_gate_t* gate )
{
#if defined(PHI1612_4WAY)
init_phi1612_4way_ctx();
gate->scanhash = (void*)&scanhash_phi1612_4way;
gate->hash = (void*)&phi1612_4way_hash;
#else
init_phi1612_ctx();
gate->scanhash = (void*)&scanhash_phi1612;
gate->hash = (void*)&phi1612_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x13/phi1612-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef PHI1612_GATE_H__
#define PHI1612_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define PHI1612_4WAY
#endif
bool register_phi1612_algo( algo_gate_t* gate );
#if defined(PHI1612_4WAY)
void phi1612_4way_hash( void *state, const void *input );
int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_phi1612_4way_ctx();
#endif
void phi1612_hash( void *state, const void *input );
int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_phi1612_ctx();
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "phi1612-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -33,7 +33,7 @@ phi_ctx_holder phi_ctx;
static __thread sph_skein512_context phi_skein_mid
__attribute__ ((aligned (64)));
void init_phi_ctx()
void init_phi1612_ctx()
{
sph_skein512_init( &phi_ctx.skein );
sph_jh512_init( &phi_ctx.jh );
@@ -53,7 +53,7 @@ void phi_skein_midstate( const void* input )
sph_skein512( &phi_skein_mid, input, 64 );
}
void phi1612hash(void *output, const void *input)
void phi1612_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (64)));
phi_ctx_holder ctx __attribute__ ((aligned (64)));
@@ -112,7 +112,7 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
do {
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
phi1612hash(hash, endiandata);
phi1612_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
@@ -128,12 +128,3 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_phi1612_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_phi_ctx();
gate->scanhash = (void*)&scanhash_phi1612;
gate->hash = (void*)&phi1612hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

158
algo/x13/skunk-4way.c Normal file

@@ -0,0 +1,158 @@
#include "skunk-gate.h"
#ifdef __AVX2__
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/skein-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
typedef struct {
skein512_4way_context skein;
cubehashParam cube;
sph_fugue512_context fugue;
sph_gost512_context gost;
} skunk_4way_ctx_holder;
static __thread skunk_4way_ctx_holder skunk_4way_ctx;
void skunk_4way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
skunk_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) );
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash2, 64 );
sph_gost512_close( &ctx.gost, hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_skunk_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
((uint32_t*)ptarget)[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
skunk_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return num_found;
}
bool skunk_4way_thread_init()
{
skein512_4way_init( &skunk_4way_ctx.skein );
cubehashInit( &skunk_4way_ctx.cube, 512, 16, 32 );
sph_fugue512_init( &skunk_4way_ctx.fugue );
sph_gost512_init( &skunk_4way_ctx.gost );
return true;
}
#endif
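Unlike the other 4-way files, skunk keeps its prototype context thread-local and initialises it once through the gate's miner_thread_init hook; each hash call then restores state with a cheap memcpy instead of re-running every init function. A minimal sketch of that pattern, with illustrative names (proto_ctx_t and the example_* functions are not from the repo):

#include <string.h>
#include <stdbool.h>

typedef struct { int state; } proto_ctx_t;   /* stand-in for the holder */
static __thread proto_ctx_t proto_ctx;       /* one copy per thread     */

static bool example_thread_init( void )      /* once per miner thread   */
{
    proto_ctx.state = 0;                     /* expensive init here     */
    return true;
}

static void example_hash( void )
{
    proto_ctx_t ctx;
    memcpy( &ctx, &proto_ctx, sizeof(ctx) ); /* fast per-call reset     */
    /* ... run the hash chain on ctx ... */
}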

18
algo/x13/skunk-gate.c Normal file

@@ -0,0 +1,18 @@
#include "skunk-gate.h"
bool register_skunk_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
#if defined (SKUNK_4WAY)
gate->miner_thread_init = (void*)&skunk_4way_thread_init;
gate->scanhash = (void*)&scanhash_skunk_4way;
gate->hash = (void*)&skunk_4way_hash;
// init_skunk_4way_ctx();
#else
gate->miner_thread_init = (void*)&skunk_thread_init;
gate->scanhash = (void*)&scanhash_skunk;
gate->hash = (void*)&skunkhash;
#endif
return true;
}

33
algo/x13/skunk-gate.h Normal file

@@ -0,0 +1,33 @@
#ifndef SKUNK_GATE_H__
#define SKUNK_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY)
#define SKUNK_4WAY
#endif
bool register_skunk_algo( algo_gate_t* gate );
#if defined(SKUNK_4WAY)
void skunk_4way_hash( void *state, const void *input );
int scanhash_skunk_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool skunk_4way_thread_init();
//void init_skunk_4way_ctx();
#endif
void skunkhash( void *state, const void *input );
int scanhash_skunk( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool skunk_thread_init();
#endif

View File

@@ -1,10 +1,8 @@
#include "algo-gate-api.h"
#include "skunk-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/gost/sph_gost.h"
#include "algo/skein/sph_skein.h"
#include "algo/fugue/sph_fugue.h"
@@ -90,12 +88,3 @@ bool skunk_thread_init()
sph_gost512_init( &skunk_ctx.gost );
return true;
}
bool register_skunk_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&skunk_thread_init;
gate->scanhash = (void*)&scanhash_skunk;
gate->hash = (void*)&skunkhash;
return true;
}

284
algo/x13/x13-4way.c Normal file

@@ -0,0 +1,284 @@
#include "x13-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
} x13_4way_ctx_holder;
x13_4way_ctx_holder x13_4way_ctx __attribute__ ((aligned (64)));
void init_x13_4way_ctx()
{
blake512_4way_init( &x13_4way_ctx.blake );
bmw512_4way_init( &x13_4way_ctx.bmw );
init_groestl( &x13_4way_ctx.groestl, 64 );
skein512_4way_init( &x13_4way_ctx.skein );
jh512_4way_init( &x13_4way_ctx.jh );
keccak512_4way_init( &x13_4way_ctx.keccak );
init_luffa( &x13_4way_ctx.luffa, 512 );
cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13_4way_ctx.shavite );
init_sd( &x13_4way_ctx.simd, 512 );
init_echo( &x13_4way_ctx.echo, 512 );
sph_hamsi512_init( &x13_4way_ctx.hamsi );
sph_fugue512_init( &x13_4way_ctx.fugue );
};
void x13_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x13_4way_ctx_holder ctx;
memcpy( &ctx, &x13_4way_ctx, sizeof(x13_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode the 20 uint32_t header words; the nonce lanes are
// rewritten in vdata below
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x13_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
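The htmax/masks pair above implements a cheap pre-filter: the loop picks the widest mask such that any hash word 7 with a bit set under the mask already exceeds the target, so the expensive fulltest() runs only on plausible candidates. A standalone sketch of the selection:

#include <stdint.h>

/* Mirror of the mask selection in scanhash_x13_4way: the smaller the
   target (harder difficulty), the more mask bits are set and the more
   candidates a single AND rejects before fulltest().                 */
static uint32_t select_mask( uint32_t Htarg )
{
    const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
    const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                               0xFFFFF000, 0xFFFF0000, 0 };
    for ( int m = 0; m < 6; m++ )
        if ( Htarg <= htmax[m] )
            return masks[m];
    return 0;   /* very easy target: test every hash in full */
}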

18
algo/x13/x13-gate.c Normal file

@@ -0,0 +1,18 @@
#include "x13-gate.h"
bool register_x13_algo( algo_gate_t* gate )
{
#if defined (X13_4WAY)
init_x13_4way_ctx();
gate->scanhash = (void*)&scanhash_x13_4way;
gate->hash = (void*)&x13_4way_hash;
#else
init_x13_ctx();
gate->scanhash = (void*)&scanhash_x13;
gate->hash = (void*)&x13hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x13/x13-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X13_GATE_H__
#define X13_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X13_4WAY
#endif
bool register_x13_algo( algo_gate_t* gate );
#if defined(X13_4WAY)
void x13_4way_hash( void *state, const void *input );
int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13_4way_ctx();
#endif
void x13hash( void *state, const void *input );
int scanhash_x13( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13_ctx();
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x13-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -68,7 +68,7 @@ void init_x13_ctx()
sph_fugue512_init( &x13_ctx.fugue );
};
static void x13hash(void *output, const void *input)
void x13hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
@@ -249,15 +249,3 @@ int scanhash_x13(int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
bool register_x13_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x13_ctx();
gate->scanhash = (void*)&scanhash_x13;
gate->hash = (void*)&x13hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

319
algo/x13/x13sm3-4way.c Normal file

@@ -0,0 +1,319 @@
#include "x13sm3-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/sm3/sph_sm3.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sm3_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
} x13sm3_4way_ctx_holder;
x13sm3_4way_ctx_holder x13sm3_4way_ctx __attribute__ ((aligned (64)));
static __thread blake512_4way_context x13sm3_ctx_mid;
void init_x13sm3_4way_ctx()
{
blake512_4way_init( &x13sm3_4way_ctx.blake );
bmw512_4way_init( &x13sm3_4way_ctx.bmw );
init_groestl( &x13sm3_4way_ctx.groestl, 64 );
skein512_4way_init( &x13sm3_4way_ctx.skein );
jh512_4way_init( &x13sm3_4way_ctx.jh );
keccak512_4way_init( &x13sm3_4way_ctx.keccak );
init_luffa( &x13sm3_4way_ctx.luffa, 512 );
cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13sm3_4way_ctx.shavite );
init_sd( &x13sm3_4way_ctx.simd, 512 );
init_echo( &x13sm3_4way_ctx.echo, 512 );
sm3_init( &x13sm3_4way_ctx.sm3 );
sph_hamsi512_init( &x13sm3_4way_ctx.hamsi );
sph_fugue512_init( &x13sm3_4way_ctx.fugue );
};
void x13sm3_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x13sm3_4way_ctx_holder ctx;
memcpy( &ctx, &x13sm3_4way_ctx, sizeof(x13sm3_4way_ctx) );
// Blake
memcpy( &ctx.blake, &x13sm3_ctx_mid, sizeof(x13sm3_ctx_mid) );
blake512_4way( &ctx.blake, input + (64<<2), 16 );
// blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// SM3
uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
memset( sm3_hash0, 0, sizeof sm3_hash0 );
uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
memset( sm3_hash1, 0, sizeof sm3_hash1 );
uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
memset( sm3_hash2, 0, sizeof sm3_hash2 );
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 );
sph_sm3( &ctx.sm3, hash0, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash0 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash1, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash1 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash2, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash2 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash3, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash3 );
// Hamsi
sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, sm3_hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, sm3_hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, sm3_hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode the 20 uint32_t header words; the nonce lanes are
// rewritten in vdata below
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
blake512_4way_init( &x13sm3_ctx_mid );
blake512_4way( &x13sm3_ctx_mid, vdata, 64 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x13sm3_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
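About the (64<<2) offset in x13sm3_4way_hash above: the 4-way functions take per-lane lengths, so absorbing the constant first 64 header bytes into x13sm3_ctx_mid consumes 64 * 4 = 256 bytes of the interleaved buffer, and the 16-byte per-lane tail holding the nonce starts at byte 256 = 64<<2. A small constants-only sketch of that arithmetic (names are illustrative):

enum {
    HEADER_BYTES   = 80,                      /* per lane                 */
    MIDSTATE_BYTES = 64,                      /* constant part, absorbed
                                                 once per work unit       */
    LANES          = 4,
    TAIL_OFFSET    = MIDSTATE_BYTES * LANES,  /* 256 == 64 << 2, offset
                                                 into interleaved vdata   */
    TAIL_BYTES     = HEADER_BYTES - MIDSTATE_BYTES  /* 16, incl. nonce    */
};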

18
algo/x13/x13sm3-gate.c Normal file

@@ -0,0 +1,18 @@
#include "x13sm3-gate.h"
bool register_x13sm3_algo( algo_gate_t* gate )
{
#if defined (X13SM3_4WAY)
init_x13sm3_4way_ctx();
gate->scanhash = (void*)&scanhash_x13sm3_4way;
gate->hash = (void*)&x13sm3_4way_hash;
#else
init_x13sm3_ctx();
gate->scanhash = (void*)&scanhash_x13sm3;
gate->hash = (void*)&x13sm3_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x13/x13sm3-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X13SM3_GATE_H__
#define X13SM3_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X13SM3_4WAY
#endif
bool register_x13sm3_algo( algo_gate_t* gate );
#if defined(X13SM3_4WAY)
void x13sm3_4way_hash( void *state, const void *input );
int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13sm3_4way_ctx();
#endif
void x13sm3_hash( void *state, const void *input );
int scanhash_x13sm3( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13sm3_ctx();
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x13sm3-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -49,7 +49,7 @@ typedef struct {
hsr_ctx_holder hsr_ctx;
void init_hsr_ctx()
void init_x13sm3_ctx()
{
#ifdef NO_AES_NI
sph_groestl512_init(&hsr_ctx.groestl);
@@ -67,7 +67,7 @@ void init_hsr_ctx()
sph_fugue512_init(&hsr_ctx.fugue);
};
static void x13sm3hash(void *output, const void *input)
void x13sm3_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
@@ -213,7 +213,7 @@ int scanhash_x13sm3( int thr_id, struct work *work,
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
x13sm3hash(hash64, endiandata);
x13sm3_hash(hash64, endiandata);
#ifndef DEBUG_ALGO
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
@@ -240,13 +240,3 @@ int scanhash_x13sm3( int thr_id, struct work *work,
return 0;
}
bool register_x13sm3_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_hsr_ctx();
gate->scanhash = (void*)&scanhash_x13sm3;
gate->hash = (void*)&x13sm3hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

185
algo/x14/polytimos-4way.c Normal file

@@ -0,0 +1,185 @@
#include "polytimos-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/skein-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/fugue//sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
//#include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
skein512_4way_context skein;
shabal512_4way_context shabal;
hashState_echo echo;
hashState_luffa luffa;
sph_fugue512_context fugue;
sph_gost512_context gost;
} poly_4way_ctx_holder;
poly_4way_ctx_holder poly_4way_ctx;
void init_polytimos_4way_ctx()
{
skein512_4way_init( &poly_4way_ctx.skein );
shabal512_4way_init( &poly_4way_ctx.shabal );
init_echo( &poly_4way_ctx.echo, 512 );
init_luffa( &poly_4way_ctx.luffa, 512 );
sph_fugue512_init( &poly_4way_ctx.fugue );
sph_gost512_init( &poly_4way_ctx.gost );
}
void polytimos_4way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
poly_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &poly_4way_ctx, sizeof(poly_4way_ctx) );
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// Need to convert from 64 bit interleaved to 32 bit interleaved.
uint32_t vhash32[16*4];
mm256_reinterleave_4x32( vhash32, vhash, 512 );
shabal512_4way( &ctx.shabal, vhash32, 64 );
shabal512_4way_close( &ctx.shabal, vhash32 );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash2, 64 );
sph_gost512_close( &ctx.gost, hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
polytimos_4way_hash(hash, vdata);
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart));
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
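Skein's 4-way form works on 64-bit interleaved lanes while Shabal's works on 32-bit interleaved lanes, hence the mm256_reinterleave_4x32 call between them. A scalar reference for that repacking, with the layout inferred from the calling code rather than taken from the repo's SIMD implementation:

#include <stdint.h>

/* src: 64-bit interleaved, src[w*4 + lane] = 64-bit word w of lane.
   dst: 32-bit interleaved, dst[w*4 + lane] = 32-bit word w of lane.
   Each 64-bit word splits into two consecutive 32-bit lane words.   */
static void reinterleave_4x32_ref( uint32_t *dst, const uint64_t *src,
                                   int bits )
{
    int n64 = bits / 64;                     /* 64-bit words per lane */
    for ( int w = 0; w < n64; w++ )
        for ( int lane = 0; lane < 4; lane++ )
        {
            uint64_t v = src[ w*4 + lane ];
            dst[ (2*w)  *4 + lane ] = (uint32_t) v;         /* low half  */
            dst[ (2*w+1)*4 + lane ] = (uint32_t)(v >> 32);  /* high half */
        }
}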

View File

@@ -2,10 +2,16 @@
bool register_polytimos_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
#ifdef POLYTIMOS_4WAY
init_polytimos_4way_ctx();
gate->scanhash = (void*)&scanhash_polytimos_4way;
gate->hash = (void*)&polytimos_4way_hash;
#else
init_polytimos_ctx();
gate->scanhash = (void*)&scanhash_polytimos;
gate->hash = (void*)&polytimos_hash;
#endif
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x14/polytimos-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef POLYTIMOS_GATE_H__
#define POLYTIMOS_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define POLYTIMOS_4WAY
#endif
bool register_polytimos_algo( algo_gate_t* gate );
#if defined(POLYTIMOS_4WAY)
void polytimos_4way_hash( void *state, const void *input );
int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_polytimos_4way_ctx();
#endif
void polytimos_hash( void *state, const void *input );
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_polytimos_ctx();
#endif

154
algo/x14/veltor-4way.c Normal file

@@ -0,0 +1,154 @@
#include "veltor-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#if defined(__AVX2__) && defined(__AES__)
#include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/gost/sph_gost.h"
typedef struct {
skein512_4way_context skein;
sph_shavite512_context shavite;
shabal512_4way_context shabal;
sph_gost512_context gost;
} veltor_4way_ctx_holder;
veltor_4way_ctx_holder veltor_4way_ctx __attribute__ ((aligned (64)));
void init_veltor_4way_ctx()
{
skein512_4way_init( &veltor_4way_ctx.skein );
sph_shavite512_init( &veltor_4way_ctx.shavite );
shabal512_4way_init( &veltor_4way_ctx.shabal );
sph_gost512_init( &veltor_4way_ctx.gost );
}
void veltor_4way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
veltor_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) );
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash2, 64 );
sph_gost512_close( &ctx.gost, hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int i=0; i < 19; i++ )
{
be32enc( &endiandata[i], pdata[i] );
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
veltor_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
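Shabal is natively a 32-bit algorithm, so its 4-way form runs four 32-bit lanes (presumably one 128-bit SSE2 vector per word); veltor therefore deinterleaves Shavite's serial output and repacks it with mm_interleave_4x32 before shabal512_4way. A scalar reference for that packing, layout assumed from the calling code:

#include <stdint.h>

/* dst: 32-bit interleaved, dst[w*4 + lane] = word w of lane.
   s0..s3 are the four serial per-lane buffers being packed.         */
static void interleave_4x32_ref( uint32_t *dst,
                                 const uint32_t *s0, const uint32_t *s1,
                                 const uint32_t *s2, const uint32_t *s3,
                                 int bits )
{
    int n = bits / 32;                       /* 32-bit words per lane */
    for ( int w = 0; w < n; w++ )
    {
        dst[ w*4 + 0 ] = s0[w];
        dst[ w*4 + 1 ] = s1[w];
        dst[ w*4 + 2 ] = s2[w];
        dst[ w*4 + 3 ] = s3[w];
    }
}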

18
algo/x14/veltor-gate.c Normal file

@@ -0,0 +1,18 @@
#include "veltor-gate.h"
bool register_veltor_algo( algo_gate_t* gate )
{
#if defined (VELTOR_4WAY)
init_veltor_4way_ctx();
gate->scanhash = (void*)&scanhash_veltor_4way;
gate->hash = (void*)&veltor_4way_hash;
#else
init_veltor_ctx();
gate->scanhash = (void*)&scanhash_veltor;
gate->hash = (void*)&veltor_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x14/veltor-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef VELTOR_GATE_H__
#define VELTOR_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define VELTOR_4WAY
#endif
bool register_veltor_algo( algo_gate_t* gate );
#if defined(VELTOR_4WAY)
void veltor_4way_hash( void *state, const void *input );
int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_veltor_4way_ctx();
#endif
void veltor_hash( void *state, const void *input );
int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_veltor_ctx();
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "veltor-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -34,7 +34,7 @@ void veltor_skein512_midstate( const void* input )
sph_skein512( &veltor_skein_mid, input, 64 );
}
void veltorhash(void *output, const void *input)
void veltor_hash(void *output, const void *input)
{
uint32_t _ALIGN(64) hashA[16], hashB[16];
@@ -85,7 +85,7 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
do {
be32enc(&endiandata[19], nonce);
veltorhash(hash, endiandata);
veltor_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
@@ -101,14 +101,3 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
bool register_veltor_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT;
init_veltor_ctx();
gate->scanhash = (void*)&scanhash_veltor;
gate->hash = (void*)&veltorhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

295
algo/x14/x14-4way.c Normal file

@@ -0,0 +1,295 @@
#include "x14-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
} x14_4way_ctx_holder;
x14_4way_ctx_holder x14_4way_ctx __attribute__ ((aligned (64)));
void init_x14_4way_ctx()
{
blake512_4way_init( &x14_4way_ctx.blake );
bmw512_4way_init( &x14_4way_ctx.bmw );
sph_bmw512_init( &x14_4way_ctx.bmw );
init_groestl( &x14_4way_ctx.groestl, 64 );
skein512_4way_init( &x14_4way_ctx.skein );
jh512_4way_init( &x14_4way_ctx.jh );
keccak512_4way_init( &x14_4way_ctx.keccak );
init_luffa( &x14_4way_ctx.luffa, 512 );
cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x14_4way_ctx.shavite );
init_sd( &x14_4way_ctx.simd, 512 );
init_echo( &x14_4way_ctx.echo, 512 );
sph_hamsi512_init( &x14_4way_ctx.hamsi );
sph_fugue512_init( &x14_4way_ctx.fugue );
shabal512_4way_init( &x14_4way_ctx.shabal );
};
void x14_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x14_4way_ctx_holder ctx;
memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
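// Shabal's 4-way code uses 32-bit SSE lanes, so the data is re-interleaved
// one 32-bit word per lane instead of the 64-bit AVX2 layout used above.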
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
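// The nonce is word 19 of the 80 byte header, i.e. the high half of 64-bit
// word 9. With 4x64 interleaving each 64-bit row holds 8 uint32_t (2 per
// lane), so lane 0's nonce sits at 9*8 + 1 = 73 and successive lanes are
// 2 words apart.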
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
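// Pick the widest mask that is still a necessary condition for
// hash <= target: the cheap 32-bit test on hash[7] rejects almost all
// nonces before the full 256-bit fulltest() compare.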
// big endian encode the 20 word header; the nonce (word 19) is rewritten below
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x14_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x14/x14-gate.c Normal file
@@ -0,0 +1,18 @@
#include "x14-gate.h"
bool register_x14_algo( algo_gate_t* gate )
{
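// Compile-time dispatch: when 4-way support (AVX2 + AES-NI) was detected
// at build time, register the vectorized implementations, otherwise fall
// back to the reference single-way code.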
#if defined (X14_4WAY)
init_x14_4way_ctx();
gate->scanhash = (void*)&scanhash_x14_4way;
gate->hash = (void*)&x14_4way_hash;
#else
init_x14_ctx();
gate->scanhash = (void*)&scanhash_x14;
gate->hash = (void*)&x14hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x14/x14-gate.h Normal file
@@ -0,0 +1,32 @@
#ifndef X14_GATE_H__
#define X14_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X14_4WAY
#endif
bool register_x14_algo( algo_gate_t* gate );
#if defined(X14_4WAY)
void x14_4way_hash( void *state, const void *input );
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x14_4way_ctx();
#endif
void x14hash( void *state, const void *input );
int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x14_ctx();
#endif

algo/x14/x14.c
@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x14-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -72,7 +72,7 @@ void init_x14_ctx()
sph_shabal512_init(&x14_ctx.shabal);
};
static void x14hash(void *output, const void *input)
void x14hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
@@ -248,14 +248,3 @@ int scanhash_x14(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
bool register_x14_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x14_ctx();
gate->scanhash = (void*)&scanhash_x14;
gate->hash = (void*)&x14hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x15/x15-4way.c Normal file
@@ -0,0 +1,314 @@
#include "x15-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
} x15_4way_ctx_holder;
x15_4way_ctx_holder x15_4way_ctx __attribute__ ((aligned (64)));
void init_x15_4way_ctx()
{
blake512_4way_init( &x15_4way_ctx.blake );
bmw512_4way_init( &x15_4way_ctx.bmw );
init_groestl( &x15_4way_ctx.groestl, 64 );
skein512_4way_init( &x15_4way_ctx.skein );
jh512_4way_init( &x15_4way_ctx.jh );
keccak512_4way_init( &x15_4way_ctx.keccak );
init_luffa( &x15_4way_ctx.luffa, 512 );
cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x15_4way_ctx.shavite );
init_sd( &x15_4way_ctx.simd, 512 );
init_echo( &x15_4way_ctx.echo, 512 );
sph_hamsi512_init( &x15_4way_ctx.hamsi );
sph_fugue512_init( &x15_4way_ctx.fugue );
shabal512_4way_init( &x15_4way_ctx.shabal );
sph_whirlpool_init( &x15_4way_ctx.whirlpool );
};
void x15_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x15_4way_ctx_holder ctx;
memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial through fugue; shabal re-enters 4-way mode below
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode the 20 word header; the nonce (word 19) is rewritten below
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x15_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x15/x15-gate.c Normal file
@@ -0,0 +1,17 @@
#include "x15-gate.h"
bool register_x15_algo( algo_gate_t* gate )
{
#if defined (X15_4WAY)
init_x15_4way_ctx();
gate->scanhash = (void*)&scanhash_x15_4way;
gate->hash = (void*)&x15_4way_hash;
#else
init_x15_ctx();
gate->scanhash = (void*)&scanhash_x15;
gate->hash = (void*)&x15hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

algo/x15/x15-gate.h Normal file
@@ -0,0 +1,32 @@
#ifndef X15_GATE_H__
#define X15_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X15_4WAY
#endif
bool register_x15_algo( algo_gate_t* gate );
#if defined(X15_4WAY)
void x15_4way_hash( void *state, const void *input );
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x15_4way_ctx();
#endif
void x15hash( void *state, const void *input );
int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x15_ctx();
#endif

algo/x15/x15.c
@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x15-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -74,7 +74,7 @@ void init_x15_ctx()
sph_whirlpool_init( &x15_ctx.whirlpool );
};
static void x15hash(void *output, const void *input)
void x15hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
@@ -260,13 +260,3 @@ int scanhash_x15(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
bool register_x15_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x15_ctx();
gate->scanhash = (void*)&scanhash_x15;
gate->hash = (void*)&x15hash;
return true;
};

algo/x17/x17-4way.c Normal file
@@ -0,0 +1,346 @@
#include "x17-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include <openssl/sha.h>
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
} x17_4way_ctx_holder;
x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64)));
void init_x17_4way_ctx()
{
blake512_4way_init( &x17_4way_ctx.blake );
bmw512_4way_init( &x17_4way_ctx.bmw );
init_groestl( &x17_4way_ctx.groestl, 64 );
skein512_4way_init( &x17_4way_ctx.skein );
jh512_4way_init( &x17_4way_ctx.jh );
keccak512_4way_init( &x17_4way_ctx.keccak );
init_luffa( &x17_4way_ctx.luffa, 512 );
cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x17_4way_ctx.shavite );
init_sd( &x17_4way_ctx.simd, 512 );
init_echo( &x17_4way_ctx.echo, 512 );
sph_hamsi512_init( &x17_4way_ctx.hamsi );
sph_fugue512_init( &x17_4way_ctx.fugue );
shabal512_4way_init( &x17_4way_ctx.shabal );
SHA512_Init( &x17_4way_ctx.sha512 );
sph_haval256_5_init( &x17_4way_ctx.haval );
};
void x17_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x17_4way_ctx_holder ctx;
memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial through fugue; shabal re-enters 4-way mode below
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 16 SHA512
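// OpenSSL's incremental API is reused per lane; the context is reset by
// copying the pre-initialized master state rather than calling
// SHA512_Init() again.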
SHA512_Update( &ctx.sha512, hash0, 64 );
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash1, 64 );
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash2, 64 );
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash3, 64 );
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
// 17 Haval
sph_haval256_5( &ctx.haval, (const void*)hash0, 64 );
sph_haval256_5_close( &ctx.haval, hash0 );
memcpy( &ctx.haval, &x17_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash1, 64 );
sph_haval256_5_close( &ctx.haval, hash1 );
memcpy( &ctx.haval, &x17_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash2, 64 );
sph_haval256_5_close( &ctx.haval, hash2 );
memcpy( &ctx.haval, &x17_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash3, 64 );
sph_haval256_5_close( &ctx.haval, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode the 20 word header; the nonce (word 19) is rewritten below
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x17_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x17/x17-gate.c Normal file
@@ -0,0 +1,17 @@
#include "x17-gate.h"
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_4WAY)
init_x17_4way_ctx();
gate->scanhash = (void*)&scanhash_x17_4way;
gate->hash = (void*)&x17_4way_hash;
#else
init_x17_ctx();
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

algo/x17/x17-gate.h Normal file
@@ -0,0 +1,32 @@
#ifndef X17_GATE_H__
#define X17_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X17_4WAY
#endif
bool register_x17_algo( algo_gate_t* gate );
#if defined(X17_4WAY)
void x17_4way_hash( void *state, const void *input );
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x17_4way_ctx();
#endif
void x17_hash( void *state, const void *input );
int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x17_ctx();
#endif

algo/x17/x17.c
@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x17-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -86,7 +86,7 @@ void init_x17_ctx()
sph_haval256_5_init(&x17_ctx.haval);
};
static void x17hash(void *output, const void *input)
void x17_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (64)));
#define hashB hash+64
@@ -248,7 +248,7 @@ int scanhash_x17(int thr_id, struct work *work,
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
x17hash(hash64, endiandata);
x17_hash(hash64, endiandata);
#ifndef DEBUG_ALGO
if (!(hash64[7] & mask))
{
@@ -281,7 +281,7 @@ int scanhash_x17(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
/*
bool register_x17_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
@@ -290,4 +290,4 @@ bool register_x17_algo( algo_gate_t* gate )
gate->hash = (void*)&x17hash;
return true;
};
*/

algo/x17/xevan-4way.c Normal file
@@ -0,0 +1,523 @@
#include "xevan-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include <openssl/sha.h>
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
} xevan_4way_ctx_holder;
xevan_4way_ctx_holder xevan_4way_ctx __attribute__ ((aligned (64)));
static __thread blake512_4way_context xevan_blake_4way_mid
__attribute__ ((aligned (64)));
void init_xevan_4way_ctx()
{
blake512_4way_init(&xevan_4way_ctx.blake);
bmw512_4way_init( &xevan_4way_ctx.bmw );
init_groestl( &xevan_4way_ctx.groestl, 64 );
skein512_4way_init(&xevan_4way_ctx.skein);
jh512_4way_init(&xevan_4way_ctx.jh);
keccak512_4way_init(&xevan_4way_ctx.keccak);
init_luffa( &xevan_4way_ctx.luffa, 512 );
cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &xevan_4way_ctx.shavite );
init_sd( &xevan_4way_ctx.simd, 512 );
init_echo( &xevan_4way_ctx.echo, 512 );
sph_hamsi512_init( &xevan_4way_ctx.hamsi );
sph_fugue512_init( &xevan_4way_ctx.fugue );
shabal512_4way_init( &xevan_4way_ctx.shabal );
sph_whirlpool_init( &xevan_4way_ctx.whirlpool );
SHA512_Init( &xevan_4way_ctx.sha512 );
sph_haval256_5_init( &xevan_4way_ctx.haval );
};
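// The first 64 bytes of the 80 byte header are constant for a given work
// item, so blake's first compression block is computed once here and only
// the 16 byte tail (which contains the nonce) is hashed per scan pass.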
void xevan_4way_blake512_midstate( const void* input )
{
memcpy( &xevan_blake_4way_mid, &xevan_4way_ctx.blake,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &xevan_blake_4way_mid, input, 64 );
}
void xevan_4way_hash( void *output, const void *input )
{
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
const int dataLen = 128;
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
xevan_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
// parallel way
memcpy( &ctx.blake, &xevan_blake_4way_mid,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &ctx.blake, input + (midlen<<2), tail );
blake512_4way_close(&ctx.blake, vhash);
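// Xevan zero-extends every intermediate digest to 128 bytes: clear the
// upper 64 bytes of each lane (interleaved words 8..15, i.e. vhash[32..63])
// before feeding the full 128 byte block to bmw.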
memset( &vhash[8<<2], 0, 64<<2 );
bmw512_4way( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
sph_hamsi512( &ctx.hamsi, hash0, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash3 );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
// Parallel 4way
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
// Serial
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
SHA512_Update( &ctx.sha512, hash0, dataLen );
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash1, dataLen );
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash2, dataLen );
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash3, dataLen );
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen );
sph_haval256_5_close( &ctx.haval, hash0 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen );
sph_haval256_5_close( &ctx.haval, hash1 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen );
sph_haval256_5_close( &ctx.haval, hash2 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen );
sph_haval256_5_close( &ctx.haval, hash3 );
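// Second pass: xevan runs the whole 17 algorithm chain again over the
// 32 byte result, zero-padded back up to the 128 byte working length.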
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
blake512_4way( &ctx.blake, vhash, dataLen );
blake512_4way_close(&ctx.blake, vhash);
bmw512_4way( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
sph_hamsi512( &ctx.hamsi, hash0, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash3 );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
SHA512_Update( &ctx.sha512, hash0, dataLen );
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash1, dataLen );
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash2, dataLen );
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash3, dataLen );
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen );
sph_haval256_5_close( &ctx.haval, hash0 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen );
sph_haval256_5_close( &ctx.haval, hash1 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen );
sph_haval256_5_close( &ctx.haval, hash2 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen );
sph_haval256_5_close( &ctx.haval, hash3 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
xevan_4way_blake512_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
xevan_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] <= Htarg ) && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] <= Htarg ) && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] <= Htarg ) && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] <= Htarg ) && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x17/xevan-gate.c Normal file
@@ -0,0 +1,24 @@
#include "xevan-gate.h"
void xevan_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_xevan_algo( algo_gate_t* gate )
{
#if defined (XEVAN_4WAY)
init_xevan_4way_ctx();
gate->scanhash = (void*)&scanhash_xevan_4way;
gate->hash = (void*)&xevan_4way_hash;
#else
init_xevan_ctx();
gate->scanhash = (void*)&scanhash_xevan;
gate->hash = (void*)&xevan_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->set_target = (void*)&xevan_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

algo/x17/xevan-gate.h Normal file
@@ -0,0 +1,32 @@
#ifndef XEVAN_GATE_H__
#define XEVAN_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define XEVAN_4WAY
#endif
bool register_xevan_algo( algo_gate_t* gate );
#if defined(XEVAN_4WAY)
void xevan_4way_hash( void *state, const void *input );
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_xevan_4way_ctx();
#endif
void xevan_hash( void *state, const void *input );
int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_xevan_ctx();
#endif

algo/x17/xevan.c
@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "xevan-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -286,19 +286,3 @@ int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *
return 0;
}
void xevan_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_xevan_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_xevan_ctx();
gate->scanhash = (void*)&scanhash_xevan;
gate->hash = (void*)&xevan_hash;
gate->set_target = (void*)&xevan_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

@@ -1,935 +0,0 @@
/*-
* Copyright 2009 Colin Percival
* Copyright 2013,2014 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include "sha256_Y.h"
#include "sysendian.h"
#include "yescrypt-platform.h"
static __inline void blkcpy(uint64_t * dest, const uint64_t * src, size_t count)
{
do {
*dest++ = *src++; *dest++ = *src++;
*dest++ = *src++; *dest++ = *src++;
} while (count -= 4);
}
static __inline void blkxor(uint64_t * dest, const uint64_t * src, size_t count)
{
do {
*dest++ ^= *src++; *dest++ ^= *src++;
*dest++ ^= *src++; *dest++ ^= *src++;
} while (count -= 4);
}
typedef union {
uint32_t w[16];
uint64_t d[8];
} salsa20_blk_t;
static __inline void salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout)
{
#define COMBINE(out, in1, in2) \
Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32);
COMBINE(0, 0, 2)
COMBINE(1, 5, 7)
COMBINE(2, 2, 4)
COMBINE(3, 7, 1)
COMBINE(4, 4, 6)
COMBINE(5, 1, 3)
COMBINE(6, 6, 0)
COMBINE(7, 3, 5)
#undef COMBINE
}
static __inline void salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout)
{
#define COMBINE(out, in1, in2) \
Bout->w[out * 2] = (uint32_t) Bin->d[in1]; \
Bout->w[out * 2 + 1] = Bin->d[in2] >> 32;
COMBINE(0, 0, 6)
COMBINE(1, 5, 3)
COMBINE(2, 2, 0)
COMBINE(3, 7, 5)
COMBINE(4, 4, 2)
COMBINE(5, 1, 7)
COMBINE(6, 6, 4)
COMBINE(7, 3, 1)
#undef COMBINE
}
/**
* salsa20_8(B):
* Apply the salsa20/8 core to the provided block.
*/
static void salsa20_8(uint64_t B[8])
{
size_t i;
salsa20_blk_t X;
#define x X.w
salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X);
for (i = 0; i < 8; i += 2) {
#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
/* Operate on columns */
x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9);
x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18);
x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9);
x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18);
x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9);
x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18);
x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9);
x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18);
/* Operate on rows */
x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9);
x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18);
x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9);
x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18);
x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9);
x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18);
x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9);
x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18);
#undef R
}
#undef x
{
salsa20_blk_t Y;
salsa20_simd_shuffle(&X, &Y);
for (i = 0; i < 16; i += 4) {
((salsa20_blk_t *)B)->w[i] += Y.w[i];
((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1];
((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2];
((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3];
}
}
}
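/* Usage sketch (illustrative comment, not part of the original file):
 *
 *   uint64_t blk[8];
 *   // ... fill blk with a 64-byte block in the shuffled SIMD layout
 *   // produced by salsa20_simd_shuffle() ...
 *   salsa20_8(blk);   // blk = salsa20/8-core(blk) + blk, in place
 */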
/**
* blockmix_salsa8(Bin, Bout, X, r):
* Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r
* bytes in length; the output Bout must also be the same size. The
* temporary space X must be 64 bytes.
*/
static void
blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r)
{
size_t i;
/* 1: X <-- B_{2r - 1} */
blkcpy(X, &Bin[(2 * r - 1) * 8], 8);
/* 2: for i = 0 to 2r - 1 do */
for (i = 0; i < 2 * r; i += 2) {
/* 3: X <-- H(X \xor B_i) */
blkxor(X, &Bin[i * 8], 8);
salsa20_8(X);
/* 4: Y_i <-- X */
/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
blkcpy(&Bout[i * 4], X, 8);
/* 3: X <-- H(X \xor B_i) */
blkxor(X, &Bin[i * 8 + 8], 8);
salsa20_8(X);
/* 4: Y_i <-- X */
/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
blkcpy(&Bout[i * 4 + r * 8], X, 8);
}
}
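/* Layout sketch for r = 1 (illustrative): Bin/Bout are 16 uint64_t
 * (128 bytes) and X is 8 uint64_t (64 bytes):
 *
 *   uint64_t in[16], out[16], scratch[8];
 *   blockmix_salsa8(in, out, scratch, 1);
 *   // out[0..7]  = Y_0 = salsa20_8(B_1 ^ B_0)
 *   // out[8..15] = Y_1 = salsa20_8(Y_0 ^ B_1)
 */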
/* These are tunable */
#define S_BITS 8
#define S_SIMD 2
#define S_P 4
#define S_ROUNDS 6
/* Number of S-boxes. Not tunable, hard-coded in a few places. */
#define S_N 2
/* Derived values. Not tunable on their own. */
#define S_SIZE1 (1 << S_BITS)
#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8)
#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK)
#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD)
#define S_P_SIZE (S_P * S_SIMD)
#define S_MIN_R ((S_P * S_SIMD + 15) / 16)
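/* Worked example (illustrative): with the defaults above (S_BITS = 8,
 * S_SIMD = 2, S_P = 4) the derived values are S_SIZE1 = 256,
 * S_MASK = 0xFF0, S_SIZE_ALL = 1024 64-bit words, S_P_SIZE = 8 and
 * S_MIN_R = 1. */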
/**
* pwxform(B):
* Transform the provided block using the provided S-boxes.
*/
static void block_pwxform(uint64_t * B, const uint64_t * S)
{
uint64_t (*X)[S_SIMD] = (uint64_t (*)[S_SIMD])B;
const uint8_t *S0 = (const uint8_t *)S;
const uint8_t *S1 = (const uint8_t *)(S + S_SIZE1 * S_SIMD);
size_t i, j;
#if S_SIMD > 2
size_t k;
#endif
for (j = 0; j < S_P; j++) {
uint64_t *Xj = X[j];
uint64_t x0 = Xj[0];
#if S_SIMD > 1
uint64_t x1 = Xj[1];
#endif
for (i = 0; i < S_ROUNDS; i++) {
uint64_t x = x0 & S_MASK2;
const uint64_t *p0, *p1;
p0 = (const uint64_t *)(S0 + (uint32_t)x);
p1 = (const uint64_t *)(S1 + (x >> 32));
x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0;
x0 += p0[0];
x0 ^= p1[0];
#if S_SIMD > 1
x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1;
x1 += p0[1];
x1 ^= p1[1];
#endif
#if S_SIMD > 2
for (k = 2; k < S_SIMD; k++) {
x = Xj[k];
x = (uint64_t)(x >> 32) * (uint32_t)x;
x += p0[k];
x ^= p1[k];
Xj[k] = x;
}
#endif
}
Xj[0] = x0;
#if S_SIMD > 1
Xj[1] = x1;
#endif
}
}
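/* Index-extraction sketch (illustrative): S_MASK2 masks both 32-bit
 * halves of x0 at once, yielding one 16-byte-aligned byte offset into
 * each S-box:
 *
 *   uint64_t x = x0 & S_MASK2;
 *   size_t lo = (uint32_t)x;  // byte offset into S0, multiple of 16
 *   size_t hi = x >> 32;      // byte offset into S1, multiple of 16
 */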
/**
* blockmix_pwxform(Bin, Bout, S, r):
* Compute Bout = BlockMix_pwxform{salsa20/8, S, r}(Bin). The input Bin must
* be 128r bytes in length; the output Bout must also be the same size.
*
 * S lacks the const qualifier to match blockmix_salsa8()'s prototype,
 * because we need to refer to both functions through the same function
 * pointer type.
*/
static void blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, uint64_t * S, size_t r)
{
size_t r1, r2, i;
/* Convert 128-byte blocks to (S_P_SIZE * 64-bit) blocks */
r1 = r * 128 / (S_P_SIZE * 8);
/* X <-- B_{r1 - 1} */
blkcpy(Bout, &Bin[(r1 - 1) * S_P_SIZE], S_P_SIZE);
/* X <-- X \xor B_i */
blkxor(Bout, Bin, S_P_SIZE);
/* X <-- H'(X) */
/* B'_i <-- X */
block_pwxform(Bout, S);
/* for i = 0 to r1 - 1 do */
for (i = 1; i < r1; i++) {
/* X <-- X \xor B_i */
blkcpy(&Bout[i * S_P_SIZE], &Bout[(i - 1) * S_P_SIZE],
S_P_SIZE);
blkxor(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], S_P_SIZE);
/* X <-- H'(X) */
/* B'_i <-- X */
block_pwxform(&Bout[i * S_P_SIZE], S);
}
/* Handle partial blocks */
if (i * S_P_SIZE < r * 16)
blkcpy(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE],
r * 16 - i * S_P_SIZE);
i = (r1 - 1) * S_P_SIZE / 8;
/* Convert 128-byte blocks to 64-byte blocks */
r2 = r * 2;
/* B'_i <-- H(B'_i) */
salsa20_8(&Bout[i * 8]);
i++;
for (; i < r2; i++) {
/* B'_i <-- H(B'_i \xor B'_{i-1}) */
blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8);
salsa20_8(&Bout[i * 8]);
}
}
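/* Arithmetic note (illustrative): with the default S_P_SIZE = 8,
 * r1 = r * 128 / 64 = 2r, so the partial-block copy above never runs,
 * i lands on block 2r - 1, and the trailing salsa20/8 pass touches only
 * the final 64-byte block B'_{2r-1}. */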
/**
* integerify(B, r):
* Return the result of parsing B_{2r-1} as a little-endian integer.
*/
static __inline uint64_t
integerify(const uint64_t * B, size_t r)
{
/*
* Our 64-bit words are in host byte order, and word 6 holds the second 32-bit
* word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also
* in host byte order, as it should be.
*/
const uint64_t * X = &B[(2 * r - 1) * 8];
uint32_t lo = (uint32_t) X[0];
uint32_t hi = (uint32_t) (X[6] >> 32);
return ((uint64_t)hi << 32) + lo;
}
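/* Worked example (illustrative): for r = 1, B_{2r-1} starts at B[8], so
 *
 *   uint64_t n = integerify(B, 1);
 *   // n == ((B[14] >> 32) << 32) | (uint32_t)B[8]
 */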
/**
* smix1(B, r, N, flags, V, NROM, shared, XY, S):
* Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage XY must be 256r + 64 bytes in length. The value N must be even and
* no smaller than 2.
*/
static void
smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags,
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
uint64_t * XY, uint64_t * S)
{
void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) =
(S ? blockmix_pwxform : blockmix_salsa8);
const uint64_t * VROM = shared->shared1.aligned;
uint32_t VROM_mask = shared->mask1;
size_t s = 16 * r;
uint64_t * X = V;
uint64_t * Y = &XY[s];
uint64_t * Z = S ? S : &XY[2 * s];
uint64_t n, i, j;
size_t k;
/* 1: X <-- B */
/* 3: V_i <-- X */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8];
for (k = 0; k < 16; k++)
tmp->w[k] = le32dec(&src->w[k]);
salsa20_simd_shuffle(tmp, dst);
}
/* 4: X <-- H(X) */
/* 3: V_i <-- X */
blockmix(X, Y, Z, r);
blkcpy(&V[s], Y, s);
X = XY;
if (NROM && (VROM_mask & 1)) {
if ((1 & VROM_mask) == 1) {
/* j <-- Integerify(X) mod NROM */
j = integerify(Y, r) & (NROM - 1);
/* X <-- H(X \xor VROM_j) */
blkxor(Y, &VROM[j * s], s);
}
blockmix(Y, X, Z, r);
/* 2: for i = 0 to N - 1 do */
for (n = 1, i = 2; i < N; i += 2) {
/* 3: V_i <-- X */
blkcpy(&V[i * s], X, s);
if ((i & (i - 1)) == 0)
n <<= 1;
/* j <-- Wrap(Integerify(X), i) */
j = integerify(X, r) & (n - 1);
j += i - n;
/* X <-- X \xor V_j */
blkxor(X, &V[j * s], s);
/* 4: X <-- H(X) */
blockmix(X, Y, Z, r);
/* 3: V_i <-- X */
blkcpy(&V[(i + 1) * s], Y, s);
j = integerify(Y, r);
if (((i + 1) & VROM_mask) == 1) {
/* j <-- Integerify(X) mod NROM */
j &= NROM - 1;
/* X <-- H(X \xor VROM_j) */
blkxor(Y, &VROM[j * s], s);
} else {
/* j <-- Wrap(Integerify(X), i) */
j &= n - 1;
j += i + 1 - n;
/* X <-- H(X \xor V_j) */
blkxor(Y, &V[j * s], s);
}
blockmix(Y, X, Z, r);
}
} else {
yescrypt_flags_t rw = flags & YESCRYPT_RW;
/* 4: X <-- H(X) */
blockmix(Y, X, Z, r);
/* 2: for i = 0 to N - 1 do */
for (n = 1, i = 2; i < N; i += 2) {
/* 3: V_i <-- X */
blkcpy(&V[i * s], X, s);
if (rw) {
if ((i & (i - 1)) == 0)
n <<= 1;
/* j <-- Wrap(Integerify(X), i) */
j = integerify(X, r) & (n - 1);
j += i - n;
/* X <-- X \xor V_j */
blkxor(X, &V[j * s], s);
}
/* 4: X <-- H(X) */
blockmix(X, Y, Z, r);
/* 3: V_i <-- X */
blkcpy(&V[(i + 1) * s], Y, s);
if (rw) {
/* j <-- Wrap(Integerify(X), i) */
j = integerify(Y, r) & (n - 1);
j += (i + 1) - n;
/* X <-- X \xor V_j */
blkxor(Y, &V[j * s], s);
}
/* 4: X <-- H(X) */
blockmix(Y, X, Z, r);
}
}
/* B' <-- X */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8];
for (k = 0; k < 16; k++)
le32enc(&tmp->w[k], src->w[k]);
salsa20_simd_unshuffle(tmp, dst);
}
}
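/* Wrap() sketch (illustrative): n tracks p2floor(i), so
 * j = (integerify(X, r) & (n - 1)) + (i - n) picks uniformly from the
 * n most recently written blocks, indices i - n .. i - 1. For example,
 * at i = 6 (n = 4) the candidate set is V_2 .. V_5. */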
/**
* smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S):
* Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage XY must be 256r + 64 bytes in length. The value N must be a
* power of 2 greater than 1. The value Nloop must be even.
*/
static void
smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop,
yescrypt_flags_t flags,
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
uint64_t * XY, uint64_t * S)
{
void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) =
(S ? blockmix_pwxform : blockmix_salsa8);
const uint64_t * VROM = shared->shared1.aligned;
uint32_t VROM_mask = shared->mask1 | 1;
size_t s = 16 * r;
yescrypt_flags_t rw = flags & YESCRYPT_RW;
uint64_t * X = XY;
uint64_t * Y = &XY[s];
uint64_t * Z = S ? S : &XY[2 * s];
uint64_t i, j;
size_t k;
if (Nloop == 0)
return;
/* X <-- B' */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8];
for (k = 0; k < 16; k++)
tmp->w[k] = le32dec(&src->w[k]);
salsa20_simd_shuffle(tmp, dst);
}
if (NROM) {
/* 6: for i = 0 to N - 1 do */
for (i = 0; i < Nloop; i += 2) {
/* 7: j <-- Integerify(X) mod N */
j = integerify(X, r) & (N - 1);
/* 8: X <-- H(X \xor V_j) */
blkxor(X, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], X, s);
blockmix(X, Y, Z, r);
j = integerify(Y, r);
if (((i + 1) & VROM_mask) == 1) {
/* j <-- Integerify(X) mod NROM */
j &= NROM - 1;
/* X <-- H(X \xor VROM_j) */
blkxor(Y, &VROM[j * s], s);
} else {
/* 7: j <-- Integerify(X) mod N */
j &= N - 1;
/* 8: X <-- H(X \xor V_j) */
blkxor(Y, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], Y, s);
}
blockmix(Y, X, Z, r);
}
} else {
/* 6: for i = 0 to N - 1 do */
i = Nloop / 2;
do {
/* 7: j <-- Integerify(X) mod N */
j = integerify(X, r) & (N - 1);
/* 8: X <-- H(X \xor V_j) */
blkxor(X, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], X, s);
blockmix(X, Y, Z, r);
/* 7: j <-- Integerify(X) mod N */
j = integerify(Y, r) & (N - 1);
/* 8: X <-- H(X \xor V_j) */
blkxor(Y, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], Y, s);
blockmix(Y, X, Z, r);
} while (--i);
}
/* 10: B' <-- X */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8];
for (k = 0; k < 16; k++)
le32enc(&tmp->w[k], src->w[k]);
salsa20_simd_unshuffle(tmp, dst);
}
}
/**
* p2floor(x):
* Largest power of 2 not greater than argument.
*/
static uint64_t
p2floor(uint64_t x)
{
uint64_t y;
while ((y = x & (x - 1)))
x = y;
return x;
}
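/* Examples (illustrative): p2floor(1) == 1, p2floor(7) == 4,
 * p2floor(1000) == 512, p2floor(1024) == 1024. */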
/**
* smix(B, r, N, p, t, flags, V, NROM, shared, XY, S):
* Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the
* temporary storage V must be 128rN bytes in length; the temporary storage
* XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is
* required with OpenMP-enabled builds). The value N must be a power of 2
* greater than 1.
*/
static void
smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t,
yescrypt_flags_t flags,
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
uint64_t * XY, uint64_t * S)
{
size_t s = 16 * r;
uint64_t Nchunk = N / p, Nloop_all, Nloop_rw;
uint32_t i;
Nloop_all = Nchunk;
if (flags & YESCRYPT_RW) {
if (t <= 1) {
if (t)
Nloop_all *= 2; /* 2/3 */
Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */
} else {
Nloop_all *= t - 1;
}
} else if (t) {
if (t == 1)
Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */
Nloop_all *= t;
}
Nloop_rw = 0;
if (flags & __YESCRYPT_INIT_SHARED)
Nloop_rw = Nloop_all;
else if (flags & YESCRYPT_RW)
Nloop_rw = Nloop_all / p;
Nchunk &= ~(uint64_t)1; /* round down to even */
Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */
Nloop_rw &= ~(uint64_t)1; /* round down to even */
#ifdef _OPENMP
#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw)
{
#pragma omp for
#endif
for (i = 0; i < p; i++) {
uint64_t Vchunk = i * Nchunk;
uint64_t * Bp = &B[i * s];
uint64_t * Vp = &V[Vchunk * s];
#ifdef _OPENMP
uint64_t * XYp = &XY[i * (2 * s + 8)];
#else
uint64_t * XYp = XY;
#endif
uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk);
uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S;
if (Sp)
smix1(Bp, 1, S_SIZE_ALL / 16,
flags & ~YESCRYPT_PWXFORM,
Sp, NROM, shared, XYp, NULL);
if (!(flags & __YESCRYPT_INIT_SHARED_2))
smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp);
smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp,
NROM, shared, XYp, Sp);
}
if (Nloop_all > Nloop_rw) {
#ifdef _OPENMP
#pragma omp for
#endif
for (i = 0; i < p; i++) {
uint64_t * Bp = &B[i * s];
#ifdef _OPENMP
uint64_t * XYp = &XY[i * (2 * s + 8)];
#else
uint64_t * XYp = XY;
#endif
uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S;
smix2(Bp, r, N, Nloop_all - Nloop_rw,
flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp);
}
}
#ifdef _OPENMP
}
#endif
}
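/* Cost sketch (illustrative): for t = 0 with YESCRYPT_RW and p = 1,
 * Nchunk = N and Nloop_all = ceil(N / 3) (rounded up to even), so the
 * second loop revisits about one third of V; the total work is roughly
 * 4/3 N blockmix calls versus classic scrypt's 2 N. */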
/**
* yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
* N, r, p, t, flags, buf, buflen):
* Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
* p, buflen), or a revision of scrypt as requested by flags and shared, and
* write the result into buf. The parameters r, p, and buflen must satisfy
* r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power
* of 2 greater than 1.
*
* t controls computation time while not affecting peak memory usage. shared
* and flags may request special modes as described in yescrypt.h. local is
 * the thread-local data structure, which allows a memory allocation to be
 * preserved and reused across calls, reducing allocation overhead.
*
* Return 0 on success; or -1 on error.
*/
int
yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen,
uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags,
uint8_t * buf, size_t buflen)
{
yescrypt_region_t tmp;
uint64_t NROM;
size_t B_size, V_size, XY_size, need;
uint64_t * B, * V, * XY, * S;
uint64_t sha256[4];
/*
* YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose,
* so don't let it have side-effects. Without this adjustment, it'd
* enable the SHA-256 password pre-hashing and output post-hashing,
* because any deviation from classic scrypt implies those.
*/
if (p == 1)
flags &= ~YESCRYPT_PARALLEL_SMIX;
/* Sanity-check parameters */
if (flags & ~YESCRYPT_KNOWN_FLAGS) {
errno = EINVAL;
return -1;
}
#if SIZE_MAX > UINT32_MAX
if (buflen > (((uint64_t)(1) << 32) - 1) * 32) {
errno = EFBIG;
return -1;
}
#endif
if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) {
errno = EFBIG;
return -1;
}
if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) {
errno = EINVAL;
return -1;
}
if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) {
errno = EINVAL;
return -1;
}
#if S_MIN_R > 1
if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) {
errno = EINVAL;
return -1;
}
#endif
if ((p > SIZE_MAX / ((size_t)256 * r + 64)) ||
#if SIZE_MAX / 256 <= UINT32_MAX
(r > SIZE_MAX / 256) ||
#endif
(N > SIZE_MAX / 128 / r)) {
errno = ENOMEM;
return -1;
}
if (N > UINT64_MAX / ((uint64_t)t + 1)) {
errno = EFBIG;
return -1;
}
#ifdef _OPENMP
if (!(flags & YESCRYPT_PARALLEL_SMIX) &&
(N > SIZE_MAX / 128 / (r * p))) {
errno = ENOMEM;
return -1;
}
#endif
if ((flags & YESCRYPT_PWXFORM) &&
#ifndef _OPENMP
(flags & YESCRYPT_PARALLEL_SMIX) &&
#endif
p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) {
errno = ENOMEM;
return -1;
}
NROM = 0;
if (shared->shared1.aligned) {
NROM = shared->shared1.aligned_size / ((size_t)128 * r);
if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) ||
!(flags & YESCRYPT_RW)) {
errno = EINVAL;
return -1;
}
}
/* Allocate memory */
V = NULL;
V_size = (size_t)128 * r * N;
#ifdef _OPENMP
if (!(flags & YESCRYPT_PARALLEL_SMIX))
V_size *= p;
#endif
need = V_size;
if (flags & __YESCRYPT_INIT_SHARED) {
if (local->aligned_size < need) {
if (local->base || local->aligned ||
local->base_size || local->aligned_size) {
errno = EINVAL;
return -1;
}
if (!alloc_region(local, need))
return -1;
}
V = (uint64_t *)local->aligned;
need = 0;
}
B_size = (size_t)128 * r * p;
need += B_size;
if (need < B_size) {
errno = ENOMEM;
return -1;
}
XY_size = (size_t)256 * r + 64;
#ifdef _OPENMP
XY_size *= p;
#endif
need += XY_size;
if (need < XY_size) {
errno = ENOMEM;
return -1;
}
if (flags & YESCRYPT_PWXFORM) {
size_t S_size = S_SIZE_ALL * sizeof(*S);
#ifdef _OPENMP
S_size *= p;
#else
if (flags & YESCRYPT_PARALLEL_SMIX)
S_size *= p;
#endif
need += S_size;
if (need < S_size) {
errno = ENOMEM;
return -1;
}
}
if (flags & __YESCRYPT_INIT_SHARED) {
if (!alloc_region(&tmp, need))
return -1;
B = (uint64_t *)tmp.aligned;
XY = (uint64_t *)((uint8_t *)B + B_size);
} else {
init_region(&tmp);
if (local->aligned_size < need) {
if (free_region(local))
return -1;
if (!alloc_region(local, need))
return -1;
}
B = (uint64_t *)local->aligned;
V = (uint64_t *)((uint8_t *)B + B_size);
XY = (uint64_t *)((uint8_t *)V + V_size);
}
S = NULL;
if (flags & YESCRYPT_PWXFORM)
S = (uint64_t *)((uint8_t *)XY + XY_size);
if (t || flags) {
SHA256_CTX_Y ctx;
SHA256_Init_Y(&ctx);
SHA256_Update_Y(&ctx, passwd, passwdlen);
SHA256_Final_Y((uint8_t *)sha256, &ctx);
passwd = (uint8_t *)sha256;
passwdlen = sizeof(sha256);
}
/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1,
(uint8_t *)B, B_size);
if (t || flags)
blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0]));
if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) {
smix(B, r, N, p, t, flags, V, NROM, shared, XY, S);
} else {
uint32_t i;
/* 2: for i = 0 to p - 1 do */
#ifdef _OPENMP
#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S)
#endif
for (i = 0; i < p; i++) {
/* 3: B_i <-- MF(B_i, N) */
#ifdef _OPENMP
smix(&B[(size_t)16 * r * i], r, N, 1, t, flags,
&V[(size_t)16 * r * i * N],
NROM, shared,
&XY[((size_t)32 * r + 8) * i],
S ? &S[S_SIZE_ALL * i] : S);
#else
smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V,
NROM, shared, XY, S);
#endif
}
}
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen);
/*
* Except when computing classic scrypt, allow all computation so far
* to be performed on the client. The final steps below match those of
* SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so
* far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of
* SCRAM's use of SHA-1) would be usable with yescrypt hashes.
*/
if ((t || flags) && buflen == sizeof(sha256)) {
/* Compute ClientKey */
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init(&ctx, buf, buflen);
HMAC_SHA256_Update(&ctx, salt, saltlen);
HMAC_SHA256_Final((uint8_t *)sha256, &ctx);
}
/* Compute StoredKey */
{
SHA256_CTX_Y ctx;
SHA256_Init_Y(&ctx);
SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256));
SHA256_Final_Y(buf, &ctx);
}
}
if (free_region(&tmp))
return -1;
/* Success! */
return 0;
}
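/* Usage sketch (illustrative; assumes a ROM-less setup and hand-zeroed
 * regions -- yescrypt.h may provide init/free helpers for these):
 *
 *   #include <string.h>
 *   yescrypt_shared_t shared;
 *   yescrypt_local_t local;
 *   uint8_t dk[32];
 *   memset(&shared, 0, sizeof(shared));
 *   memset(&local, 0, sizeof(local));
 *   if (yescrypt_kdf(&shared, &local,
 *                    (const uint8_t *)"pleaseletmein", 13,
 *                    (const uint8_t *)"SodiumChloride", 14,
 *                    2048, 8, 1, 0, YESCRYPT_RW | YESCRYPT_PWXFORM,
 *                    dk, sizeof(dk)) == 0) {
 *       // dk now holds the 32-byte derived key
 *   }
 */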

View File

@@ -426,7 +426,7 @@ int64_t yescryptr16_get_max64()
bool register_yescrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;
@@ -438,9 +438,23 @@ bool register_yescrypt_algo( algo_gate_t* gate )
return true;
}
bool register_yescryptr8_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&yescrypt_get_max64;
client_key_hack = false;
YESCRYPT_N = 2048;
YESCRYPT_R = 8;
YESCRYPT_P = 1;
return true;
}
bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT;
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;

850
avxdefs.h

File diff suppressed because it is too large

View File

@@ -1,10 +1,5 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
make distclean || echo clean
@@ -12,14 +7,8 @@ make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl
make -j 4

View File

@@ -3,7 +3,7 @@
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA -DFOUR_WAY" ./configure --with-curl
CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-4way.exe
@@ -13,7 +13,7 @@ mv cpuminer cpuminer-4way
make clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx2.exe
@@ -23,7 +23,7 @@ mv cpuminer cpuminer-aes-avx2
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx.exe
@@ -33,7 +33,7 @@ mv cpuminer cpuminer-aes-avx
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe
@@ -43,7 +43,7 @@ mv cpuminer cpuminer-aes-sse42
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse42.exe
@@ -53,7 +53,7 @@ mv cpuminer cpuminer-sse42
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse2.exe

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.7.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.9.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.7.7'
PACKAGE_STRING='cpuminer-opt 3.7.7'
PACKAGE_VERSION='3.7.9'
PACKAGE_STRING='cpuminer-opt 3.7.9'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.7.7 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.7.9 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.7.7:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.7.9:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.7.7
cpuminer-opt configure 3.7.9
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.7.7, which was
It was created by cpuminer-opt $as_me 3.7.9, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.7.7'
VERSION='3.7.9'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.7.7, which was
This file was extended by cpuminer-opt $as_me 3.7.9, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.7.7
cpuminer-opt config.status 3.7.9
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.7.7])
AC_INIT([cpuminer-opt], [3.7.9])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -358,8 +358,8 @@ struct work {
char *job_id;
size_t xnonce2_len;
unsigned char *xnonce2;
uint32_t nonces[4];
bool nfound[4];
uint32_t nonces[8];
bool nfound[8];
};
struct stratum_job {
@@ -546,6 +546,7 @@ enum algos {
ALGO_X17,
ALGO_XEVAN,
ALGO_YESCRYPT,
ALGO_YESCRYPTR8,
ALGO_YESCRYPTR16,
ALGO_ZR5,
ALGO_COUNT
@@ -617,6 +618,7 @@ static const char* const algo_names[] = {
"x17",
"xevan",
"yescrypt",
"yescryptr8",
"yescryptr16",
"zr5",
"\0"
@@ -741,8 +743,9 @@ Options:\n\
x14 X14\n\
x15 X15\n\
x17\n\
xevan Bitsend\n\
xevan Bitsend (BSD)\n\
yescrypt Globalboost-Y (BSTY)\n\
yescryptr8 BitZeny (ZNY)\n\
yescryptr16 Yenten (YTN)\n\
zr5 Ziftr\n\
-o, --url=URL URL of mining server\n\

83
winbuild-cross.sh Executable file
View File

@@ -0,0 +1,83 @@
#!/bin/bash
LOCAL_LIB="$HOME/usr/lib"
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
F="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
mkdir release
cp README.txt release/
cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
cp /usr/lib/gcc/x86_64-w64-mingw32/5.3-win32/libstdc++-6.dll release/
cp /usr/lib/gcc/x86_64-w64-mingw32/5.3-win32/libgcc_s_seh-1.dll release/
cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -msha -Wall -DFOUR_WAY" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-4way-sha.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-4way.exe
make clean || echo clean
CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx-sha.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-avx2.exe
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=znver1 -Wall" ./configure $F
#make -j
#strip -s cpuminer.exe
#mv cpuminer.exe release/cpuminer-aes-sha.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-avx.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-sse42.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse42.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core2 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse2.exe
make clean || echo clean