cpuminer-opt — mirror of https://github.com/JayDDee/cpuminer-opt.git

Commit: v3.7.9

Makefile.am (34 lines changed)
@@ -38,7 +38,6 @@ cpuminer_SOURCES = \
algo/argon2/ar2/cores.c \
algo/argon2/ar2/ar2-scrypt-jane.c \
algo/argon2/ar2/blake2b.c \
algo/axiom.c \
algo/blake/sph_blake.c \
algo/blake/blake-hash-4way.c \
algo/blake/blake-gate.c \
@@ -56,6 +55,7 @@ cpuminer_SOURCES = \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
algo/bmw/sph_bmw.c \
algo/bmw/bmw-hash-4way.c \
algo/bmw/bmw256.c \
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
@@ -63,10 +63,8 @@ cpuminer_SOURCES = \
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/sse2/cubehash_sse2.c\
algo/drop.c \
algo/echo/sph_echo.c \
algo/echo/aes_ni/hash.c\
algo/fresh.c \
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
@@ -113,9 +111,8 @@ cpuminer_SOURCES = \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
algo/nist5/zr5.c \
algo/pluck.c \
algo/polytimos/polytimos-gate.c \
algo/polytimos/polytimos.c \
algo/quark/quark.c \
algo/qubit/qubit.c \
algo/qubit/deep.c \
@@ -127,6 +124,7 @@ cpuminer_SOURCES = \
algo/sha/sha2.c \
algo/sha/sha256t.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite.c \
@@ -141,15 +139,10 @@ cpuminer_SOURCES = \
algo/skein/skein2.c \
algo/skein/skein2-4way.c \
algo/skein/skein2-gate.c \
algo/skunk.c \
algo/sm3/sm3.c \
algo/tiger/sph_tiger.c \
algo/timetravel.c \
algo/timetravel10.c \
algo/tribus/tribus-gate.c \
algo/tribus/tribus.c \
algo/tribus/tribus-4way.c \
algo/veltor.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
@@ -165,6 +158,10 @@ cpuminer_SOURCES = \
algo/x11/c11-gate.c \
algo/x11/c11.c \
algo/x11/c11-4way.c \
algo/x11/tribus-gate.c \
algo/x11/tribus.c \
algo/x11/tribus-4way.c \
algo/x11/fresh.c \
algo/x11/x11evo.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
@@ -175,9 +172,20 @@ cpuminer_SOURCES = \
algo/x13/phi1612-gate.c \
algo/x13/phi1612.c \
algo/x13/phi1612-4way.c \
algo/x13/skunk-gate.c \
algo/x13/skunk-4way.c \
algo/x13/skunk.c \
algo/x13/drop.c \
algo/x14/x14-gate.c \
algo/x14/x14.c \
algo/x14/x14-4way.c \
algo/x14/veltor-gate.c \
algo/x14/veltor.c \
algo/x14/veltor-4way.c \
algo/x14/polytimos-gate.c \
algo/x14/polytimos.c \
algo/x14/polytimos-4way.c \
algo/x14/axiom.c \
algo/x15/x15-gate.c \
algo/x15/x15.c \
algo/x15/x15-4way.c \
@@ -189,10 +197,8 @@ cpuminer_SOURCES = \
algo/x17/xevan-4way.c \
algo/x17/hmq1725.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c\
algo/yescrypt/yescrypt-simd.c\
algo/zr5.c

algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-simd.c

disable_flags =
@@ -68,7 +68,7 @@ Supported Algorithms
timetravel10 Bitcore
tribus Denarius (DNR)
vanilla blake256r8vnl (VCash)
veltor
veltor (VLT)
whirlpool
whirlpoolx
x11 Dash
@@ -81,6 +81,7 @@ Supported Algorithms
x17
xevan Bitsend
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr16 Yenten (YTN)
zr5 Ziftr

@@ -165,6 +165,13 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------

v3.7.9

Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.
Additional 4way optimizations for X algos.
New algo yescryptr8 for BitZeny, not to be confused with the original
yescrypt Globalboost-Y.
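In this codebase, "4way" means four inputs are hashed in one pass: the four input streams are interleaved word by word so each SIMD lane carries one stream, the layout produced and consumed by helpers such as mm_interleave_4x32 and mm_deinterleave_4x32 seen later in this commit. Below is a minimal plain-C sketch of that layout; the helper names in it are illustrative stand-ins, not the project's SSE2/AVX2 routines.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Interleave four word streams into one buffer laid out
   s0[0], s1[0], s2[0], s3[0], s0[1], s1[1], ...  Each group of four
   consecutive words is what one 128-bit vector holds in 4way code. */
static void interleave_4x32( uint32_t *dst, const uint32_t *s0,
                             const uint32_t *s1, const uint32_t *s2,
                             const uint32_t *s3, size_t nwords )
{
   for ( size_t i = 0; i < nwords; i++ )
   {
      dst[ 4*i + 0 ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}

/* Pull one lane's words back out, e.g. to recover one of the four
   hashes after the vectorised rounds have run. */
static void deinterleave_lane_4x32( uint32_t *dst, const uint32_t *src,
                                    int lane, size_t nwords )
{
   for ( size_t i = 0; i < nwords; i++ )
      dst[i] = src[ 4*i + lane ];
}

int main(void)
{
   uint32_t in[4][20];    /* four 80-byte block headers, one per lane */
   uint32_t vdata[20*4];  /* interleaved view used by a 4way hash     */
   uint32_t lane2[20];

   for ( int l = 0; l < 4; l++ )
      for ( int i = 0; i < 20; i++ )
         in[l][i] = (uint32_t)( l * 100 + i );

   interleave_4x32( vdata, in[0], in[1], in[2], in[3], 20 );
   deinterleave_lane_4x32( lane2, vdata, 2, 20 );

   printf( "%s\n", memcmp( lane2, in[2], sizeof lane2 ) == 0
                   ? "lane 2 round-trips" : "mismatch" );
   return 0;
}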
v3.7.8

Partial 4way optimization for most X algos including c11, xevan, phi, hsr
@@ -219,6 +219,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X17: register_x17_algo ( gate ); break;
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
default:
@@ -278,6 +279,7 @@ const char* const algo_alias_map[][2] =
{
// alias proper
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },
{ "blake256r8vnl", "vanilla" },
{ "blake256r14", "blake" },
@@ -300,10 +302,9 @@ const char* const algo_alias_map[][2] =
// { "sia", "blake2b" },
{ "sib", "x11gost" },
{ "timetravel8", "timetravel" },
{ "yes", "yescrypt" },
{ "ziftr", "zr5" },
{ "yenten", "yescryptr16" },
{ "yescryptr8", "yescrypt" },
{ "yescryptr8k", "yescrypt" },
{ "zcoin", "lyra2z" },
{ "zoin", "lyra2z330" },
{ NULL, NULL }
@@ -36,7 +36,6 @@
#include <string.h>
#include <limits.h>

//#include "sph_blake.h"
#include "blake-hash-4way.h"

#ifdef __cplusplus
@@ -98,18 +97,6 @@ static const unsigned sigma[16][16] = {
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};

/*
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
14 10 4 8 9 15 13 6 1 12 0 2 11 7 5 3
11 8 12 0 5 2 15 13 10 14 3 6 7 1 9 4
7 9 3 1 13 12 11 14 2 6 5 10 4 0 15 8
9 0 5 7 2 4 10 15 14 1 11 12 6 8 3 13
2 12 6 10 0 11 8 3 4 13 7 5 15 14 1 9
12 5 1 15 14 13 4 10 0 7 6 3 9 2 8 11
13 11 7 14 12 1 3 9 5 0 15 4 8 6 2 10
6 15 14 9 11 3 0 8 12 2 13 7 1 4 10 5
10 2 8 4 7 6 1 5 15 11 9 14 3 12 13 0
*/
#endif

#define Z00 0
@@ -914,34 +901,29 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,

ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
// unsigned z = 0x80 >> n;
// unsigned zz = ((ub & -z) | z) & 0xFF;
// u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;

if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
}
else
sc->T0 -= 512 - bit_len;

// if ( ptr <= 48 )
if ( ptr <= 52 )
{
memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
// memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set1_epi32( 0x010000000 ) );
_mm_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
@@ -950,11 +932,11 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
{
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set1_epi32( 0x010000000 );
u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 );
@@ -962,7 +944,6 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_32( sc->H[k] );
// out[k] = sc->H[k];
}

#if defined (__AVX2__)
@@ -975,9 +956,9 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set_epi64x( iv[i], iv[i], iv[i], iv[i] );
sc->H[i] = _mm256_set1_epi64x( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set_epi64x( salt[i], salt[i], salt[i], salt[i] );
sc->S[i] = _mm256_set1_epi64x( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
@@ -1049,12 +1030,12 @@ blake64_4way_close( blake_4way_big_context *sc,
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len;
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
}
else
@@ -1066,10 +1047,7 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 ) );
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
@@ -1082,15 +1060,11 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 );

u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
algo/bmw/bmw-hash-4way.c (new file, 969 lines)
@@ -0,0 +1,969 @@
/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
|
||||
/*
|
||||
* BMW implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "bmw-hash-4way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
//#include "sph_bmw.h"
|
||||
|
||||
//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
|
||||
#define SPH_SMALL_FOOTPRINT_BMW 1
|
||||
//#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
//#undef SPH_ROTL64
|
||||
//#define SPH_ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||
//#define SPH_ROTL64(x,n) mm256_rotl_64(x,n)
|
||||
|
||||
static const sph_u32 IV256[] = {
|
||||
SPH_C32(0x40414243), SPH_C32(0x44454647),
|
||||
SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
|
||||
SPH_C32(0x50515253), SPH_C32(0x54555657),
|
||||
SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F),
|
||||
SPH_C32(0x60616263), SPH_C32(0x64656667),
|
||||
SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F),
|
||||
SPH_C32(0x70717273), SPH_C32(0x74757677),
|
||||
SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
|
||||
};
|
||||
|
||||
#if SPH_64
|
||||
|
||||
static const sph_u64 IV512[] = {
|
||||
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
|
||||
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
|
||||
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
|
||||
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
|
||||
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
|
||||
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
|
||||
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
|
||||
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#define XCAT(x, y) XCAT_(x, y)
|
||||
#define XCAT_(x, y) x ## y
|
||||
|
||||
#define LPAR (
|
||||
|
||||
/*
|
||||
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
|
||||
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
|
||||
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
|
||||
^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
|
||||
#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \
|
||||
^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
|
||||
#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \
|
||||
^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
|
||||
#define ss4(x) (((x) >> 1) ^ (x))
|
||||
#define ss5(x) (((x) >> 2) ^ (x))
|
||||
#define rs1(x) SPH_ROTL32(x, 3)
|
||||
#define rs2(x) SPH_ROTL32(x, 7)
|
||||
#define rs3(x) SPH_ROTL32(x, 13)
|
||||
#define rs4(x) SPH_ROTL32(x, 16)
|
||||
#define rs5(x) SPH_ROTL32(x, 19)
|
||||
#define rs6(x) SPH_ROTL32(x, 23)
|
||||
#define rs7(x) SPH_ROTL32(x, 27)
|
||||
|
||||
#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
|
||||
|
||||
#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
|
||||
(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
|
||||
- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
|
||||
|
||||
#define expand1s_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
|
||||
+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
|
||||
+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
|
||||
+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
|
||||
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
#define expand1s(qf, mf, hf, i16) \
|
||||
expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand1s_(qf, mf, hf, i16, ix, iy) \
|
||||
expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
#define expand2s_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
|
||||
+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
|
||||
+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
|
||||
+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
|
||||
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
#define expand2s(qf, mf, hf, i16) \
|
||||
expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand2s_(qf, mf, hf, i16, ix, iy) \
|
||||
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
*/
|
||||
#if SPH_64
|
||||
|
||||
#define sb0(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
|
||||
_mm256_slli_epi64( (x), 3) ), \
|
||||
_mm256_xor_si256( mm256_rotl_64( (x), 4), \
|
||||
mm256_rotl_64( (x), 37) ) )
|
||||
|
||||
#define sb1(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
|
||||
_mm256_slli_epi64( (x), 2) ), \
|
||||
_mm256_xor_si256( mm256_rotl_64( (x), 13), \
|
||||
mm256_rotl_64( (x), 43) ) )
|
||||
|
||||
#define sb2(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
|
||||
_mm256_slli_epi64( (x), 1) ), \
|
||||
_mm256_xor_si256( mm256_rotl_64( (x), 19), \
|
||||
mm256_rotl_64( (x), 53) ) )
|
||||
|
||||
#define sb3(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
|
||||
_mm256_slli_epi64( (x), 2) ), \
|
||||
_mm256_xor_si256( mm256_rotl_64( (x), 28), \
|
||||
mm256_rotl_64( (x), 59) ) )
|
||||
|
||||
#define sb4(x) \
|
||||
_mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) )
|
||||
|
||||
#define sb5(x) \
|
||||
_mm256_xor_si256( (x), _mm256_srli_epi64( (x), 2 ) )
|
||||
|
||||
#define rb1(x) mm256_rotl_64( x, 5 )
|
||||
#define rb2(x) mm256_rotl_64( x, 11 )
|
||||
#define rb3(x) mm256_rotl_64( x, 27 )
|
||||
#define rb4(x) mm256_rotl_64( x, 32 )
|
||||
#define rb5(x) mm256_rotl_64( x, 37 )
|
||||
#define rb6(x) mm256_rotl_64( x, 43 )
|
||||
#define rb7(x) mm256_rotl_64( x, 53 )
|
||||
|
||||
#define rol_off( M, j, off ) \
|
||||
mm256_rotl_64( M[ ( (j) + (off) ) & 15 ] , \
|
||||
( ( (j) + (off) ) & 15 ) + 1 )
|
||||
|
||||
#define add_elt_b( M, H, j ) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_add_epi64( rol_off( M, j, 0 ), \
|
||||
rol_off( M, j, 3 ) ), \
|
||||
rol_off( M, j, 10 ) ), \
|
||||
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
||||
H[ ( (j)+7 ) & 15 ] )
|
||||
|
||||
#define expand1b( qt, M, H, i ) \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( sb1( qt[ (i)-16 ] ), \
|
||||
sb2( qt[ (i)-15 ] ) ), \
|
||||
_mm256_add_epi64( sb3( qt[ (i)-14 ] ), \
|
||||
sb0( qt[ (i)-13 ] ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( sb1( qt[ (i)-12 ] ), \
|
||||
sb2( qt[ (i)-11 ] ) ), \
|
||||
_mm256_add_epi64( sb3( qt[ (i)-10 ] ), \
|
||||
sb0( qt[ (i)- 9 ] ) ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( sb1( qt[ (i)- 8 ] ), \
|
||||
sb2( qt[ (i)- 7 ] ) ), \
|
||||
_mm256_add_epi64( sb3( qt[ (i)- 6 ] ), \
|
||||
sb0( qt[ (i)- 5 ] ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( sb1( qt[ (i)- 4 ] ), \
|
||||
sb2( qt[ (i)- 3 ] ) ), \
|
||||
_mm256_add_epi64( sb3( qt[ (i)- 2 ] ), \
|
||||
sb0( qt[ (i)- 1 ] ) ) ) ) ), \
|
||||
add_elt_b( M, H, (i)-16 ) )
|
||||
|
||||
#define expand2b( qt, M, H, i) \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ) ), \
|
||||
_mm256_add_epi64( qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ) ), \
|
||||
_mm256_add_epi64( qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ) ), \
|
||||
_mm256_add_epi64( qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ) ), \
|
||||
_mm256_add_epi64( sb4( qt[ (i)- 2 ] ), \
|
||||
sb5( qt[ (i)- 1 ] ) ) ) ) ), \
|
||||
add_elt_b( M, H, (i)-16 ) )
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
#define MAKE_W( i0, op01, i1, op12, i2, op23, i3, op34, i4) \
|
||||
((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
|
||||
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
|
||||
*/
|
||||
|
||||
/*
|
||||
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
|
||||
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
|
||||
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
|
||||
#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13)
|
||||
#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14)
|
||||
#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15)
|
||||
#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13)
|
||||
#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14)
|
||||
#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15)
|
||||
#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14)
|
||||
#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15)
|
||||
#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9)
|
||||
#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10)
|
||||
#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11)
|
||||
#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12)
|
||||
#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13)
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT_BMW
|
||||
|
||||
#define MAKE_Qas do { \
|
||||
unsigned u; \
|
||||
sph_u32 Ws[16]; \
|
||||
Ws[ 0] = Ws0; \
|
||||
Ws[ 1] = Ws1; \
|
||||
Ws[ 2] = Ws2; \
|
||||
Ws[ 3] = Ws3; \
|
||||
Ws[ 4] = Ws4; \
|
||||
Ws[ 5] = Ws5; \
|
||||
Ws[ 6] = Ws6; \
|
||||
Ws[ 7] = Ws7; \
|
||||
Ws[ 8] = Ws8; \
|
||||
Ws[ 9] = Ws9; \
|
||||
Ws[10] = Ws10; \
|
||||
Ws[11] = Ws11; \
|
||||
Ws[12] = Ws12; \
|
||||
Ws[13] = Ws13; \
|
||||
Ws[14] = Ws14; \
|
||||
Ws[15] = Ws15; \
|
||||
for (u = 0; u < 15; u += 5) { \
|
||||
qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
|
||||
qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
|
||||
qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
|
||||
qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
|
||||
qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
|
||||
} \
|
||||
qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
|
||||
} while (0)
|
||||
|
||||
#define MAKE_Qbs do { \
|
||||
qt[16] = expand1s(Qs, M, H, 16); \
|
||||
qt[17] = expand1s(Qs, M, H, 17); \
|
||||
qt[18] = expand2s(Qs, M, H, 18); \
|
||||
qt[19] = expand2s(Qs, M, H, 19); \
|
||||
qt[20] = expand2s(Qs, M, H, 20); \
|
||||
qt[21] = expand2s(Qs, M, H, 21); \
|
||||
qt[22] = expand2s(Qs, M, H, 22); \
|
||||
qt[23] = expand2s(Qs, M, H, 23); \
|
||||
qt[24] = expand2s(Qs, M, H, 24); \
|
||||
qt[25] = expand2s(Qs, M, H, 25); \
|
||||
qt[26] = expand2s(Qs, M, H, 26); \
|
||||
qt[27] = expand2s(Qs, M, H, 27); \
|
||||
qt[28] = expand2s(Qs, M, H, 28); \
|
||||
qt[29] = expand2s(Qs, M, H, 29); \
|
||||
qt[30] = expand2s(Qs, M, H, 30); \
|
||||
qt[31] = expand2s(Qs, M, H, 31); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
#define MAKE_Qas do { \
|
||||
qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
|
||||
qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
|
||||
qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
|
||||
qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
|
||||
qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
|
||||
qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
|
||||
qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
|
||||
qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
|
||||
qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
|
||||
qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
|
||||
qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
|
||||
qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
|
||||
qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
|
||||
qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
|
||||
qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
|
||||
qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
|
||||
} while (0)
|
||||
|
||||
#define MAKE_Qbs do { \
|
||||
qt[16] = expand1s(Qs, M, H, 16); \
|
||||
qt[17] = expand1s(Qs, M, H, 17); \
|
||||
qt[18] = expand2s(Qs, M, H, 18); \
|
||||
qt[19] = expand2s(Qs, M, H, 19); \
|
||||
qt[20] = expand2s(Qs, M, H, 20); \
|
||||
qt[21] = expand2s(Qs, M, H, 21); \
|
||||
qt[22] = expand2s(Qs, M, H, 22); \
|
||||
qt[23] = expand2s(Qs, M, H, 23); \
|
||||
qt[24] = expand2s(Qs, M, H, 24); \
|
||||
qt[25] = expand2s(Qs, M, H, 25); \
|
||||
qt[26] = expand2s(Qs, M, H, 26); \
|
||||
qt[27] = expand2s(Qs, M, H, 27); \
|
||||
qt[28] = expand2s(Qs, M, H, 28); \
|
||||
qt[29] = expand2s(Qs, M, H, 29); \
|
||||
qt[30] = expand2s(Qs, M, H, 30); \
|
||||
qt[31] = expand2s(Qs, M, H, 31); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#define MAKE_Qs do { \
|
||||
MAKE_Qas; \
|
||||
MAKE_Qbs; \
|
||||
} while (0)
|
||||
|
||||
#define Qs(j) (qt[j])
|
||||
*/
|
||||
#if SPH_64
|
||||
|
||||
#define Wb0 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
|
||||
#define Wb1 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
|
||||
#define Wb2 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
|
||||
#define Wb3 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
|
||||
#define Wb4 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
|
||||
#define Wb5 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
|
||||
#define Wb6 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
|
||||
#define Wb7 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
|
||||
#define Wb8 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
|
||||
#define Wb9 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
|
||||
#define Wb10 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
|
||||
#define Wb11 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) )
|
||||
|
||||
#define Wb12 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) )
|
||||
|
||||
#define Wb13 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) )
|
||||
|
||||
#define Wb14 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) )
|
||||
|
||||
#define Wb15 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[ 4], H[4] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
|
||||
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
{
|
||||
__m256i qt[32], xl, xh; \
|
||||
|
||||
qt[ 0] = sb0( Wb0 ) + H[ 1];
|
||||
qt[ 1] = sb1( Wb1 ) + H[ 2];
|
||||
qt[ 2] = sb2( Wb2 ) + H[ 3];
|
||||
qt[ 3] = sb3( Wb3 ) + H[ 4];
|
||||
qt[ 4] = sb4( Wb4 ) + H[ 5];
|
||||
qt[ 5] = sb0( Wb5 ) + H[ 6];
|
||||
qt[ 6] = sb1( Wb6 ) + H[ 7];
|
||||
qt[ 7] = sb2( Wb7 ) + H[ 8];
|
||||
qt[ 8] = sb3( Wb8 ) + H[ 9];
|
||||
qt[ 9] = sb4( Wb9 ) + H[10];
|
||||
qt[10] = sb0( Wb10) + H[11];
|
||||
qt[11] = sb1( Wb11) + H[12];
|
||||
qt[12] = sb2( Wb12) + H[13];
|
||||
qt[13] = sb3( Wb13) + H[14];
|
||||
qt[14] = sb4( Wb14) + H[15];
|
||||
qt[15] = sb0( Wb15) + H[ 0];
|
||||
qt[16] = expand1b( qt, M, H, 16 );
|
||||
qt[17] = expand1b( qt, M, H, 17 );
|
||||
qt[18] = expand2b( qt, M, H, 18 );
|
||||
qt[19] = expand2b( qt, M, H, 19 );
|
||||
qt[20] = expand2b( qt, M, H, 20 );
|
||||
qt[21] = expand2b( qt, M, H, 21 );
|
||||
qt[22] = expand2b( qt, M, H, 22 );
|
||||
qt[23] = expand2b( qt, M, H, 23 );
|
||||
qt[24] = expand2b( qt, M, H, 24 );
|
||||
qt[25] = expand2b( qt, M, H, 25 );
|
||||
qt[26] = expand2b( qt, M, H, 26 );
|
||||
qt[27] = expand2b( qt, M, H, 27 );
|
||||
qt[28] = expand2b( qt, M, H, 28 );
|
||||
qt[29] = expand2b( qt, M, H, 29 );
|
||||
qt[30] = expand2b( qt, M, H, 30 );
|
||||
qt[31] = expand2b( qt, M, H, 31 );
|
||||
xl = _mm256_xor_si256(
|
||||
_mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
|
||||
_mm256_xor_si256( qt[18], qt[19] ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ),
|
||||
_mm256_xor_si256( qt[22], qt[23] ) ) );
|
||||
xh = _mm256_xor_si256( xl,
|
||||
_mm256_xor_si256(
|
||||
_mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
|
||||
_mm256_xor_si256( qt[26], qt[27] ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
|
||||
_mm256_xor_si256( qt[30], qt[31] ) )));
|
||||
dH[ 0] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[0],
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
|
||||
_mm256_srli_epi64( qt[16], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
|
||||
dH[ 1] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[1],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
|
||||
_mm256_slli_epi64( qt[17], 8 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
|
||||
dH[ 2] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[2],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
|
||||
_mm256_slli_epi64( qt[18], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
|
||||
dH[ 3] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[3],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
|
||||
_mm256_slli_epi64( qt[19], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
|
||||
dH[ 4] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[4],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
|
||||
_mm256_slli_epi64( qt[20], 0 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
|
||||
dH[ 5] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[5],
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
|
||||
_mm256_srli_epi64( qt[21], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
|
||||
dH[ 6] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[6],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
|
||||
_mm256_slli_epi64( qt[22], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
|
||||
dH[ 7] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[7],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
|
||||
_mm256_slli_epi64( qt[23], 2 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
|
||||
dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[4], 9 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
|
||||
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[5], 10 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
|
||||
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[6], 11 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
|
||||
_mm256_xor_si256( qt[17], qt[10] ) ) );
|
||||
dH[11] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[7], 12 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
|
||||
_mm256_xor_si256( qt[18], qt[11] ) ) );
|
||||
dH[12] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[0], 13 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
|
||||
_mm256_xor_si256( qt[19], qt[12] ) ) );
|
||||
dH[13] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[1], 14 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
|
||||
_mm256_xor_si256( qt[20], qt[13] ) ) );
|
||||
dH[14] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[2], 15 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
|
||||
_mm256_xor_si256( qt[21], qt[14] ) ) );
|
||||
dH[15] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[3], 16 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
|
||||
_mm256_xor_si256( qt[22], qt[15] ) ) );
|
||||
}
|
||||
|
||||
#endif // 64
|
||||
|
||||
//#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_ROTL32, M, Qs, dH)
|
||||
|
||||
|
||||
/*
|
||||
static void
|
||||
compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
|
||||
{
|
||||
#define M(x) sph_dec32le_aligned(data + 4 * (x))
|
||||
#define H(x) (h[x])
|
||||
#define dH(x) (dh[x])
|
||||
|
||||
FOLDs;
|
||||
|
||||
#undef M
|
||||
#undef H
|
||||
#undef dH
|
||||
}
|
||||
|
||||
static const sph_u32 final_s[16] = {
|
||||
SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
|
||||
SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
|
||||
SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
|
||||
SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
|
||||
SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
|
||||
SPH_C32(0xaaaaaaaf)
|
||||
};
|
||||
|
||||
static void
|
||||
bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
|
||||
{
|
||||
memcpy(sc->H, iv, sizeof sc->H);
|
||||
sc->ptr = 0;
|
||||
#if SPH_64
|
||||
sc->bit_count = 0;
|
||||
#else
|
||||
sc->bit_count_high = 0;
|
||||
sc->bit_count_low = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
||||
{
|
||||
unsigned char *buf;
|
||||
size_t ptr;
|
||||
sph_u32 htmp[16];
|
||||
sph_u32 *h1, *h2;
|
||||
#if !SPH_64
|
||||
sph_u32 tmp;
|
||||
#endif
|
||||
|
||||
#if SPH_64
|
||||
sc->bit_count += (sph_u64)len << 3;
|
||||
#else
|
||||
tmp = sc->bit_count_low;
|
||||
sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
|
||||
if (sc->bit_count_low < tmp)
|
||||
sc->bit_count_high ++;
|
||||
sc->bit_count_high += len >> 29;
|
||||
#endif
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
h1 = sc->H;
|
||||
h2 = htmp;
|
||||
while (len > 0) {
|
||||
size_t clen;
|
||||
|
||||
clen = (sizeof sc->buf) - ptr;
|
||||
if (clen > len)
|
||||
clen = len;
|
||||
memcpy(buf + ptr, data, clen);
|
||||
data = (const unsigned char *)data + clen;
|
||||
len -= clen;
|
||||
ptr += clen;
|
||||
if (ptr == sizeof sc->buf) {
|
||||
sph_u32 *ht;
|
||||
|
||||
compress_small(buf, h1, h2);
|
||||
ht = h1;
|
||||
h1 = h2;
|
||||
h2 = ht;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
sc->ptr = ptr;
|
||||
if (h1 != sc->H)
|
||||
memcpy(sc->H, h1, sizeof sc->H);
|
||||
}
|
||||
|
||||
static void
|
||||
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32)
|
||||
{
|
||||
unsigned char *buf, *out;
|
||||
size_t ptr, u, v;
|
||||
unsigned z;
|
||||
sph_u32 h1[16], h2[16], *h;
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
z = 0x80 >> n;
|
||||
buf[ptr ++] = ((ub & -z) | z) & 0xFF;
|
||||
h = sc->H;
|
||||
if (ptr > (sizeof sc->buf) - 8) {
|
||||
memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
|
||||
compress_small(buf, h, h1);
|
||||
ptr = 0;
|
||||
h = h1;
|
||||
}
|
||||
memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
|
||||
#if SPH_64
|
||||
sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
|
||||
SPH_T64(sc->bit_count + n));
|
||||
#else
|
||||
sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
|
||||
sc->bit_count_low + n);
|
||||
sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
|
||||
SPH_T32(sc->bit_count_high));
|
||||
#endif
|
||||
compress_small(buf, h, h2);
|
||||
for (u = 0; u < 16; u ++)
|
||||
sph_enc32le_aligned(buf + 4 * u, h2[u]);
|
||||
compress_small(buf, final_s, h1);
|
||||
out = dst;
|
||||
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
|
||||
sph_enc32le(out + 4 * u, h1[v]);
|
||||
}
|
||||
*/
|
||||
#if SPH_64
|
||||
|
||||
static const __m256i final_b[16] =
|
||||
{
|
||||
{ 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
|
||||
0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
|
||||
{ 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
|
||||
0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
|
||||
{ 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
|
||||
0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
|
||||
{ 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
|
||||
0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
|
||||
{ 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
|
||||
0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
|
||||
{ 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
|
||||
0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
|
||||
{ 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
|
||||
0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
|
||||
{ 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
|
||||
0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 },
|
||||
{ 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
|
||||
0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 },
|
||||
{ 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
|
||||
0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 },
|
||||
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
|
||||
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
|
||||
{ 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
|
||||
0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab },
|
||||
{ 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
|
||||
0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac },
|
||||
{ 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
|
||||
0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad },
|
||||
{ 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
|
||||
0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae },
|
||||
{ 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
|
||||
0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf }
|
||||
};
|
||||
|
||||
static void
|
||||
bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
|
||||
{
|
||||
for ( int i = 0; i < 16; i++ )
|
||||
sc->H[i] = _mm256_set1_epi64x( iv[i] );
|
||||
sc->ptr = 0;
|
||||
sc->bit_count = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
__m256i *buf;
|
||||
__m256i htmp[16];
|
||||
__m256i *h1, *h2;
|
||||
size_t ptr;
|
||||
const int buf_size = 128; // bytes of one lane, compatible with len
|
||||
|
||||
sc->bit_count += (sph_u64)len << 3;
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
h1 = sc->H;
|
||||
h2 = htmp;
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
ptr += clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
__m256i *ht;
|
||||
compress_big( buf, h1, h2 );
|
||||
ht = h1;
|
||||
h1 = h2;
|
||||
h2 = ht;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
sc->ptr = ptr;
|
||||
if ( h1 != sc->H )
|
||||
memcpy_256( sc->H, h1, 16 );
|
||||
}
|
||||
|
||||
static void
|
||||
bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w64)
|
||||
{
|
||||
__m256i *buf;
|
||||
__m256i h1[16], h2[16], *h;
|
||||
size_t ptr, u, v;
|
||||
unsigned z;
|
||||
const int buf_size = 128; // bytes of one lane, compatible with len
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
z = 0x80 >> n;
|
||||
buf[ ptr>>3 ] = _mm256_set1_epi64x( z );
|
||||
ptr += 8;
|
||||
h = sc->H;
|
||||
|
||||
if ( ptr > (buf_size - 8) )
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
compress_big( buf, h, h1 );
|
||||
ptr = 0;
|
||||
h = h1;
|
||||
}
|
||||
memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
|
||||
buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( sc->bit_count + n );
|
||||
compress_big( buf, h, h2 );
|
||||
for ( u = 0; u < 16; u ++ )
|
||||
buf[u] = h2[u];
|
||||
compress_big( buf, final_b, h1 );
|
||||
for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++)
|
||||
casti_m256i(dst,u) = h1[v];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void
|
||||
bmw256_4way_init(void *cc)
|
||||
{
|
||||
// bmw32_4way_init(cc, IV256);
|
||||
}
|
||||
|
||||
void
|
||||
bmw256_4way(void *cc, const void *data, size_t len)
|
||||
{
|
||||
// bmw32_4way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
bmw256_4way_close(void *cc, void *dst)
|
||||
{
|
||||
// bmw256_4way_addbits_and_close(cc, 0, 0, dst);
|
||||
}
|
||||
|
||||
void
|
||||
bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
// bmw32_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
#if SPH_64
|
||||
|
||||
void
|
||||
bmw512_4way_init(void *cc)
|
||||
{
|
||||
bmw64_4way_init(cc, IV512);
|
||||
}
|
||||
|
||||
void
|
||||
bmw512_4way(void *cc, const void *data, size_t len)
|
||||
{
|
||||
bmw64_4way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
bmw512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
bmw512_4way_addbits_and_close(cc, 0, 0, dst);
|
||||
}
|
||||
|
||||
void
|
||||
bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
bmw64_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
algo/bmw/bmw-hash-4way.h (new file, 154 lines)
@@ -0,0 +1,154 @@
/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* BMW interface. BMW (aka "Blue Midnight Wish") is a family of
|
||||
* functions which differ by their output size; this implementation
|
||||
* defines BMW for output sizes 224, 256, 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_bmw.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef BMW_HASH_H__
|
||||
#define BMW_HASH_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BMW-224.
|
||||
*/
|
||||
#define SPH_SIZE_bmw224 224
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BMW-256.
|
||||
*/
|
||||
#define SPH_SIZE_bmw256 256
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BMW-384.
|
||||
*/
|
||||
#define SPH_SIZE_bmw384 384
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BMW-512.
|
||||
*/
|
||||
#define SPH_SIZE_bmw512 512
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* This structure is a context for BMW-224 and BMW-256 computations:
|
||||
* it contains the intermediate values and some data from the last
|
||||
* entered block. Once a BMW computation has been performed, the
|
||||
* context can be reused for another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running BMW
|
||||
* computation can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
size_t ptr;
|
||||
sph_u32 H[16];
|
||||
#if SPH_64
|
||||
sph_u64 bit_count;
|
||||
#else
|
||||
sph_u32 bit_count_high, bit_count_low;
|
||||
#endif
|
||||
#endif
|
||||
} bmw_4way_small_context;
|
||||
|
||||
typedef bmw_4way_small_context bmw256_4way_context;
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
* This structure is a context for BMW-384 and BMW-512 computations:
|
||||
* it contains the intermediate values and some data from the last
|
||||
* entered block. Once a BMW computation has been performed, the
|
||||
* context can be reused for another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running BMW
|
||||
* computation can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
__m256i buf[16];
|
||||
__m256i H[16];
|
||||
|
||||
// unsigned char buf[128]; /* first field, for alignment */
|
||||
size_t ptr;
|
||||
// sph_u64 H[16];
|
||||
sph_u64 bit_count;
|
||||
#endif
|
||||
} bmw_4way_big_context;
|
||||
|
||||
typedef bmw_4way_big_context bmw512_4way_context;
|
||||
|
||||
#endif
|
||||
|
||||
void bmw256_4way_init(void *cc);
|
||||
|
||||
void bmw256_4way(void *cc, const void *data, size_t len);
|
||||
|
||||
void bmw256_4way_close(void *cc, void *dst);
|
||||
|
||||
void bmw256_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#if SPH_64
|
||||
|
||||
void bmw512_4way_init(void *cc);
|
||||
|
||||
void bmw512_4way(void *cc, const void *data, size_t len);
|
||||
|
||||
void bmw512_4way_close(void *cc, void *dst);
|
||||
|
||||
void bmw512_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -96,34 +96,18 @@ extern "C"{
do { \
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_not( x2 ) ) ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_not( x1 ), x2 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_not( x3 ) ) ); \
x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
x2 = _mm256_xor_si256( x2, tmp ); \
} while (0)

/*
#define Sb(x0, x1, x2, x3, c) do { \
x3 = ~x3; \
x0 ^= (c) & ~x2; \
tmp = (c) ^ (x0 & x1); \
x0 ^= x2 & x3; \
x3 ^= ~x1 & x2; \
x1 ^= x0 & x2; \
x2 ^= x0 & ~x3; \
x0 ^= x1 | x3; \
x3 ^= x1 & x2; \
x1 ^= tmp & x0; \
x2 ^= tmp; \
} while (0)
*/

#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
do { \
x4 = _mm256_xor_si256( x4, x1 ); \
@@ -136,20 +120,6 @@ do { \
x3 = _mm256_xor_si256( x3, x4 ); \
} while (0)


/*
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \
x4 ^= x1; \
x5 ^= x2; \
x6 ^= x3 ^ x0; \
x7 ^= x0; \
x0 ^= x5; \
x1 ^= x6; \
x2 ^= x7 ^ x4; \
x3 ^= x4; \
} while (0)
*/

#if SPH_JH_64

static const sph_u64 C[] = {
@@ -4,13 +4,10 @@

#include <memory.h>
#include <mm_malloc.h>
//#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
//#include "avxdefs.h"

// same size, only difference is the name, lyra2 is done serially
__thread uint64_t* lyra2z_4way_matrix;

bool lyra2z_4way_thread_init()
@@ -26,12 +23,8 @@ void lyra2z_4way_midstate( const void* input )
blake256_4way( &l2z_4way_blake_mid, input, 64 );
}

// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
void lyra2z_4way_hash( void *state, const void *input )
{
// uint32_t _ALIGN(64) hash[16];
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -39,27 +32,21 @@ void lyra2z_4way_hash( void *state, const void *input )
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

// memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
// blake256_4way( &ctx_blake, input + (64*4), 16 );
// blake256_4way_close( &ctx_blake, vhash );

blake256_4way_init( &ctx_blake );
blake256_4way( &ctx_blake, input, 80 );
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );

mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );

memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );

// memcpy(state, hash, 32);
}

int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -67,7 +54,6 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
// uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -90,7 +76,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

// lyra2z_4way_midstate( vdata );
lyra2z_4way_midstate( vdata );

do {
found[0] = found[1] = found[2] = found[3] = false;
@@ -104,42 +90,33 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
printf("found 0\n");
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
/*
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
*/
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
printf("found 2\n");
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
/*
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
printf("found 3\n");
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
*/
n += 2;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);

@@ -149,21 +126,3 @@ printf("found 3\n");

#endif

/*

if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;

} while (nonce < max_nonce && !work_restart[thr_id].restart);

pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
*/
@@ -9,18 +9,15 @@ void lyra2z_set_target( struct work* work, double job_diff )
|
||||
bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
{
|
||||
#ifdef LYRA2Z_4WAY
|
||||
four_way_not_tested();
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z_4way;
|
||||
gate->hash = (void*)&lyra2z_4way_hash;
|
||||
#else
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2z_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z;
|
||||
gate->hash = (void*)&lyra2z_hash;
|
||||
#endif
|
||||
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
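// Note: this unconditional assignment overrides the branch-specific
// optimization flags set above.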
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
gate->set_target = (void*)&lyra2z_set_target;
|
||||
return true;
|
||||
|
@@ -2,7 +2,7 @@
|
||||
|
||||
bool register_nist5_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
#if defined (NIST5_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_nist5_4way;
|
||||
gate->hash = (void*)&nist5hash_4way;
|
||||
|
@@ -1,12 +0,0 @@
|
||||
#ifndef __POLYTIMOS_GATE_H__
|
||||
#define __POLYTIMOS_GATE_H__
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
void polytimos_hash( void *state, const void *input );
|
||||
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
void init_polytimos_context();
|
||||
|
||||
#endif
|
@@ -1,31 +1,20 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#else
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
{
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cubehash;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512_context echo;
|
||||
#else
|
||||
|
@@ -1,23 +1,16 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#else
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
|
618
algo/shabal/shabal-hash-4way.c
Normal file
@@ -0,0 +1,618 @@
|
||||
/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
|
||||
/*
|
||||
* Shabal implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include "shabal-hash-4way.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Part of this code was automatically generated (the part between
|
||||
* the "BEGIN" and "END" markers).
|
||||
*/
|
||||
|
||||
#define sM 16
|
||||
|
||||
#define C32 SPH_C32
|
||||
#define T32 SPH_T32
|
||||
|
||||
#define O1 13
|
||||
#define O2 9
|
||||
#define O3 6
|
||||
|
||||
/*
|
||||
* We copy the state into local variables, so that the compiler knows
|
||||
* that it can optimize them at will.
|
||||
*/
|
||||
|
||||
/* BEGIN -- automatically generated code. */
|
||||
|
||||
#define DECL_STATE \
|
||||
__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
|
||||
A08, A09, A0A, A0B; \
|
||||
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
|
||||
B8, B9, BA, BB, BC, BD, BE, BF; \
|
||||
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
|
||||
C8, C9, CA, CB, CC, CD, CE, CF; \
|
||||
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
|
||||
M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
sph_u32 Wlow, Whigh;
|
||||
|
||||
#define READ_STATE(state) do { \
|
||||
A00 = (state)->A[0]; \
|
||||
A01 = (state)->A[1]; \
|
||||
A02 = (state)->A[2]; \
|
||||
A03 = (state)->A[3]; \
|
||||
A04 = (state)->A[4]; \
|
||||
A05 = (state)->A[5]; \
|
||||
A06 = (state)->A[6]; \
|
||||
A07 = (state)->A[7]; \
|
||||
A08 = (state)->A[8]; \
|
||||
A09 = (state)->A[9]; \
|
||||
A0A = (state)->A[10]; \
|
||||
A0B = (state)->A[11]; \
|
||||
B0 = (state)->B[0]; \
|
||||
B1 = (state)->B[1]; \
|
||||
B2 = (state)->B[2]; \
|
||||
B3 = (state)->B[3]; \
|
||||
B4 = (state)->B[4]; \
|
||||
B5 = (state)->B[5]; \
|
||||
B6 = (state)->B[6]; \
|
||||
B7 = (state)->B[7]; \
|
||||
B8 = (state)->B[8]; \
|
||||
B9 = (state)->B[9]; \
|
||||
BA = (state)->B[10]; \
|
||||
BB = (state)->B[11]; \
|
||||
BC = (state)->B[12]; \
|
||||
BD = (state)->B[13]; \
|
||||
BE = (state)->B[14]; \
|
||||
BF = (state)->B[15]; \
|
||||
C0 = (state)->C[0]; \
|
||||
C1 = (state)->C[1]; \
|
||||
C2 = (state)->C[2]; \
|
||||
C3 = (state)->C[3]; \
|
||||
C4 = (state)->C[4]; \
|
||||
C5 = (state)->C[5]; \
|
||||
C6 = (state)->C[6]; \
|
||||
C7 = (state)->C[7]; \
|
||||
C8 = (state)->C[8]; \
|
||||
C9 = (state)->C[9]; \
|
||||
CA = (state)->C[10]; \
|
||||
CB = (state)->C[11]; \
|
||||
CC = (state)->C[12]; \
|
||||
CD = (state)->C[13]; \
|
||||
CE = (state)->C[14]; \
|
||||
CF = (state)->C[15]; \
|
||||
Wlow = (state)->Wlow; \
|
||||
Whigh = (state)->Whigh; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE(state) do { \
|
||||
(state)->A[0] = A00; \
|
||||
(state)->A[1] = A01; \
|
||||
(state)->A[2] = A02; \
|
||||
(state)->A[3] = A03; \
|
||||
(state)->A[4] = A04; \
|
||||
(state)->A[5] = A05; \
|
||||
(state)->A[6] = A06; \
|
||||
(state)->A[7] = A07; \
|
||||
(state)->A[8] = A08; \
|
||||
(state)->A[9] = A09; \
|
||||
(state)->A[10] = A0A; \
|
||||
(state)->A[11] = A0B; \
|
||||
(state)->B[0] = B0; \
|
||||
(state)->B[1] = B1; \
|
||||
(state)->B[2] = B2; \
|
||||
(state)->B[3] = B3; \
|
||||
(state)->B[4] = B4; \
|
||||
(state)->B[5] = B5; \
|
||||
(state)->B[6] = B6; \
|
||||
(state)->B[7] = B7; \
|
||||
(state)->B[8] = B8; \
|
||||
(state)->B[9] = B9; \
|
||||
(state)->B[10] = BA; \
|
||||
(state)->B[11] = BB; \
|
||||
(state)->B[12] = BC; \
|
||||
(state)->B[13] = BD; \
|
||||
(state)->B[14] = BE; \
|
||||
(state)->B[15] = BF; \
|
||||
(state)->C[0] = C0; \
|
||||
(state)->C[1] = C1; \
|
||||
(state)->C[2] = C2; \
|
||||
(state)->C[3] = C3; \
|
||||
(state)->C[4] = C4; \
|
||||
(state)->C[5] = C5; \
|
||||
(state)->C[6] = C6; \
|
||||
(state)->C[7] = C7; \
|
||||
(state)->C[8] = C8; \
|
||||
(state)->C[9] = C9; \
|
||||
(state)->C[10] = CA; \
|
||||
(state)->C[11] = CB; \
|
||||
(state)->C[12] = CC; \
|
||||
(state)->C[13] = CD; \
|
||||
(state)->C[14] = CE; \
|
||||
(state)->C[15] = CF; \
|
||||
(state)->Wlow = Wlow; \
|
||||
(state)->Whigh = Whigh; \
|
||||
} while (0)
|
||||
|
||||
#define DECODE_BLOCK \
|
||||
do { \
|
||||
M0 = buf[ 0]; \
|
||||
M1 = buf[ 1]; \
|
||||
M2 = buf[ 2]; \
|
||||
M3 = buf[ 3]; \
|
||||
M4 = buf[ 4]; \
|
||||
M5 = buf[ 5]; \
|
||||
M6 = buf[ 6]; \
|
||||
M7 = buf[ 7]; \
|
||||
M8 = buf[ 8]; \
|
||||
M9 = buf[ 9]; \
|
||||
MA = buf[10]; \
|
||||
MB = buf[11]; \
|
||||
MC = buf[12]; \
|
||||
MD = buf[13]; \
|
||||
ME = buf[14]; \
|
||||
MF = buf[15]; \
|
||||
} while (0)
|
||||
|
||||
#define INPUT_BLOCK_ADD \
|
||||
do { \
|
||||
B0 = _mm_add_epi32( B0, M0 );\
|
||||
B1 = _mm_add_epi32( B1, M1 );\
|
||||
B2 = _mm_add_epi32( B2, M2 );\
|
||||
B3 = _mm_add_epi32( B3, M3 );\
|
||||
B4 = _mm_add_epi32( B4, M4 );\
|
||||
B5 = _mm_add_epi32( B5, M5 );\
|
||||
B6 = _mm_add_epi32( B6, M6 );\
|
||||
B7 = _mm_add_epi32( B7, M7 );\
|
||||
B8 = _mm_add_epi32( B8, M8 );\
|
||||
B9 = _mm_add_epi32( B9, M9 );\
|
||||
BA = _mm_add_epi32( BA, MA );\
|
||||
BB = _mm_add_epi32( BB, MB );\
|
||||
BC = _mm_add_epi32( BC, MC );\
|
||||
BD = _mm_add_epi32( BD, MD );\
|
||||
BE = _mm_add_epi32( BE, ME );\
|
||||
BF = _mm_add_epi32( BF, MF );\
|
||||
} while (0)
|
||||
|
||||
#define INPUT_BLOCK_SUB \
|
||||
do { \
|
||||
C0 = _mm_sub_epi32( C0, M0 ); \
|
||||
C1 = _mm_sub_epi32( C1, M1 ); \
|
||||
C2 = _mm_sub_epi32( C2, M2 ); \
|
||||
C3 = _mm_sub_epi32( C3, M3 ); \
|
||||
C4 = _mm_sub_epi32( C4, M4 ); \
|
||||
C5 = _mm_sub_epi32( C5, M5 ); \
|
||||
C6 = _mm_sub_epi32( C6, M6 ); \
|
||||
C7 = _mm_sub_epi32( C7, M7 ); \
|
||||
C8 = _mm_sub_epi32( C8, M8 ); \
|
||||
C9 = _mm_sub_epi32( C9, M9 ); \
|
||||
CA = _mm_sub_epi32( CA, MA ); \
|
||||
CB = _mm_sub_epi32( CB, MB ); \
|
||||
CC = _mm_sub_epi32( CC, MC ); \
|
||||
CD = _mm_sub_epi32( CD, MD ); \
|
||||
CE = _mm_sub_epi32( CE, ME ); \
|
||||
CF = _mm_sub_epi32( CF, MF ); \
|
||||
} while (0)
|
||||
|
||||
#define XOR_W \
|
||||
do { \
|
||||
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
|
||||
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
|
||||
} while (0)
|
||||
/*
|
||||
#define SWAP(v1, v2) do { \
|
||||
sph_u32 tmp = (v1); \
|
||||
(v1) = (v2); \
|
||||
(v2) = tmp; \
|
||||
} while (0)
|
||||
*/
|
||||
#define SWAP_BC \
|
||||
do { \
|
||||
mm_swap_128( B0, C0 ); \
|
||||
mm_swap_128( B1, C1 ); \
|
||||
mm_swap_128( B2, C2 ); \
|
||||
mm_swap_128( B3, C3 ); \
|
||||
mm_swap_128( B4, C4 ); \
|
||||
mm_swap_128( B5, C5 ); \
|
||||
mm_swap_128( B6, C6 ); \
|
||||
mm_swap_128( B7, C7 ); \
|
||||
mm_swap_128( B8, C8 ); \
|
||||
mm_swap_128( B9, C9 ); \
|
||||
mm_swap_128( BA, CA ); \
|
||||
mm_swap_128( BB, CB ); \
|
||||
mm_swap_128( BC, CC ); \
|
||||
mm_swap_128( BD, CD ); \
|
||||
mm_swap_128( BE, CE ); \
|
||||
mm_swap_128( BF, CF ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
|
||||
do { \
|
||||
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
|
||||
_mm_andnot_si128( xb3, xb2 ), \
|
||||
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
|
||||
_mm_mullo_epi32( mm_rotl_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
|
||||
) ), _mm_set1_epi32(3UL) ) ) ) ); \
|
||||
xb0 = mm_not( _mm_xor_si128( xa0, mm_rotl_32( xb0, 1 ) ) ); \
|
||||
} while (0)
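For readers comparing against the scalar reference, the vector macro above computes, per 32-bit lane, the same update as the permutation element in sph_shabal.c. A minimal scalar sketch of that element (hypothetical name, plain C, using SPH_ROTL32 and the T32 truncation macro from sph_types.h) would be:

#define PERM_ELT_SCALAR(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm)   do { \
      (xa0) = T32( ( (xa0) ^ ( SPH_ROTL32( (xa1), 15 ) * 5U ) ^ (xc) ) * 3U ) \
              ^ (xb1) ^ ( (xb2) & ~(xb3) ) ^ (xm); \
      (xb0) = T32( ~( SPH_ROTL32( (xb0), 1 ) ^ (xa0) ) ); \
   } while (0)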
|
||||
|
||||
#define PERM_STEP_0 do { \
|
||||
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_1 do { \
|
||||
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_2 do { \
|
||||
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define APPLY_P \
|
||||
do { \
|
||||
B0 = mm_rotr_32( B0, 15 ); \
|
||||
B1 = mm_rotr_32( B1, 15 ); \
|
||||
B2 = mm_rotr_32( B2, 15 ); \
|
||||
B3 = mm_rotr_32( B3, 15 ); \
|
||||
B4 = mm_rotr_32( B4, 15 ); \
|
||||
B5 = mm_rotr_32( B5, 15 ); \
|
||||
B6 = mm_rotr_32( B6, 15 ); \
|
||||
B7 = mm_rotr_32( B7, 15 ); \
|
||||
B8 = mm_rotr_32( B8, 15 ); \
|
||||
B9 = mm_rotr_32( B9, 15 ); \
|
||||
BA = mm_rotr_32( BA, 15 ); \
|
||||
BB = mm_rotr_32( BB, 15 ); \
|
||||
BC = mm_rotr_32( BC, 15 ); \
|
||||
BD = mm_rotr_32( BD, 15 ); \
|
||||
BE = mm_rotr_32( BE, 15 ); \
|
||||
BF = mm_rotr_32( BF, 15 ); \
|
||||
PERM_STEP_0; \
|
||||
PERM_STEP_1; \
|
||||
PERM_STEP_2; \
|
||||
A0B = _mm_add_epi32( A0B, C6 ); \
|
||||
A0A = _mm_add_epi32( A0A, C5 ); \
|
||||
A09 = _mm_add_epi32( A09, C4 ); \
|
||||
A08 = _mm_add_epi32( A08, C3 ); \
|
||||
A07 = _mm_add_epi32( A07, C2 ); \
|
||||
A06 = _mm_add_epi32( A06, C1 ); \
|
||||
A05 = _mm_add_epi32( A05, C0 ); \
|
||||
A04 = _mm_add_epi32( A04, CF ); \
|
||||
A03 = _mm_add_epi32( A03, CE ); \
|
||||
A02 = _mm_add_epi32( A02, CD ); \
|
||||
A01 = _mm_add_epi32( A01, CC ); \
|
||||
A00 = _mm_add_epi32( A00, CB ); \
|
||||
A0B = _mm_add_epi32( A0B, CA ); \
|
||||
A0A = _mm_add_epi32( A0A, C9 ); \
|
||||
A09 = _mm_add_epi32( A09, C8 ); \
|
||||
A08 = _mm_add_epi32( A08, C7 ); \
|
||||
A07 = _mm_add_epi32( A07, C6 ); \
|
||||
A06 = _mm_add_epi32( A06, C5 ); \
|
||||
A05 = _mm_add_epi32( A05, C4 ); \
|
||||
A04 = _mm_add_epi32( A04, C3 ); \
|
||||
A03 = _mm_add_epi32( A03, C2 ); \
|
||||
A02 = _mm_add_epi32( A02, C1 ); \
|
||||
A01 = _mm_add_epi32( A01, C0 ); \
|
||||
A00 = _mm_add_epi32( A00, CF ); \
|
||||
A0B = _mm_add_epi32( A0B, CE ); \
|
||||
A0A = _mm_add_epi32( A0A, CD ); \
|
||||
A09 = _mm_add_epi32( A09, CC ); \
|
||||
A08 = _mm_add_epi32( A08, CB ); \
|
||||
A07 = _mm_add_epi32( A07, CA ); \
|
||||
A06 = _mm_add_epi32( A06, C9 ); \
|
||||
A05 = _mm_add_epi32( A05, C8 ); \
|
||||
A04 = _mm_add_epi32( A04, C7 ); \
|
||||
A03 = _mm_add_epi32( A03, C6 ); \
|
||||
A02 = _mm_add_epi32( A02, C5 ); \
|
||||
A01 = _mm_add_epi32( A01, C4 ); \
|
||||
A00 = _mm_add_epi32( A00, C3 ); \
|
||||
} while (0)
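// APPLY_P pre-rotates every B word (rotr 15 equals rotl 17 for 32-bit words),
// applies the three permutation rounds, then folds 36 additions of C words
// into the A words, mirroring the scalar Shabal reference.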
|
||||
|
||||
#define INCR_W do { \
|
||||
if ((Wlow = T32(Wlow + 1)) == 0) \
|
||||
Whigh = T32(Whigh + 1); \
|
||||
} while (0)
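// W is the 64-bit block counter kept as two 32-bit halves (Wlow/Whigh);
// it starts at 1 and is incremented once per 64-byte block.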
|
||||
|
||||
static const sph_u32 A_init_256[] = {
|
||||
C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
|
||||
C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
|
||||
C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
|
||||
};
|
||||
|
||||
static const sph_u32 B_init_256[] = {
|
||||
C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
|
||||
C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
|
||||
C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
|
||||
C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
|
||||
};
|
||||
|
||||
static const sph_u32 C_init_256[] = {
|
||||
C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
|
||||
C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
|
||||
C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
|
||||
C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
|
||||
};
|
||||
|
||||
static const sph_u32 A_init_512[] = {
|
||||
C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
|
||||
C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
|
||||
C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
|
||||
};
|
||||
|
||||
static const sph_u32 B_init_512[] = {
|
||||
C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
|
||||
C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
|
||||
C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
|
||||
C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
|
||||
};
|
||||
|
||||
static const sph_u32 C_init_512[] = {
|
||||
C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
|
||||
C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
|
||||
C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
|
||||
C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
|
||||
};
|
||||
|
||||
static void
|
||||
shabal_4way_init( void *cc, unsigned size )
|
||||
{
|
||||
shabal_4way_context *sc = (shabal_4way_context*)cc;
|
||||
int i;
|
||||
|
||||
if ( size == 512 )
|
||||
{
|
||||
for ( i = 0; i < 12; i++ )
|
||||
sc->A[i] = _mm_set1_epi32( A_init_512[i] );
|
||||
for ( i = 0; i < 16; i++ )
|
||||
{
|
||||
sc->B[i] = _mm_set1_epi32( B_init_512[i] );
|
||||
sc->C[i] = _mm_set1_epi32( C_init_512[i] );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < 12; i++ )
|
||||
sc->A[i] = _mm_set1_epi32( A_init_256[i] );
|
||||
for ( i = 0; i < 16; i++ )
|
||||
{
|
||||
sc->B[i] = _mm_set1_epi32( B_init_256[i] );
|
||||
sc->C[i] = _mm_set1_epi32( C_init_256[i] );
|
||||
}
|
||||
}
|
||||
sc->Wlow = 1;
|
||||
sc->Whigh = 0;
|
||||
sc->ptr = 0;
|
||||
}
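// The 256- and 512-bit variants differ only in their IV tables; each scalar
// IV word is broadcast to all four lanes with _mm_set1_epi32.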
|
||||
|
||||
static void
|
||||
shabal_4way_core( void *cc, const unsigned char *data, size_t len )
|
||||
{
|
||||
shabal_4way_context *sc = (shabal_4way_context*)cc;
|
||||
__m128i *buf;
|
||||
__m128i *vdata = (__m128i*)data;
|
||||
const int buf_size = 64;
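// ptr and len count bytes per lane; buf[] holds one 32-bit word per __m128i
// across the four lanes, hence the >>2 conversions below.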
|
||||
size_t ptr;
|
||||
DECL_STATE
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
|
||||
if ( len < (buf_size - ptr ) )
|
||||
{
|
||||
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
READ_STATE(sc);
|
||||
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
|
||||
|
||||
ptr += clen;
|
||||
vdata += clen>>2;
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
DECODE_BLOCK;
|
||||
INPUT_BLOCK_ADD;
|
||||
XOR_W;
|
||||
APPLY_P;
|
||||
INPUT_BLOCK_SUB;
|
||||
SWAP_BC;
|
||||
INCR_W;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
|
||||
unsigned size_words )
|
||||
{
|
||||
shabal_4way_context *sc = (shabal_4way_context*)cc;
|
||||
__m128i *buf;
|
||||
const int buf_size = 64;
|
||||
size_t ptr;
|
||||
int i;
|
||||
unsigned z, zz;
|
||||
DECL_STATE
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
z = 0x80 >> n;
|
||||
zz = ((ub & -z) | z) & 0xFF;
|
||||
buf[ptr>>2] = _mm_set1_epi32( zz );
|
||||
memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
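// Shabal padding: the 0x80 marker (plus any trailing bits) is broadcast to
// all four lanes and the rest of the block is zeroed; the three extra
// finalization rounds below run without incrementing W.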
|
||||
READ_STATE(sc);
|
||||
DECODE_BLOCK;
|
||||
INPUT_BLOCK_ADD;
|
||||
XOR_W;
|
||||
APPLY_P;
|
||||
|
||||
for ( i = 0; i < 3; i ++ )
|
||||
{
|
||||
SWAP_BC;
|
||||
XOR_W;
|
||||
APPLY_P;
|
||||
}
|
||||
|
||||
__m128i *d = (__m128i*)dst;
|
||||
if ( size_words == 16 ) // 512
|
||||
{
|
||||
d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
|
||||
d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
|
||||
d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
|
||||
d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
|
||||
}
|
||||
else // 256
|
||||
{
|
||||
d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
|
||||
d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way_init( void *cc )
|
||||
{
|
||||
shabal_4way_init(cc, 256);
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way( void *cc, const void *data, size_t len )
|
||||
{
|
||||
shabal_4way_core( cc, data, len );
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way_close( void *cc, void *dst )
|
||||
{
|
||||
shabal_4way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst )
|
||||
{
|
||||
shabal_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way_init(void *cc)
|
||||
{
|
||||
shabal_4way_init(cc, 512);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way(void *cc, const void *data, size_t len)
|
||||
{
|
||||
shabal_4way_core(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
shabal_4way_close(cc, 0, 0, dst, 16);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
shabal_4way_close(cc, ub, n, dst, 16);
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
82
algo/shabal/shabal-hash-4way.h
Normal file
@@ -0,0 +1,82 @@
|
||||
/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
|
||||
/**
|
||||
* Shabal interface. Shabal is a family of functions which differ by
|
||||
* their output size; this implementation defines Shabal for output
|
||||
* sizes 192, 224, 256, 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_shabal.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SHABAL_HASH_4WAY_H__
|
||||
#define SHABAL_HASH_4WAY_H__ 1
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#define SPH_SIZE_shabal256 256
|
||||
|
||||
#define SPH_SIZE_shabal512 512
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[16] __attribute__ ((aligned (64)));
|
||||
__m128i A[12], B[16], C[16];
|
||||
sph_u32 Whigh, Wlow;
|
||||
size_t ptr;
|
||||
} shabal_4way_context;
|
||||
|
||||
typedef shabal_4way_context shabal256_4way_context;
|
||||
typedef shabal_4way_context shabal512_4way_context;
|
||||
|
||||
void shabal256_4way_init( void *cc );
|
||||
void shabal256_4way( void *cc, const void *data, size_t len );
|
||||
void shabal256_4way_close( void *cc, void *dst );
|
||||
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
void shabal512_4way_init( void *cc );
|
||||
void shabal512_4way( void *cc, const void *data, size_t len );
|
||||
void shabal512_4way_close( void *cc, void *dst );
|
||||
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
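A note on usage: the 4-way API expects its input already interleaved 4x32 (one 32-bit word per lane, as the callers in this commit prepare with mm_interleave_4x32 from avxdefs.h), and lengths are given in bytes per lane. A minimal sketch of hashing four independent 80-byte messages, assuming <stdint.h> and those interleave helpers, might look like:

#ifdef __AVX2__
// Hypothetical helper, not part of this commit: hash four 80-byte messages
// in one shabal512_4way pass. Assumes mm_interleave_4x32 / mm_deinterleave_4x32
// from avxdefs.h (bit length as the last argument).
static void shabal512_hash_4x( void *out0, void *out1, void *out2, void *out3,
                               const void *in0, const void *in1,
                               const void *in2, const void *in3 )
{
   uint32_t vin [20*4] __attribute__ ((aligned (64)));
   uint32_t vout[16*4] __attribute__ ((aligned (64)));
   shabal512_4way_context ctx;

   mm_interleave_4x32( vin, in0, in1, in2, in3, 640 );   // 80 bytes per lane
   shabal512_4way_init( &ctx );
   shabal512_4way( &ctx, vin, 80 );                      // bytes per lane
   shabal512_4way_close( &ctx, vout );
   mm_deinterleave_4x32( out0, out1, out2, out3, vout, 512 );
}
#endif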
|
||||
|
@@ -7,7 +7,7 @@
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -20,7 +20,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -37,7 +37,7 @@ c11_4way_ctx_holder c11_4way_ctx;
|
||||
void init_c11_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &c11_4way_ctx.blake );
|
||||
sph_bmw512_init( &c11_4way_ctx.bmw );
|
||||
bmw512_4way_init( &c11_4way_ctx.bmw );
|
||||
init_groestl( &c11_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &c11_4way_ctx.skein );
|
||||
jh512_4way_init( &c11_4way_ctx.jh );
|
||||
@@ -63,22 +63,13 @@ void c11_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
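// BMW now has a 4-way implementation, so it runs on the still-interleaved
// data; deinterleaving is deferred until the serial Groestl stage. The same
// change is applied to x11, x11gost, x13 and x13sm3 below.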
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
@@ -11,7 +11,7 @@ bool register_c11_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_c11;
|
||||
gate->hash = (void*)&c11_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
@@ -7,7 +7,7 @@
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -20,7 +20,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -37,7 +37,7 @@ x11_4way_ctx_holder x11_4way_ctx;
|
||||
void init_x11_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x11_4way_ctx.blake );
|
||||
sph_bmw512_init( &x11_4way_ctx.bmw );
|
||||
bmw512_4way_init( &x11_4way_ctx.bmw );
|
||||
init_groestl( &x11_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x11_4way_ctx.skein );
|
||||
jh512_4way_init( &x11_4way_ctx.jh );
|
||||
@@ -63,22 +63,13 @@ void x11_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
@@ -11,7 +11,7 @@ bool register_x11_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x11;
|
||||
gate->hash = (void*)&x11_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
@@ -7,7 +7,7 @@
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -39,7 +39,7 @@ x11gost_4way_ctx_holder x11gost_4way_ctx;
|
||||
void init_x11gost_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x11gost_4way_ctx.blake );
|
||||
sph_bmw512_init( &x11gost_4way_ctx.bmw );
|
||||
bmw512_4way_init( &x11gost_4way_ctx.bmw );
|
||||
init_groestl( &x11gost_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x11gost_4way_ctx.skein );
|
||||
jh512_4way_init( &x11gost_4way_ctx.jh );
|
||||
@@ -65,21 +65,12 @@ void x11gost_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
@@ -110,8 +101,8 @@ void x11gost_4way_hash( void *state, const void *input )
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512_close( &ctx.gost, hash1 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash2 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
|
@@ -11,7 +11,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x11gost;
|
||||
gate->hash = (void*)&x11gost_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
158
algo/x13/skunk-4way.c
Normal file
@@ -0,0 +1,158 @@
|
||||
#include "skunk-gate.h"
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
|
||||
typedef struct {
|
||||
skein512_4way_context skein;
|
||||
cubehashParam cube;
|
||||
sph_fugue512_context fugue;
|
||||
sph_gost512_context gost;
|
||||
} skunk_4way_ctx_holder;
|
||||
|
||||
static __thread skunk_4way_ctx_holder skunk_4way_ctx;
|
||||
|
||||
void skunk_4way_hash( void *output, const void *input )
|
||||
{
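// Skunk = Skein -> CubeHash -> Fugue -> GOST. Only Skein is vectorized 4-way
// here; the remaining stages run serially on each deinterleaved lane.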
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
|
||||
skunk_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) );
|
||||
|
||||
skein512_4way( &ctx.skein, input, 80 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 );
|
||||
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
sph_gost512( &ctx.gost, hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash1 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash2 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash3, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash3 );
|
||||
|
||||
memcpy( output, hash0, 32 );
|
||||
memcpy( output+32, hash1, 32 );
|
||||
memcpy( output+64, hash2, 32 );
|
||||
memcpy( output+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_skunk_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
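// With 4x64 interleaving the 64-bit word w of lane i sits at 32-bit index
// (w*4 + i)*2; the nonce is the high half of word 9, giving 73, 75, 77, 79.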
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
||||
if ( opt_benchmark )
|
||||
((uint32_t*)ptarget)[7] = 0x0cff;
|
||||
for ( int k = 0; k < 19; k++ )
|
||||
be32enc( &endiandata[k], pdata[k] );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
skunk_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
bool skunk_4way_thread_init()
|
||||
{
|
||||
skein512_4way_init( &skunk_4way_ctx.skein );
|
||||
cubehashInit( &skunk_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_fugue512_init( &skunk_4way_ctx.fugue );
|
||||
sph_gost512_init( &skunk_4way_ctx.gost );
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
18
algo/x13/skunk-gate.c
Normal file
@@ -0,0 +1,18 @@
|
||||
#include "skunk-gate.h"
|
||||
|
||||
bool register_skunk_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
#if defined (SKUNK_4WAY)
|
||||
gate->miner_thread_init = (void*)&skunk_4way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_skunk_4way;
|
||||
gate->hash = (void*)&skunk_4way_hash;
|
||||
// init_skunk_4way_ctx();
|
||||
#else
|
||||
gate->miner_thread_init = (void*)&skunk_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_skunk;
|
||||
gate->hash = (void*)&skunkhash;
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
33
algo/x13/skunk-gate.h
Normal file
@@ -0,0 +1,33 @@
|
||||
#ifndef SKUNK_GATE_H__
|
||||
#define SKUNK_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(HASH_4WAY)
|
||||
#define SKUNK_4WAY
|
||||
#endif
|
||||
|
||||
bool register_skunk_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(SKUNK_4WAY)
|
||||
|
||||
void skunk_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_skunk_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
bool skunk_4way_thread_init();
|
||||
//void init_skunk_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void skunkhash( void *state, const void *input );
|
||||
|
||||
int scanhash_skunk( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
bool skunk_thread_init();
|
||||
|
||||
#endif
|
||||
|
@@ -1,10 +1,8 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include "skunk-gate.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
@@ -90,12 +88,3 @@ bool skunk_thread_init()
|
||||
sph_gost512_init( &skunk_ctx.gost );
|
||||
return true;
|
||||
}
|
||||
|
||||
bool register_skunk_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&skunk_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_skunk;
|
||||
gate->hash = (void*)&skunkhash;
|
||||
return true;
|
||||
}
|
@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -22,7 +22,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -41,7 +41,7 @@ x13_4way_ctx_holder x13_4way_ctx __attribute__ ((aligned (64)));
|
||||
void init_x13_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x13_4way_ctx.blake );
|
||||
sph_bmw512_init( &x13_4way_ctx.bmw );
|
||||
bmw512_4way_init( &x13_4way_ctx.bmw );
|
||||
init_groestl( &x13_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x13_4way_ctx.skein );
|
||||
jh512_4way_init( &x13_4way_ctx.jh );
|
||||
@@ -69,22 +69,13 @@ void x13_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -23,7 +23,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -44,7 +44,7 @@ static __thread blake512_4way_context x13sm3_ctx_mid;
|
||||
void init_x13sm3_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x13sm3_4way_ctx.blake );
|
||||
sph_bmw512_init( &x13sm3_4way_ctx.bmw );
|
||||
bmw512_4way_init( &x13sm3_4way_ctx.bmw );
|
||||
init_groestl( &x13sm3_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x13sm3_4way_ctx.skein );
|
||||
jh512_4way_init( &x13sm3_4way_ctx.jh );
|
||||
@@ -76,22 +76,13 @@ void x13sm3_4way_hash( void *state, const void *input )
|
||||
// blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
|
||||
|
185
algo/x14/polytimos-4way.c
Normal file
@@ -0,0 +1,185 @@
|
||||
#include "polytimos-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#include "algo/fugue//sph_fugue.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
//#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
typedef struct {
|
||||
skein512_4way_context skein;
|
||||
shabal512_4way_context shabal;
|
||||
hashState_echo echo;
|
||||
hashState_luffa luffa;
|
||||
sph_fugue512_context fugue;
|
||||
sph_gost512_context gost;
|
||||
} poly_4way_ctx_holder;
|
||||
|
||||
poly_4way_ctx_holder poly_4way_ctx;
|
||||
|
||||
void init_polytimos_4way_ctx()
|
||||
{
|
||||
skein512_4way_init( &poly_4way_ctx.skein );
|
||||
shabal512_4way_init( &poly_4way_ctx.shabal );
|
||||
init_echo( &poly_4way_ctx.echo, 512 );
|
||||
init_luffa( &poly_4way_ctx.luffa, 512 );
|
||||
sph_fugue512_init( &poly_4way_ctx.fugue );
|
||||
sph_gost512_init( &poly_4way_ctx.gost );
|
||||
}
|
||||
|
||||
void polytimos_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
poly_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &poly_4way_ctx, sizeof(poly_4way_ctx) );
|
||||
|
||||
skein512_4way( &ctx.skein, input, 80 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// Need to convert from 64 bit interleaved to 32 bit interleaved.
|
||||
uint32_t vhash32[16*4];
|
||||
mm256_reinterleave_4x32( vhash32, vhash, 512 );
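// Skein's 4-way output is interleaved as 64-bit words, but Shabal operates on
// 32-bit words, so the lane data is re-interleaved to 4x32 before hashing.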
|
||||
shabal512_4way( &ctx.shabal, vhash32, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash32 );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
|
||||
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
sph_gost512( &ctx.gost, hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash1 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash2 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash3, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash3 );
|
||||
|
||||
memcpy( output, hash0, 32 );
|
||||
memcpy( output+32, hash1, 32 );
|
||||
memcpy( output+64, hash2, 32 );
|
||||
memcpy( output+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x0cff;
|
||||
|
||||
for ( int i=0; i < 19; i++ )
|
||||
be32enc( &endiandata[i], pdata[i] );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
polytimos_4way_hash(hash, vdata);
|
||||
pdata[19] = n;
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart));
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
@@ -2,10 +2,16 @@
|
||||
|
||||
bool register_polytimos_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
#ifdef POLYTIMOS_4WAY
|
||||
init_polytimos_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_polytimos_4way;
|
||||
gate->hash = (void*)&polytimos_4way_hash;
|
||||
#else
|
||||
init_polytimos_context();
|
||||
gate->scanhash = (void*)&scanhash_polytimos;
|
||||
gate->hash = (void*)&polytimos_hash;
|
||||
#endif
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
32
algo/x14/polytimos-gate.h
Normal file
@@ -0,0 +1,32 @@
#ifndef POLYTIMOS_GATE_H__
#define POLYTIMOS_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(HASH_4WAY) && defined(__AES__)
#define POLYTIMOS_4WAY
#endif

bool register_polytimos_algo( algo_gate_t* gate );

#if defined(POLYTIMOS_4WAY)

void polytimos_4way_hash( void *state, const void *input );

int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done );

void init_polytimos_4way_ctx();

#endif

void polytimos_hash( void *state, const void *input );

int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done );

void init_polytimos_ctx();

#endif

154
algo/x14/veltor-4way.c
Normal file
@@ -0,0 +1,154 @@
#include "veltor-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#if defined(__AVX2__) && defined(__AES__)

#include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/gost/sph_gost.h"

typedef struct {
   skein512_4way_context skein;
   sph_shavite512_context shavite;
   shabal512_4way_context shabal;
   sph_gost512_context gost;
} veltor_4way_ctx_holder;

veltor_4way_ctx_holder veltor_4way_ctx __attribute__ ((aligned (64)));

void init_veltor_4way_ctx()
{
   skein512_4way_init( &veltor_4way_ctx.skein );
   sph_shavite512_init( &veltor_4way_ctx.shavite );
   shabal512_4way_init( &veltor_4way_ctx.shabal );
   sph_gost512_init( &veltor_4way_ctx.gost );
}

void veltor_4way_hash( void *output, const void *input )
{
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
   uint64_t hash3[8] __attribute__ ((aligned (64)));
   uint64_t vhash[8*4] __attribute__ ((aligned (64)));
   veltor_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) );

   skein512_4way( &ctx.skein, input, 80 );
   skein512_4way_close( &ctx.skein, vhash );
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

   sph_shavite512( &ctx.shavite, hash0, 64 );
   sph_shavite512_close( &ctx.shavite, hash0 );
   sph_shavite512_init( &ctx.shavite );
   sph_shavite512( &ctx.shavite, hash1, 64 );
   sph_shavite512_close( &ctx.shavite, hash1 );
   sph_shavite512_init( &ctx.shavite );
   sph_shavite512( &ctx.shavite, hash2, 64 );
   sph_shavite512_close( &ctx.shavite, hash2 );
   sph_shavite512_init( &ctx.shavite );
   sph_shavite512( &ctx.shavite, hash3, 64 );
   sph_shavite512_close( &ctx.shavite, hash3 );

   mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
   shabal512_4way( &ctx.shabal, vhash, 64 );
   shabal512_4way_close( &ctx.shabal, vhash );
   mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

   sph_gost512( &ctx.gost, hash0, 64 );
   sph_gost512_close( &ctx.gost, hash0 );
   sph_gost512_init( &ctx.gost );
   sph_gost512( &ctx.gost, hash1, 64 );
   sph_gost512_close( &ctx.gost, hash1 );
   sph_gost512_init( &ctx.gost );
   sph_gost512( &ctx.gost, hash2, 64 );
   sph_gost512_close( &ctx.gost, hash2 );
   sph_gost512_init( &ctx.gost );
   sph_gost512( &ctx.gost, hash3, 64 );
   sph_gost512_close( &ctx.gost, hash3 );

   memcpy( output,    hash0, 32 );
   memcpy( output+32, hash1, 32 );
   memcpy( output+64, hash2, 32 );
   memcpy( output+96, hash3, 32 );
}

int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
   uint32_t *noncep1 = vdata + 75;
   uint32_t *noncep2 = vdata + 77;
   uint32_t *noncep3 = vdata + 79;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);

   if ( opt_benchmark )
      ptarget[7] = 0x0cff;
   for ( int i=0; i < 19; i++ )
   {
      be32enc( &endiandata[i], pdata[i] );
   }

   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
   do
   {
      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep0, n );
      be32enc( noncep1, n+1 );
      be32enc( noncep2, n+2 );
      be32enc( noncep3, n+3 );

      veltor_4way_hash( hash, vdata );
      pdata[19] = n;

      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
      {
         found[0] = true;
         num_found++;
         nonces[0] = n;
         work_set_target_ratio( work, hash );
      }
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
      {
         found[1] = true;
         num_found++;
         nonces[1] = n+1;
         work_set_target_ratio( work, hash+8 );
      }
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
      {
         found[2] = true;
         num_found++;
         nonces[2] = n+2;
         work_set_target_ratio( work, hash+16 );
      }
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
      {
         found[3] = true;
         num_found++;
         nonces[3] = n+3;
         work_set_target_ratio( work, hash+24 );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
   *hashes_done = n - first_nonce + 1;
   return num_found;
}

#endif
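veltor_4way_hash above changes lane width in mid-chain: skein runs four ways on 64-bit lanes, shavite and gost run serially per lane, and shabal runs four ways on 32-bit lanes, so the four 512-bit hashes are re-packed with mm_interleave_4x32 and unpacked again with mm_deinterleave_4x32. The scalar model below sketches that 32-bit packing and checks that it round-trips; it is not the SSE code from avxdefs.h, and the helper names are invented for the example.

// One 32-bit word per lane: dst[4*i + lane] = src_lane[i], and back.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void interleave_4x32_model( uint32_t *dst, const uint32_t *s0,
      const uint32_t *s1, const uint32_t *s2, const uint32_t *s3, int bit_len )
{
   const uint32_t *s[4] = { s0, s1, s2, s3 };
   for ( int i = 0; i < bit_len/32; i++ )
      for ( int lane = 0; lane < 4; lane++ )
         dst[ 4*i + lane ] = s[lane][i];
}

static void deinterleave_4x32_model( uint32_t *d0, uint32_t *d1, uint32_t *d2,
      uint32_t *d3, const uint32_t *src, int bit_len )
{
   uint32_t *d[4] = { d0, d1, d2, d3 };
   for ( int i = 0; i < bit_len/32; i++ )
      for ( int lane = 0; lane < 4; lane++ )
         d[lane][i] = src[ 4*i + lane ];
}

int main()
{
   uint32_t h[4][16], v[64], out[4][16];     // 512 bits per lane, 4 lanes
   for ( int lane = 0; lane < 4; lane++ )
      for ( int i = 0; i < 16; i++ ) h[lane][i] = lane*100 + i;

   interleave_4x32_model( v, h[0], h[1], h[2], h[3], 512 );
   deinterleave_4x32_model( out[0], out[1], out[2], out[3], v, 512 );

   printf( "round trip %s\n", memcmp( h, out, sizeof h ) ? "failed" : "ok" );
   return 0;
}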
18
algo/x14/veltor-gate.c
Normal file
@@ -0,0 +1,18 @@
#include "veltor-gate.h"

bool register_veltor_algo( algo_gate_t* gate )
{
#if defined (VELTOR_4WAY)
   init_veltor_4way_ctx();
   gate->scanhash = (void*)&scanhash_veltor_4way;
   gate->hash = (void*)&veltor_4way_hash;
#else
   init_veltor_ctx();
   gate->scanhash = (void*)&scanhash_veltor;
   gate->hash = (void*)&veltor_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
};

32
algo/x14/veltor-gate.h
Normal file
@@ -0,0 +1,32 @@
#ifndef VELTOR_GATE_H__
#define VELTOR_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(HASH_4WAY) && defined(__AES__)
#define VELTOR_4WAY
#endif

bool register_veltor_algo( algo_gate_t* gate );

#if defined(VELTOR_4WAY)

void veltor_4way_hash( void *state, const void *input );

int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done );

void init_veltor_4way_ctx();

#endif

void veltor_hash( void *state, const void *input );

int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done );

void init_veltor_ctx();

#endif

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "veltor-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -34,7 +34,7 @@ void veltor_skein512_midstate( const void* input )
   sph_skein512( &veltor_skein_mid, input, 64 );
}

void veltorhash(void *output, const void *input)
void veltor_hash(void *output, const void *input)
{
   uint32_t _ALIGN(64) hashA[16], hashB[16];

@@ -85,7 +85,7 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t

   do {
      be32enc(&endiandata[19], nonce);
      veltorhash(hash, endiandata);
      veltor_hash(hash, endiandata);

      if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
         work_set_target_ratio(work, hash);
@@ -101,14 +101,3 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
}

bool register_veltor_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | AES_OPT;
   init_veltor_ctx();
   gate->scanhash = (void*)&scanhash_veltor;
   gate->hash = (void*)&veltorhash;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
}

@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -20,11 +20,11 @@
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -36,7 +36,7 @@ typedef struct {
|
||||
hashState_echo echo;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_shabal512_context shabal;
|
||||
shabal512_4way_context shabal;
|
||||
} x14_4way_ctx_holder;
|
||||
|
||||
x14_4way_ctx_holder x14_4way_ctx __attribute__ ((aligned (64)));
|
||||
@@ -44,6 +44,7 @@ x14_4way_ctx_holder x14_4way_ctx __attribute__ ((aligned (64)));
|
||||
void init_x14_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x14_4way_ctx.blake );
|
||||
bmw512_4way_init( &x14_4way_ctx.bmw );
|
||||
sph_bmw512_init( &x14_4way_ctx.bmw );
|
||||
init_groestl( &x14_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x14_4way_ctx.skein );
|
||||
@@ -56,7 +57,7 @@ void init_x14_4way_ctx()
|
||||
init_echo( &x14_4way_ctx.echo, 512 );
|
||||
sph_hamsi512_init( &x14_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &x14_4way_ctx.fugue );
|
||||
sph_shabal512_init( &x14_4way_ctx.shabal );
|
||||
shabal512_4way_init( &x14_4way_ctx.shabal );
|
||||
};
|
||||
|
||||
void x14_4way_hash( void *state, const void *input )
|
||||
@@ -73,22 +74,13 @@ void x14_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
@@ -113,7 +105,7 @@ void x14_4way_hash( void *state, const void *input )
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial to the end
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 7 Luffa
|
||||
@@ -144,9 +136,9 @@ void x14_4way_hash( void *state, const void *input )
|
||||
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
|
||||
@@ -206,19 +198,12 @@ void x14_4way_hash( void *state, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 14 Shabal
|
||||
sph_shabal512( &ctx.shabal, hash0, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
|
||||
// 14 Shabal, parallel 32 bit
|
||||
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
|
@@ -11,7 +11,7 @@ bool register_x14_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_x14;
   gate->hash = (void*)&x14hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
};

@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -20,12 +20,12 @@
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -37,7 +37,7 @@ typedef struct {
|
||||
hashState_echo echo;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_shabal512_context shabal;
|
||||
shabal512_4way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
} x15_4way_ctx_holder;
|
||||
|
||||
@@ -46,6 +46,7 @@ x15_4way_ctx_holder x15_4way_ctx __attribute__ ((aligned (64)));
|
||||
void init_x15_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x15_4way_ctx.blake );
|
||||
bmw512_4way_init( &x15_4way_ctx.bmw );
|
||||
sph_bmw512_init( &x15_4way_ctx.bmw );
|
||||
init_groestl( &x15_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x15_4way_ctx.skein );
|
||||
@@ -58,7 +59,7 @@ void init_x15_4way_ctx()
|
||||
init_echo( &x15_4way_ctx.echo, 512 );
|
||||
sph_hamsi512_init( &x15_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &x15_4way_ctx.fugue );
|
||||
sph_shabal512_init( &x15_4way_ctx.shabal );
|
||||
shabal512_4way_init( &x15_4way_ctx.shabal );
|
||||
sph_whirlpool_init( &x15_4way_ctx.whirlpool );
|
||||
};
|
||||
|
||||
@@ -76,22 +77,13 @@ void x15_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
@@ -209,18 +201,11 @@ void x15_4way_hash( void *state, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 14 Shabal
|
||||
sph_shabal512( &ctx.shabal, hash0, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
// 14 Shabal, parallel 32 bit
|
||||
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 15 Whirlpool
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
|
@@ -11,7 +11,7 @@ bool register_x15_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_x15;
   gate->hash = (void*)&x15hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   return true;
};

@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -17,17 +17,16 @@
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include <openssl/sha.h>
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -39,7 +38,7 @@ typedef struct {
|
||||
hashState_echo echo;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_shabal512_context shabal;
|
||||
shabal512_4way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
SHA512_CTX sha512;
|
||||
sph_haval256_5_context haval;
|
||||
@@ -50,7 +49,7 @@ x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64)));
|
||||
void init_x17_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x17_4way_ctx.blake );
|
||||
sph_bmw512_init( &x17_4way_ctx.bmw );
|
||||
bmw512_4way_init( &x17_4way_ctx.bmw );
|
||||
init_groestl( &x17_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x17_4way_ctx.skein );
|
||||
jh512_4way_init( &x17_4way_ctx.jh );
|
||||
@@ -62,8 +61,7 @@ void init_x17_4way_ctx()
|
||||
init_echo( &x17_4way_ctx.echo, 512 );
|
||||
sph_hamsi512_init( &x17_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &x17_4way_ctx.fugue );
|
||||
sph_shabal512_init( &x17_4way_ctx.shabal );
|
||||
sph_whirlpool_init( &x17_4way_ctx.whirlpool );
|
||||
shabal512_4way_init( &x17_4way_ctx.shabal );
|
||||
SHA512_Init( &x17_4way_ctx.sha512 );
|
||||
sph_haval256_5_init( &x17_4way_ctx.haval );
|
||||
};
|
||||
@@ -82,22 +80,13 @@ void x17_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
@@ -215,18 +204,11 @@ void x17_4way_hash( void *state, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 14 Shabal
|
||||
sph_shabal512( &ctx.shabal, hash0, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
// 14 Shabal, parallel 32 bit
|
||||
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 15 Whirlpool
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
|
@@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_x17;
   gate->hash = (void*)&x17_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   return true;
};

@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
@@ -19,7 +19,7 @@
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
@@ -27,7 +27,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -39,7 +39,7 @@ typedef struct {
|
||||
hashState_echo echo;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_shabal512_context shabal;
|
||||
shabal512_4way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
SHA512_CTX sha512;
|
||||
sph_haval256_5_context haval;
|
||||
@@ -52,7 +52,7 @@ static __thread blake512_4way_context xevan_blake_4way_mid
|
||||
void init_xevan_4way_ctx()
|
||||
{
|
||||
blake512_4way_init(&xevan_4way_ctx.blake);
|
||||
sph_bmw512_init(&xevan_4way_ctx.bmw);
|
||||
bmw512_4way_init( &xevan_4way_ctx.bmw );
|
||||
init_groestl( &xevan_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init(&xevan_4way_ctx.skein);
|
||||
jh512_4way_init(&xevan_4way_ctx.jh);
|
||||
@@ -64,7 +64,7 @@ void init_xevan_4way_ctx()
|
||||
init_echo( &xevan_4way_ctx.echo, 512 );
|
||||
sph_hamsi512_init( &xevan_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &xevan_4way_ctx.fugue );
|
||||
sph_shabal512_init( &xevan_4way_ctx.shabal );
|
||||
shabal512_4way_init( &xevan_4way_ctx.shabal );
|
||||
sph_whirlpool_init( &xevan_4way_ctx.whirlpool );
|
||||
SHA512_Init( &xevan_4way_ctx.sha512 );
|
||||
sph_haval256_5_init( &xevan_4way_ctx.haval );
|
||||
@@ -90,25 +90,18 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
xevan_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
|
||||
|
||||
// parallel way
|
||||
memcpy( &ctx.blake, &xevan_blake_4way_mid,
|
||||
sizeof(xevan_blake_4way_mid) );
|
||||
blake512_4way( &ctx.blake, input + (midlen<<2), tail );
|
||||
blake512_4way_close(&ctx.blake, vhash);
|
||||
|
||||
memset( &vhash[8<<2], 0, 64<<2 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
sph_bmw512( &ctx.bmw, hash0, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
bmw512_4way( &ctx.bmw, vhash, dataLen );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
|
||||
dataLen<<3 );
|
||||
@@ -122,6 +115,7 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
|
||||
dataLen<<3 );
|
||||
|
||||
// Parallel 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
|
||||
skein512_4way( &ctx.skein, vhash, dataLen );
|
||||
@@ -133,6 +127,7 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
keccak512_4way( &ctx.keccak, vhash, dataLen );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
@@ -222,21 +217,13 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
sph_shabal512( &ctx.shabal, hash0, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
// Parallel 4way
|
||||
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
shabal512_4way( &ctx.shabal, vhash, dataLen );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
// Serial
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
|
||||
@@ -286,19 +273,10 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
blake512_4way( &ctx.blake, vhash, dataLen );
|
||||
blake512_4way_close(&ctx.blake, vhash);
|
||||
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
bmw512_4way( &ctx.bmw, vhash, dataLen );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
sph_bmw512( &ctx.bmw, hash0, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
|
||||
dataLen<<3 );
|
||||
@@ -412,20 +390,10 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
sph_shabal512( &ctx.shabal, hash0, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
shabal512_4way( &ctx.shabal, vhash, dataLen );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
@@ -480,7 +448,6 @@ int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
// uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
@@ -16,7 +16,7 @@ bool register_xevan_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_xevan;
   gate->hash = (void*)&xevan_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->set_target = (void*)&xevan_set_target;
   gate->get_max64 = (void*)&get_max64_0xffffLL;
   return true;
@@ -438,6 +438,20 @@ bool register_yescrypt_algo( algo_gate_t* gate )
   return true;
}

bool register_yescryptr8_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash = (void*)&scanhash_yescrypt;
   gate->hash = (void*)&yescrypt_hash;
   gate->set_target = (void*)&scrypt_set_target;
   gate->get_max64 = (void*)&yescrypt_get_max64;
   client_key_hack = false;
   YESCRYPT_N = 2048;
   YESCRYPT_R = 8;
   YESCRYPT_P = 1;
   return true;
}

bool register_yescryptr16_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | SHA_OPT;
184
avxdefs.h
@@ -37,7 +37,7 @@
#define mm_one_16 _mm_set1_epi16( 1U )

// Constant minus 1
#define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFUL )
#define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )

//
// Basic operations without equivalent SIMD intrinsic
@@ -55,11 +55,11 @@

// Return bit n in position, all other bits zeroed.
#define mm_bitextract_64 ( x, n ) \
   _mm_and_si128( _mm_set1_epi64x( 1ULL << (n) ), x )
   _mm_and_si128( _mm_slli_epi64( mm_one_64, n ), x )
#define mm_bitextract_32 ( x, n ) \
   _mm_and_si128( _mm_set1_epi32( 1UL << (n) ), x )
   _mm_and_si128( _mm_slli_epi32( mm_one_32, n ), x )
#define mm_bitextract_16 ( x, n ) \
   _mm_and_si128( _mm_set1_epi16( 1U << (n) ), x )
   _mm_and_si128( _mm_slli_epi16( mm_one_16, n ), x )

// Return bit n as bool
#define mm_bittest_64( x, n ) \
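Two real fixes sit in the hunks above: mm_neg1 previously set only the low 32 bits of each 64-bit element, and the bit-extract family now builds its mask by shifting a vector of ones instead of broadcasting a scalar shift. The snippet below exercises the corrected forms on SSE2. It is a standalone test sketch, not part of avxdefs.h: mm_one_64 is defined locally on the same pattern as mm_one_16, and the macro is written without the space after its name so it expands as a function-like macro.

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>
#include <emmintrin.h>

#define mm_one_64  _mm_set1_epi64x( 1ULL )
#define mm_neg1    _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
#define mm_bitextract_64( x, n ) \
   _mm_and_si128( _mm_slli_epi64( mm_one_64, n ), x )

int main()
{
   __m128i v = _mm_set_epi64x( 0x8000000000000001ULL, 0x10ULL );
   uint64_t out[2];

   // Bit 4 is set only in the low element:
   // prints 0000000000000000 0000000000000010
   _mm_storeu_si128( (__m128i*)out, mm_bitextract_64( v, 4 ) );
   printf( "bit 4 : %016" PRIx64 " %016" PRIx64 "\n", out[1], out[0] );

   // All 64 bits of each element are now ones.
   _mm_storeu_si128( (__m128i*)out, mm_neg1 );
   printf( "neg1  : %016" PRIx64 " %016" PRIx64 "\n", out[1], out[0] );
   return 0;
}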
@@ -343,11 +343,11 @@ inline __m128i mm_byteswap_16( __m128i x )
|
||||
|
||||
// return bit n in position, all othr bits cleared
|
||||
#define mm256_bitextract_64 ( x, n ) \
|
||||
_mm256_and_si128( _mm256_set1_epi64x( 0ULL << (n) ), x )
|
||||
_mm256_and_si128( _mm256_slli_epi64( mm256_one_64, n ), x )
|
||||
#define mm256_bitextract_32 ( x, n ) \
|
||||
_mm256_and_si128( _mm256_set1_epi32( 0UL << (n) ), x )
|
||||
_mm256_and_si128( _mm256_slli_epi32( mm256_one_32, n ), x )
|
||||
#define mm256_bitextract_16 ( x, n ) \
|
||||
_mm256_and_si128( _mm256_set1_epi16( 0U << (n) ), x )
|
||||
_mm256_and_si128( _mm256_slli_epi16( mm256_one_16, n ), x )
|
||||
|
||||
// Return bit n as bool (bit 0)
|
||||
#define mm256_bittest_64( x, n ) \
|
||||
@@ -359,17 +359,17 @@ inline __m128i mm_byteswap_16( __m128i x )
|
||||
|
||||
// Return x with bit n set/cleared in all elements
|
||||
#define mm256_bitset_64( x, n ) \
|
||||
_mm256_or_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
|
||||
_mm256_or_si256( _mm256_slli_epi64( mm256_one_64, n ), x )
|
||||
#define mm256_bitclr_64( x, n ) \
|
||||
_mm256_andnot_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
|
||||
_mm256_andnot_si256( _mm256_slli_epi64( mm256_one_64, n ), x )
|
||||
#define mm256_bitset_32( x, n ) \
|
||||
_mm256_or_si256( _mm256_set1_epi32( 1UL << (n) ), x )
|
||||
_mm256_or_si256( _mm256_slli_epi32( mm256_one_32, n ), x )
|
||||
#define mm256_bitclr_32( x, n ) \
|
||||
_mm256_andnot_si256( mm256_not( _mm256_set1_epi32( 1UL << (n) ), x )
|
||||
_mm256_andnot_si256( _mm256_slli_epi32( mm256_one_32, n ), x )
|
||||
#define mm256_bitset_16( x, n ) \
|
||||
_mm256_or_si256( _mm256_set1_epi16( 1U << (n) ), x )
|
||||
_mm256_or_si256( _mm256_slli_epi16( mm256_one_16, n ), x )
|
||||
#define mm256_bitclr_16( x, n ) \
|
||||
_mm256_andnot_si256( _mm256_set1_epi16( 1U << (n) ), x )
|
||||
_mm256_andnot_si256( _mm256_slli_epi16( mm256_one_16, n ), x )
|
||||
|
||||
// Return x with bit n toggled
|
||||
#define mm256_bitflip_64( x, n ) \
|
||||
@@ -448,22 +448,21 @@ inline bool memcmp_256( __m256i src1, __m256i src2, int n )
|
||||
// shift, a little more work is needed.
|
||||
|
||||
// Optimized 64 bit permutations
|
||||
// Swap 128, aka rotate 2x64, 4x32, 8x16, 16x8
|
||||
// Swap 128 bit elements in 256 bit vector
|
||||
#define mm256_swap_128( w ) _mm256_permute4x64_epi64( w, 0x4e )
|
||||
//#define mm256_swap_128( w ) _mm256_permute2x128_si256( w, w, 1 )
|
||||
|
||||
// Rotate 256 bit vector by one 64 bit element, aka 2x32, 4x16, 8x8
|
||||
// Rotate 256 bit vector by one 64 bit element
|
||||
#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 )
|
||||
#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 )
|
||||
|
||||
// Swap hi/lo 64 bits in each 128 bit element
|
||||
// Swap 64 bits in each 128 bit element of 256 bit vector
|
||||
#define mm256_swap128_64( x ) _mm256_shuffle_epi32( x, 0x4e )
|
||||
|
||||
// Rotate 128 bit elements by 32 bits
|
||||
// Rotate 128 bit elements in 256 bit vector by 32 bits
|
||||
#define mm256_rotr128_1x32( x ) _mm256_shuffle_epi32( x, 0x39 )
|
||||
#define mm256_rotl128_1x32( x ) _mm256_shuffle_epi32( x, 0x93 )
|
||||
|
||||
// Swap hi/lo 32 bits in each 64 bit element
|
||||
// Swap 32 bits in each 64 bit element olf 256 bit vector
|
||||
#define mm256_swap64_32( x ) _mm256_shuffle_epi32( x, 0xb1 )
|
||||
|
||||
// Less efficient but more versatile. Use only for rotations that are not
|
||||
@@ -487,9 +486,9 @@ inline bool memcmp_256( __m256i src1, __m256i src2, int n )
|
||||
// Rotate two 256 bit vectors as one 512 bit vector
|
||||
|
||||
// Fast but limited to 128 bit granularity
|
||||
#define mm256_swap512_256(a, b) _mm256_permute2x128_si256( a, b, 0x1032 )
|
||||
#define mm256_rotr512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x0321 )
|
||||
#define mm256_rotl512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x2103 )
|
||||
#define mm256_swap512_256(a, b) _mm256_permute2x128_si256( a, b, 0x4e )
|
||||
#define mm256_rotr512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x39 )
|
||||
#define mm256_rotl512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x93 )
|
||||
|
||||
// Much slower, for 64 and 32 bit granularity
|
||||
#define mm256_rotr512_1x64(a, b) \
|
||||
@@ -677,6 +676,23 @@ inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1,
|
||||
d[17] = _mm_set_epi32( s3[17], s2[17], s1[17], s0[17] );
|
||||
d[18] = _mm_set_epi32( s3[18], s2[18], s1[18], s0[18] );
|
||||
d[19] = _mm_set_epi32( s3[19], s2[19], s1[19], s0[19] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d[20] = _mm_set_epi32( s3[20], s2[20], s1[20], s0[20] );
|
||||
d[21] = _mm_set_epi32( s3[21], s2[21], s1[21], s0[21] );
|
||||
d[22] = _mm_set_epi32( s3[22], s2[22], s1[22], s0[22] );
|
||||
d[23] = _mm_set_epi32( s3[23], s2[23], s1[23], s0[23] );
|
||||
|
||||
d[24] = _mm_set_epi32( s3[24], s2[24], s1[24], s0[24] );
|
||||
d[25] = _mm_set_epi32( s3[25], s2[25], s1[25], s0[25] );
|
||||
d[26] = _mm_set_epi32( s3[26], s2[26], s1[26], s0[26] );
|
||||
d[27] = _mm_set_epi32( s3[27], s2[27], s1[27], s0[27] );
|
||||
d[28] = _mm_set_epi32( s3[28], s2[28], s1[28], s0[28] );
|
||||
d[29] = _mm_set_epi32( s3[29], s2[29], s1[29], s0[29] );
|
||||
d[30] = _mm_set_epi32( s3[30], s2[30], s1[30], s0[30] );
|
||||
d[31] = _mm_set_epi32( s3[31], s2[31], s1[31], s0[31] );
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
// bit_len must be multiple of 32
|
||||
@@ -735,6 +751,24 @@ inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
|
||||
d1[4] = _mm_set_epi32( s[77], s[73], s[69], s[65] );
|
||||
d2[4] = _mm_set_epi32( s[78], s[74], s[70], s[66] );
|
||||
d3[4] = _mm_set_epi32( s[79], s[75], s[71], s[67] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d0[5] = _mm_set_epi32( s[92], s[88], s[84], s[80] );
|
||||
d1[5] = _mm_set_epi32( s[93], s[89], s[85], s[81] );
|
||||
d2[5] = _mm_set_epi32( s[94], s[90], s[86], s[82] );
|
||||
d3[5] = _mm_set_epi32( s[95], s[91], s[87], s[83] );
|
||||
|
||||
d0[6] = _mm_set_epi32( s[108], s[104], s[100], s[ 96] );
|
||||
d1[6] = _mm_set_epi32( s[109], s[105], s[101], s[ 97] );
|
||||
d2[6] = _mm_set_epi32( s[110], s[106], s[102], s[ 98] );
|
||||
d3[6] = _mm_set_epi32( s[111], s[107], s[103], s[ 99] );
|
||||
|
||||
d0[7] = _mm_set_epi32( s[124], s[120], s[116], s[112] );
|
||||
d1[7] = _mm_set_epi32( s[125], s[121], s[117], s[113] );
|
||||
d2[7] = _mm_set_epi32( s[126], s[122], s[118], s[114] );
|
||||
d3[7] = _mm_set_epi32( s[127], s[123], s[119], s[115] );
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
// deinterleave 4 arrays into individual buffers for scalarm processing
|
||||
@@ -1074,6 +1108,41 @@ inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
|
||||
}
|
||||
}
|
||||
|
||||
// Can't do it in place
|
||||
inline void mm256_reinterleave_4x64x( void *dst, void *src, int bit_len )
|
||||
{
|
||||
__m256i* d = (__m256i*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
|
||||
d[0] = _mm256_set_epi32( s[7], s[3], s[6], s[2], s[5], s[1], s[4], s[0] );
|
||||
d[1] = _mm256_set_epi32( s[15],s[11],s[14],s[10],s[13],s[9],s[12], s[8] );
|
||||
d[2] = _mm256_set_epi32( s[23],s[19],s[22],s[18],s[21],s[17],s[20],s[16] );
|
||||
d[3] = _mm256_set_epi32( s[31],s[27],s[30],s[26],s[29],s[25],s[28],s[24] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[4] = _mm256_set_epi32( s[39],s[35],s[38],s[34],s[37],s[33],s[36],s[32] );
|
||||
d[5] = _mm256_set_epi32( s[47],s[43],s[46],s[42],s[45],s[41],s[44],s[40] );
|
||||
d[6] = _mm256_set_epi32( s[55],s[51],s[54],s[50],s[53],s[49],s[52],s[48] );
|
||||
d[7] = _mm256_set_epi32( s[63],s[59],s[62],s[58],s[61],s[57],s[60],s[56] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[8] = _mm256_set_epi32( s[71],s[67],s[70],s[66],s[69],s[65],s[68],s[64] );
|
||||
d[9] = _mm256_set_epi32( s[79],s[75],s[78],s[74],s[77],s[73],s[76],s[72] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d[10] = _mm256_set_epi32(s[87],s[83],s[86],s[82],s[85],s[81],s[84],s[80]);
|
||||
d[11] = _mm256_set_epi32(s[95],s[91],s[94],s[90],s[93],s[89],s[92],s[88]);
|
||||
|
||||
d[12] = _mm256_set_epi32(s[103],s[99],s[102],s[98],s[101],s[97],s[100],s[96]);
|
||||
d[13] = _mm256_set_epi32(s[111],s[107],s[110],s[106],s[109],s[105],s[108],s[104]);
|
||||
d[14] = _mm256_set_epi32(s[119],s[115],s[118],s[114],s[117],s[113],s[116],s[112]);
|
||||
d[15] = _mm256_set_epi32(s[127],s[123],s[126],s[122],s[125],s[121],s[124],s[120]);
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
// likely of no use.
|
||||
// convert 4x32 byte (128 bit) vectors to 4x64 (256 bit) vectors for AVX2
|
||||
// bit_len must be multiple of 64
|
||||
@@ -1081,35 +1150,70 @@ inline void mm256_reinterleave_4x64( uint64_t *dst, uint32_t *src,
|
||||
int bit_len )
|
||||
{
|
||||
uint32_t *d = (uint32_t*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
for ( int i = 0; i < bit_len >> 5; i += 8 )
|
||||
{
|
||||
*( d + i ) = *( src + i ); // 0 <- 0 8 <- 8
|
||||
*( d + i + 1 ) = *( src + i + 4 ); // 1 <- 4 9 <- 12
|
||||
*( d + i + 2 ) = *( src + i + 1 ); // 2 <- 1 10 <- 9
|
||||
*( d + i + 3 ) = *( src + i + 5 ); // 3 <- 5 11 <- 13
|
||||
*( d + i + 4 ) = *( src + i + 2 ); // 4 <- 2 12 <- 10
|
||||
*( d + i + 5 ) = *( src + i + 6 ); // 5 <- 6 13 <- 14
|
||||
*( d + i + 6 ) = *( src + i + 3 ); // 6 <- 3 14 <- 11
|
||||
*( d + i + 7 ) = *( src + i + 7 ); // 7 <- 7 15 <- 15
|
||||
*( d + i ) = *( s + i ); // 0 <- 0 8 <- 8
|
||||
*( d + i + 1 ) = *( s + i + 4 ); // 1 <- 4 9 <- 12
|
||||
*( d + i + 2 ) = *( s + i + 1 ); // 2 <- 1 10 <- 9
|
||||
*( d + i + 3 ) = *( s + i + 5 ); // 3 <- 5 11 <- 13
|
||||
*( d + i + 4 ) = *( s + i + 2 ); // 4 <- 2 12 <- 10
|
||||
*( d + i + 5 ) = *( s + i + 6 ); // 5 <- 6 13 <- 14
|
||||
*( d + i + 6 ) = *( s + i + 3 ); // 6 <- 3 14 <- 11
|
||||
*( d + i + 7 ) = *( s + i + 7 ); // 7 <- 7 15 <- 15
|
||||
}
|
||||
}
|
||||
|
||||
// convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
|
||||
// bit_len must be multiple of 64
|
||||
inline void mm_reinterleave_4x32( uint32_t *dst, uint64_t *src,
|
||||
int bit_len )
|
||||
inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
|
||||
{
|
||||
__m256i *d = (__m256i*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
|
||||
d[0] = _mm256_set_epi32( s[ 7],s[ 5],s[ 3],s[ 1],s[ 6],s[ 4],s[ 2],s[ 0] );
|
||||
d[1] = _mm256_set_epi32( s[15],s[13],s[11],s[ 9],s[14],s[12],s[10],s[ 8] );
|
||||
d[2] = _mm256_set_epi32( s[23],s[21],s[19],s[17],s[22],s[20],s[18],s[16] );
|
||||
d[3] = _mm256_set_epi32( s[31],s[29],s[27],s[25],s[30],s[28],s[26],s[24] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[4] = _mm256_set_epi32( s[39],s[37],s[35],s[33],s[38],s[36],s[34],s[32] );
|
||||
d[5] = _mm256_set_epi32( s[47],s[45],s[43],s[41],s[46],s[44],s[42],s[40] );
|
||||
d[6] = _mm256_set_epi32( s[55],s[53],s[51],s[49],s[54],s[52],s[50],s[48] );
|
||||
d[7] = _mm256_set_epi32( s[63],s[61],s[59],s[57],s[62],s[60],s[58],s[56] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[8] = _mm256_set_epi32( s[71],s[69],s[67],s[65],s[70],s[68],s[66],s[64] );
|
||||
d[9] = _mm256_set_epi32( s[79],s[77],s[75],s[73],s[78],s[76],s[74],s[72] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d[10] = _mm256_set_epi32( s[87],s[85],s[83],s[81],s[86],s[84],s[82],s[80] );
|
||||
d[11] = _mm256_set_epi32( s[95],s[93],s[91],s[89],s[94],s[92],s[90],s[88] );
|
||||
|
||||
d[12] = _mm256_set_epi32( s[103],s[101],s[99],s[97],s[102],s[100],s[98],s[96] );
|
||||
d[13] = _mm256_set_epi32( s[111],s[109],s[107],s[105],s[110],s[108],s[106],s[104] );
|
||||
d[14] = _mm256_set_epi32( s[119],s[117],s[115],s[113],s[118],s[116],s[114],s[112] );
|
||||
d[15] = _mm256_set_epi32( s[127],s[125],s[123],s[121],s[126],s[124],s[122],s[120] );
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len )
|
||||
{
|
||||
uint32_t *d = (uint32_t*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
for ( int i = 0; i < bit_len >> 5; i +=8 )
|
||||
{
|
||||
*( dst + i ) = *( s + i );
|
||||
*( dst + i + 1 ) = *( s + i + 2 );
|
||||
*( dst + i + 2 ) = *( s + i + 4 );
|
||||
*( dst + i + 3 ) = *( s + i + 6 );
|
||||
*( dst + i + 4 ) = *( s + i + 1 );
|
||||
*( dst + i + 5 ) = *( s + i + 3 );
|
||||
*( dst + i + 6 ) = *( s + i + 5 );
|
||||
*( dst + i + 7 ) = *( s + i + 7 );
|
||||
*( d + i ) = *( s + i );
|
||||
*( d + i + 1 ) = *( s + i + 2 );
|
||||
*( d + i + 2 ) = *( s + i + 4 );
|
||||
*( d + i + 3 ) = *( s + i + 6 );
|
||||
*( d + i + 4 ) = *( s + i + 1 );
|
||||
*( d + i + 5 ) = *( s + i + 3 );
|
||||
*( d + i + 6 ) = *( s + i + 5 );
|
||||
*( d + i + 7 ) = *( s + i + 7 );
|
||||
}
|
||||
}
|
||||
|
||||
|
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.8.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.9.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.7.8'
|
||||
PACKAGE_STRING='cpuminer-opt 3.7.8'
|
||||
PACKAGE_VERSION='3.7.9'
|
||||
PACKAGE_STRING='cpuminer-opt 3.7.9'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.7.8 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.7.9 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1392,7 +1392,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.7.8:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.7.9:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1497,7 +1497,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.7.8
|
||||
cpuminer-opt configure 3.7.9
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.7.8, which was
|
||||
It was created by cpuminer-opt $as_me 3.7.9, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2981,7 +2981,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.7.8'
|
||||
VERSION='3.7.9'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.7.8, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.7.9, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6743,7 +6743,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.7.8
|
||||
cpuminer-opt config.status 3.7.9
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.7.8])
|
||||
AC_INIT([cpuminer-opt], [3.7.9])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
5
miner.h
@@ -546,6 +546,7 @@ enum algos {
        ALGO_X17,
        ALGO_XEVAN,
        ALGO_YESCRYPT,
        ALGO_YESCRYPTR8,
        ALGO_YESCRYPTR16,
        ALGO_ZR5,
        ALGO_COUNT
@@ -617,6 +618,7 @@ static const char* const algo_names[] = {
        "x17",
        "xevan",
        "yescrypt",
        "yescryptr8",
        "yescryptr16",
        "zr5",
        "\0"
@@ -741,8 +743,9 @@ Options:\n\
                          x14           X14\n\
                          x15           X15\n\
                          x17\n\
                          xevan         Bitsend\n\
                          xevan         Bitsend (BSD)\n\
                          yescrypt      Globlboost-Y (BSTY)\n\
                          yescryptr8    BitZeny (ZNY)\n\
                          yescryptr16   Yenten (YTN)\n\
                          zr5           Ziftr\n\
  -o, --url=URL         URL of mining server\n\
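The enum algos and algo_names[] hunks above are parallel arrays: ALGO_YESCRYPTR8 and the "yescryptr8" string are inserted at the same relative position because the option parser maps an --algo name to its enum value by index. A minimal, abridged sketch of that convention (illustrative only, not the actual miner.h tables):

#include <stdio.h>
#include <string.h>

enum algos { ALGO_YESCRYPT, ALGO_YESCRYPTR8, ALGO_YESCRYPTR16, ALGO_COUNT };

static const char* const algo_names[] = { "yescrypt", "yescryptr8", "yescryptr16" };

static int algo_from_name( const char *name )
{
   for ( int i = 0; i < ALGO_COUNT; i++ )
      if ( !strcmp( name, algo_names[i] ) ) return i;
   return -1;   // unknown algo
}

int main()
{
   printf( "yescryptr8 -> %d\n", algo_from_name( "yescryptr8" ) );  // 1 == ALGO_YESCRYPTR8
   return 0;
}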
@@ -31,6 +31,7 @@ CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure $F
make
mv cpuminer.exe release/cpuminer-4way.exe

make clean || echo clean
CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $F
make
strip -s cpuminer.exe