v3.10.3

2025-09-17 23:44:27 +00:00 · 2019-12-14 01:01:54 -05:00
50 changed files with 473 additions and 6978 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -125,7 +125,6 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
  algo/lyra2/sponge-2way.c \
-  algo/lyra2/lyra2-hash-2way.c \
  algo/lyra2/lyra2-gate.c \
  algo/lyra2/lyra2rev2.c \
  algo/lyra2/lyra2rev2-4way.c \
--- a/README.md
+++ b/README.md
@@ -126,11 +126,11 @@ Supported Algorithms
                          x16rv2        Ravencoin (RVN)
                          x16rt         Gincoin (GIN)
                          x16rt-veil    Veil (VEIL)
-                          x16s          Pigeoncoin (PGN)
+                          x16s          
                          x17
-                          x21s
-                          x22i
-                          x25x
+                          x21s          Pigeoncoin (PGN)
+                          x22i          
+                          x25x          Sinovative (SIN)
                          xevan         Bitsend (BSD)
                          yescrypt      Globalboost-Y (BSTY)
                          yescryptr8    BitZeny (ZNY)
--- a/11
+++ b/11
@@ -1,8 +1,6 @@
 cpuminer-opt is a console program run from the command line using the
 keyboard, not the mouse.

-See also README.md for list of supported algorithms,
-
 Security warning
 ----------------

@@ -33,15 +31,6 @@ not supported. FreeBSD YMMV.
 Change Log
 ----------

-v3.10.5
-
-AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2. 
-Faster hmq1725 AVX2.
-
-v3.10.4
-
-AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).
-
 v3.10.3

 AVX512 for x12, x13, x14, x15.
--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -874,57 +874,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
                 mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
                 mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

-#define DH1L( m, sl, sr, a, b, c ) \
-   _mm256_add_epi32( \
-               _mm256_xor_si256( M[m], \
-                  _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
-                                    _mm256_srli_epi32( qt[a], sr ) ) ), \
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
-
-#define DH1R( m, sl, sr, a, b, c ) \
-   _mm256_add_epi32( \
-               _mm256_xor_si256( M[m], \
-                  _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
-                                    _mm256_slli_epi32( qt[a], sr ) ) ), \
-               _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
-
-#define DH2L( m, rl, sl, h, a, b, c ) \
-   _mm256_add_epi32( _mm256_add_epi32( \
-       mm256_rol_32( dH[h], rl ), \
-          _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
-                 _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
-                                   _mm256_xor_si256( qt[b], qt[c] ) ) );
-
-#define DH2R( m, rl, sr, h, a, b, c ) \
-   _mm256_add_epi32( _mm256_add_epi32( \
-       mm256_rol_32( dH[h], rl ), \
-          _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
-                 _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
-                                   _mm256_xor_si256( qt[b], qt[c] ) ) );
-
-   dH[ 0] = DH1L(  0,  5,  5, 16, 24, 0 );
-   dH[ 1] = DH1R(  1,  7,  8, 17, 25, 1 );
-   dH[ 2] = DH1R(  2,  5,  5, 18, 26, 2 );
-   dH[ 3] = DH1R(  3,  1,  5, 19, 27, 3 );
-   dH[ 4] = DH1R(  4,  3,  0, 20, 28, 4 );
-   dH[ 5] = DH1L(  5,  6,  6, 21, 29, 5 );
-   dH[ 6] = DH1R(  6,  4,  6, 22, 30, 6 );
-   dH[ 7] = DH1R(  7, 11,  2, 23, 31, 7 );
-   dH[ 8] = DH2L(  8,  9,  8,  4, 24, 23,  8 );
-   dH[ 9] = DH2R(  9, 10,  6,  5, 25, 16,  9 );
-   dH[10] = DH2L( 10, 11,  6,  6, 26, 17, 10 );
-   dH[11] = DH2L( 11, 12,  4,  7, 27, 18, 11 );
-   dH[12] = DH2R( 12, 13,  3,  0, 28, 19, 12 );
-   dH[13] = DH2R( 13, 14,  4,  1, 29, 20, 13 );
-   dH[14] = DH2R( 14, 15,  7,  2, 30, 21, 14 );
-   dH[15] = DH2R( 15, 16,  2,  3, 31, 22, 15 );
-
-#undef DH1L
-#undef DH1R
-#undef DH2L
-#undef DH2R
-
-/*   
   dH[ 0] = _mm256_add_epi32(
                 _mm256_xor_si256( M[0],
                      _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
@@ -1005,7 +954,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
                 _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
-*/
 }

 static const __m256i final_s8[16] =
--- a/algo/haval/haval-8way-helper.c
+++ b/algo/haval/haval-8way-helper.c
@@ -1,115 +0,0 @@
-/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
-/*
- * Helper code, included (three times !) by HAVAL implementation.
- *
- * TODO: try to merge this with md_helper.c.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#undef SPH_XCAT
-#define SPH_XCAT(a, b)    SPH_XCAT_(a, b)
-#undef SPH_XCAT_
-#define SPH_XCAT_(a, b)   a ## b
-
-static void
-SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_update)
-( haval_8way_context *sc, const void *data, size_t len )
-{
-   __m256i *vdata = (__m256i*)data;
-   unsigned current;
-
-   current = (unsigned)sc->count_low & 127U;
-   while ( len > 0 )
-   {
-      unsigned clen;
-      uint32_t clow, clow2;
-
-      clen = 128U - current;
-      if ( clen > len )
-         clen = len;
-      memcpy_256( sc->buf + (current>>2), vdata, clen>>2 );
-      vdata += clen>>2;
-      current += clen;
-      len -= clen;
-      if ( current == 128U )
-      {
-         DSTATE_8W;
-         IN_PREPARE_8W(sc->buf);
-         RSTATE_8W;
-         SPH_XCAT(CORE_8W, PASSES)(INW_8W);
-         WSTATE_8W;
-         current = 0;
-      }
-      clow = sc->count_low;
-      clow2 = clow + clen;
-      sc->count_low = clow2;
-      if ( clow2 < clow )
-         sc->count_high ++;
-   }
-}
-
-static void
-SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
-                                                void *dst)
-{
-   unsigned current;
-   DSTATE_8W;
-
-   current = (unsigned)sc->count_low & 127UL;
-
-   sc->buf[ current>>2 ] = m256_one_32;
-   current += 4;   
-   RSTATE_8W;
-   if ( current > 116UL )
-   {
-      memset_zero_256( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
-      do
-      {
-         IN_PREPARE_8W(sc->buf);
-         SPH_XCAT(CORE_8W, PASSES)(INW_8W);
-      } while (0);
-      current = 0;
-   }
-
-   uint32_t t1, t2;
-   memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
-   t1 = 0x01 | (PASSES << 3);
-   t2 = sc->olen << 3;
-   sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
-   sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
-   sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
-                                     | (sc->count_low >> 29) );
-   do
-   {
-      IN_PREPARE_8W(sc->buf);
-      SPH_XCAT(CORE_8W, PASSES)(INW_8W);
-   } while (0);
-   WSTATE_8W;
-   haval_8way_out( sc, dst );
-}
--- a/algo/haval/haval-hash-4way.c
+++ b/algo/haval/haval-hash-4way.c
@@ -40,7 +40,7 @@
 #include <string.h>
 #include "haval-hash-4way.h"

-// won't compile with sse4.2, not a problem, it's only used with AVX2 4 way.
+// won't compile with sse4.2
 //#if defined (__SSE4_2__)
 #if defined(__AVX__)

@@ -518,301 +518,6 @@ do { \

 #define INMSG(i)   msg[i]

-#if defined(__AVX2__)
-
-// Haval-256 8 way 32 bit avx2
-
-#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
-   _mm256_xor_si256( x0, \
-       _mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
-                      _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
-                                     _mm256_and_si256( x3, x6 ) ) ) ) \
-
-#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
-   _mm256_xor_si256( \
-      _mm256_and_si256( x2, \
-         _mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
-                        _mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
-                                       _mm256_xor_si256( x6, x0 ) ) ) ), \
-         _mm256_xor_si256( \
-             _mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
-             _mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
-
-#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( \
-    _mm256_and_si256( x3, \
-      _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
-                     _mm256_xor_si256( x6, x0 ) ) ), \
-      _mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
-                                   _mm256_and_si256( x2, x5 ) ), x0 ) )
-
-#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
-  _mm256_xor_si256( \
-     _mm256_xor_si256( \
-        _mm256_and_si256( x3, \
-           _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
-                                         _mm256_or_si256( x4, x6 ) ), x5 ) ), \
-        _mm256_and_si256( x4, \
-           _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
-                          _mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
-     _mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
-
-
-#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
-   _mm256_xor_si256( \
-       _mm256_and_si256( x0, \
-            mm256_not( _mm256_xor_si256( \
-                    _mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
-      _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
-                                    _mm256_and_si256( x2, x5 ) ), \
-                                    _mm256_and_si256( x3, x6 ) ) )
-
-#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F1_8W(x1, x0, x3, x5, x6, x2, x4)
-#define FP3_2_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F2_8W(x4, x2, x1, x0, x5, x3, x6)
-#define FP3_3_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F3_8W(x6, x1, x2, x3, x4, x5, x0)
-
-#define FP4_1_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F1_8W(x2, x6, x1, x4, x5, x3, x0)
-#define FP4_2_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F2_8W(x3, x5, x2, x0, x1, x6, x4)
-#define FP4_3_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F3_8W(x1, x4, x3, x6, x0, x2, x5)
-#define FP4_4_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F4_8W(x6, x4, x0, x5, x2, x1, x3)
-
-#define FP5_1_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F1_8W(x3, x4, x1, x0, x5, x2, x6)
-#define FP5_2_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F2_8W(x6, x2, x1, x0, x3, x4, x5)
-#define FP5_3_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F3_8W(x2, x6, x0, x4, x3, x1, x5)
-#define FP5_4_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F4_8W(x1, x5, x3, x2, x0, x4, x6)
-#define FP5_5_8W(x6, x5, x4, x3, x2, x1, x0) \
-   F5_8W(x2, x5, x0, x6, x4, x3, x1)
-
-#define STEP_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
-do { \
-   __m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
-   x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
-                                      mm256_ror_32( x7, 11 ) ), \
-                       _mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
-} while (0)
-
-#define PASS1_8W(n, in)   do { \
-      unsigned pass_count; \
-      for (pass_count = 0; pass_count < 32; pass_count += 8) { \
-         STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
-            in(pass_count + 0), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
-            in(pass_count + 1), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
-            in(pass_count + 2), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
-            in(pass_count + 3), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
-            in(pass_count + 4), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
-            in(pass_count + 5), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
-            in(pass_count + 6), SPH_C32(0x00000000)); \
-         STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
-            in(pass_count + 7), SPH_C32(0x00000000)); \
-         } \
-   } while (0)
-
-#define PASSG_8W(p, n, in)   do { \
-      unsigned pass_count; \
-      for (pass_count = 0; pass_count < 32; pass_count += 8) { \
-         STEP_8W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
-            in(MP ## p[pass_count + 0]), \
-            RK ## p[pass_count + 0]); \
-         STEP_8W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
-            in(MP ## p[pass_count + 1]), \
-            RK ## p[pass_count + 1]); \
-         STEP_8W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
-            in(MP ## p[pass_count + 2]), \
-            RK ## p[pass_count + 2]); \
-         STEP_8W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
-            in(MP ## p[pass_count + 3]), \
-            RK ## p[pass_count + 3]); \
-         STEP_8W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
-            in(MP ## p[pass_count + 4]), \
-            RK ## p[pass_count + 4]); \
-         STEP_8W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
-            in(MP ## p[pass_count + 5]), \
-            RK ## p[pass_count + 5]); \
-         STEP_8W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
-            in(MP ## p[pass_count + 6]), \
-            RK ## p[pass_count + 6]); \
-         STEP_8W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
-            in(MP ## p[pass_count + 7]), \
-            RK ## p[pass_count + 7]); \
-         } \
-   } while (0)
-
-#define PASS2_8W(n, in)    PASSG_8W(2, n, in)
-#define PASS3_8W(n, in)    PASSG_8W(3, n, in)
-#define PASS4_8W(n, in)    PASSG_8W(4, n, in)
-#define PASS5_8W(n, in)    PASSG_8W(5, n, in)
-
-#define SAVE_STATE_8W \
-   __m256i u0, u1, u2, u3, u4, u5, u6, u7; \
-   do { \
-      u0 = s0; \
-      u1 = s1; \
-      u2 = s2; \
-      u3 = s3; \
-      u4 = s4; \
-      u5 = s5; \
-      u6 = s6; \
-      u7 = s7; \
-   } while (0)
-
-#define UPDATE_STATE_8W \
-do { \
-   s0 = _mm256_add_epi32( s0, u0 ); \
-   s1 = _mm256_add_epi32( s1, u1 ); \
-   s2 = _mm256_add_epi32( s2, u2 ); \
-   s3 = _mm256_add_epi32( s3, u3 ); \
-   s4 = _mm256_add_epi32( s4, u4 ); \
-   s5 = _mm256_add_epi32( s5, u5 ); \
-   s6 = _mm256_add_epi32( s6, u6 ); \
-   s7 = _mm256_add_epi32( s7, u7 ); \
-} while (0)
-
-#define CORE_8W5(in)  do { \
-      SAVE_STATE_8W; \
-      PASS1_8W(5, in); \
-      PASS2_8W(5, in); \
-      PASS3_8W(5, in); \
-      PASS4_8W(5, in); \
-      PASS5_8W(5, in); \
-      UPDATE_STATE_8W; \
-   } while (0)
-
-#define DSTATE_8W   __m256i s0, s1, s2, s3, s4, s5, s6, s7
-
-#define RSTATE_8W \
-do { \
-   s0 = sc->s0; \
-   s1 = sc->s1; \
-   s2 = sc->s2; \
-   s3 = sc->s3; \
-   s4 = sc->s4; \
-   s5 = sc->s5; \
-   s6 = sc->s6; \
-   s7 = sc->s7; \
-} while (0)
-
-#define WSTATE_8W \
-do { \
-   sc->s0 = s0; \
-   sc->s1 = s1; \
-   sc->s2 = s2; \
-   sc->s3 = s3; \
-   sc->s4 = s4; \
-   sc->s5 = s5; \
-   sc->s6 = s6; \
-   sc->s7 = s7; \
-} while (0)
-
-static void
-haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
-{
-   sc->s0 = m256_const1_32( 0x243F6A88UL );
-   sc->s1 = m256_const1_32( 0x85A308D3UL );
-   sc->s2 = m256_const1_32( 0x13198A2EUL );
-   sc->s3 = m256_const1_32( 0x03707344UL );
-   sc->s4 = m256_const1_32( 0xA4093822UL );
-   sc->s5 = m256_const1_32( 0x299F31D0UL );
-   sc->s6 = m256_const1_32( 0x082EFA98UL );
-   sc->s7 = m256_const1_32( 0xEC4E6C89UL );
-   sc->olen = olen;
-   sc->passes = passes;
-   sc->count_high = 0;
-   sc->count_low = 0;
-
-}
-#define IN_PREPARE_8W(indata) const __m256i *const load_ptr_8w = (indata)
-
-#define INW_8W(i)   load_ptr_8w[ i ] 
-
-static void
-haval_8way_out( haval_8way_context *sc, void *dst )
-{
-   __m256i *buf = (__m256i*)dst;
-   DSTATE_8W;
-   RSTATE_8W;
-
-   buf[0] = s0;
-   buf[1] = s1;
-   buf[2] = s2;
-   buf[3] = s3;
-   buf[4] = s4;
-   buf[5] = s5;
-   buf[6] = s6;
-   buf[7] = s7;
-}
-
-#undef PASSES
-#define PASSES   5
-#include "haval-8way-helper.c"
-
-#define API_8W(xxx, y) \
-void \
-haval ## xxx ## _ ## y ## _8way_init(void *cc) \
-{ \
-   haval_8way_init(cc, xxx >> 5, y); \
-} \
- \
-void \
-haval ## xxx ## _ ## y ## _8way_update (void *cc, const void *data, size_t len) \
-{ \
-   haval ## y ## _8way_update(cc, data, len); \
-} \
- \
-void \
-haval ## xxx ## _ ## y ## _8way_close(void *cc, void *dst) \
-{ \
-   haval ## y ## _8way_close(cc, dst); \
-} \
-
-API_8W(256, 5)
-
-#define RVAL_8W \
-do { \
-   s0 = val[0]; \
-   s1 = val[1]; \
-   s2 = val[2]; \
-   s3 = val[3]; \
-   s4 = val[4]; \
-   s5 = val[5]; \
-   s6 = val[6]; \
-   s7 = val[7]; \
-} while (0)
-
-#define WVAL_8W \
-do { \
-   val[0] = s0; \
-   val[1] = s1; \
-   val[2] = s2; \
-   val[3] = s3; \
-   val[4] = s4; \
-   val[5] = s5; \
-   val[6] = s6; \
-   val[7] = s7; \
-} while (0)
-
-#define INMSG_8W(i)   msg[i]
-
-
-
-#endif // AVX2
-
 #ifdef __cplusplus
 }
 #endif	
--- a/algo/haval/haval-hash-4way.h
+++ b/algo/haval/haval-hash-4way.h
@@ -59,7 +59,7 @@
 */

 #ifndef HAVAL_HASH_4WAY_H__
-#define HAVAL_HASH_4WAY_H__ 1
+#define HAVAL_HASH_4WAY_H__

 #if defined(__AVX__)

@@ -84,30 +84,10 @@ typedef haval_4way_context haval256_5_4way_context;

 void haval256_5_4way_init( void *cc );

-void haval256_5_4way_update( void *cc, const void *data, size_t len );
-#define haval256_5_4way haval256_5_4way_update
+void haval256_5_4way( void *cc, const void *data, size_t len );

 void haval256_5_4way_close( void *cc, void *dst );

-#if defined(__AVX2__)
-
-typedef struct {
-   __m256i buf[32];
-   __m256i s0, s1, s2, s3, s4, s5, s6, s7;
-   unsigned olen, passes;
-   uint32_t count_high, count_low;
-} haval_8way_context __attribute__ ((aligned (64)));
-
-typedef haval_8way_context haval256_5_8way_context;
-
-void haval256_5_8way_init( void *cc );
-
-void haval256_5_8way_update( void *cc, const void *data, size_t len );
-
-void haval256_5_8way_close( void *cc, void *dst );
-
-#endif // AVX2
-
 #ifdef __cplusplus
 }
 #endif
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -78,7 +78,8 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev3;
  gate->hash      = (void*)&lyra2rev3_hash;
 #endif
-  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+//  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
  opt_target_factor = 256.0;
  return true;
@@ -94,14 +95,10 @@ bool lyra2rev2_thread_init()
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
-#if defined (LYRA2REV2_8WAY)
-   l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 );   // 2 way
-   init_lyra2rev2_8way_ctx();;
-#elif defined (LYRA2REV2_4WAY)
   l2v2_wholeMatrix = _mm_malloc( size, 64 );
+#if defined (LYRA2REV2_4WAY)
   init_lyra2rev2_4way_ctx();;
 #else
-   l2v2_wholeMatrix = _mm_malloc( size, 64 );
   init_lyra2rev2_ctx();
 #endif
   return l2v2_wholeMatrix;
@@ -109,17 +106,14 @@ bool lyra2rev2_thread_init()

 bool register_lyra2rev2_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV2_8WAY)
-  gate->scanhash  = (void*)&scanhash_lyra2rev2_8way;
-  gate->hash      = (void*)&lyra2rev2_8way_hash;
-#elif defined (LYRA2REV2_4WAY)
+#if defined (LYRA2REV2_4WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev2_4way;
  gate->hash      = (void*)&lyra2rev2_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_lyra2rev2;
  gate->hash      = (void*)&lyra2rev2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  opt_target_factor = 256.0;
  return true;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -5,10 +5,10 @@
 #include <stdint.h>
 #include "lyra2.h"

-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define LYRA2REV3_16WAY 1
-#elif defined(__AVX2__)
+//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+//  #define LYRA2REV3_16WAY 1
+//#elif defined(__AVX2__)
+#if defined(__AVX2__)
  #define LYRA2REV3_8WAY 1
 #elif defined(__SSE2__)
  #define LYRA2REV3_4WAY 1
@@ -50,24 +50,15 @@ bool init_lyra2rev3_ctx();

 //////////////////////////////////

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define LYRA2REV2_8WAY 1
-#elif defined(__AVX2__)
-  #define LYRA2REV2_4WAY 1
+#if defined(__AVX2__)
+  #define LYRA2REV2_4WAY
 #endif

 extern __thread uint64_t* l2v2_wholeMatrix;

 bool register_lyra2rev2_algo( algo_gate_t* gate );

-#if defined(LYRA2REV2_8WAY)
-
-void lyra2rev2_8way_hash( void *state, const void *input );
-int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
-                             uint64_t *hashes_done, struct thr_info *mythr );
-bool init_lyra2rev2_8way_ctx();
-
-#elif defined(LYRA2REV2_4WAY)
+#if defined(LYRA2REV2_4WAY)

 void lyra2rev2_4way_hash( void *state, const void *input );
 int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
--- a/algo/lyra2/lyra2-hash-2way.c
+++ b/algo/lyra2/lyra2-hash-2way.c
@@ -1,695 +0,0 @@
-/**
- * Implementation of the Lyra2 Password Hashing Scheme (PHS).
- *
- * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
- *
- * This software is hereby placed in the public domain.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
- * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
- * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <mm_malloc.h>
-#include "compat.h"
-#include "lyra2.h"
-#include "sponge.h"
-
-//  LYRA2RE 8 cols 8 rows used by lyea2re, allium, phi2, x22i, x25x.
-//
-//  LYRA2REV2 4 cols 4 rows used by lyra2rev2.
-//
-//  LYRA2REV3 4 cols 4 rows with an extra twist in calculating
-//  rowa in the wandering phase. Used by lyra2rev3.
-// 
-//  LYRA2Z various cols & rows and supports 80 input. Used by lyra2z,
-//  lyra2z330, lyra2h, 
-
-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
-/**
- * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
- * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
- * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
- * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
- * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
- *
- * @param K The derived key to be output by the algorithm
- * @param kLen Desired key length
- * @param pwd User password
- * @param pwdlen Password length
- * @param salt Salt
- * @param saltlen Salt length
- * @param timeCost Parameter to determine the processing time (T)
- * @param nRows Number or rows of the memory matrix (R)
- * @param nCols Number of columns of the memory matrix (C)
- *
- * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
- */
-
-// For lyra2rev3.
-// convert a simple offset to an index into interleaved data.
-// good for state and 4 row matrix. 
-// index = ( int( off / 4 ) * 2 ) + ( off mod 4 )
-
-#define offset_to_index( o ) \
-   ( ( ( (uint64_t)( (o) & 0xf) / 4 ) * 8 ) + ( (o) % 4 ) )
-
-
-int LYRA2REV2_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
-             const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
-             const uint64_t nRows, const uint64_t nCols )
-{
-   //====================== Basic variables ============================//
-   uint64_t _ALIGN(256) state[32];
-   int64_t row = 2;
-   int64_t prev = 1;
-   int64_t rowa0 = 0;
-   int64_t rowa1 = 0;
-   int64_t tau; 
-   int64_t step = 1;
-   int64_t window = 2;
-   int64_t gap = 1;
-   //====================================================================/
-
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-
-   // for Lyra2REv2, nCols = 4, v1 was using 8
-   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
-                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
-   uint64_t *ptrWord = wholeMatrix;
-
-   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
-                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
-
-   uint64_t *ptr = wholeMatrix;
-   uint64_t *pw = (uint64_t*)pwd;
-
-   memcpy( ptr, pw, 2*pwdlen ); // password 
-   ptr += pwdlen>>2;
-   memcpy( ptr, pw, 2*pwdlen ); // password lane 1
-   ptr += pwdlen>>2;
-
-   // now build the rest interleaving on the fly.
-
-   ptr[0] = ptr[ 4] = kLen;
-   ptr[1] = ptr[ 5] = pwdlen;
-   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
-   ptr[3] = ptr[ 7] = timeCost;
-   ptr[8] = ptr[12] = nRows;
-   ptr[9] = ptr[13] = nCols;
-   ptr[10] = ptr[14] = 0x80;
-   ptr[11] = ptr[15] = 0x0100000000000000;
-
-   ptrWord = wholeMatrix;
-
-   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
-
-   //Initializes M[0] and M[1]
-   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
-
-   reducedDuplexRow1_2way( state, &wholeMatrix[0],
-                           &wholeMatrix[ 2 * ROW_LEN_INT64 ],  nCols );
-
-   do
-   {
-     //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
-
-     reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
-                                        &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
-                                        &wholeMatrix[ 2* row*ROW_LEN_INT64],
-                                        nCols );
-
-     rowa0 = (rowa0 + step) & (window - 1);
-
-     prev = row;
-     row++;
-
-     if ( rowa0 == 0 )
-     {
-        step = window + gap;
-        window *= 2; 
-        gap = -gap;
-     }
-   } while ( row < nRows );
-
-   //===================== Wandering Phase =============================//
-   row = 0;
-   for ( tau = 1; tau <= timeCost; tau++ )
-   {
-      step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
-      do
-      {
-        rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
-        rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
-
-        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
-                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
-                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
-                                      &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
-                                      nCols );
-         prev = row;
-
-         row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
-
-      } while (row != 0);
-   }
-
-   //===================== Wrap-up Phase ===============================//
-   //Absorbs the last block of the memory matrix
-   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
-                            &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
-   //Squeezes the key
-   squeeze_2way( state, K, (unsigned int) kLen );
-
-   return 0;
-}
-
-// This version is currently only used by REv3 and has some hard coding
-// specific to v3 such as input data size of 32 bytes.
-//
-// Similarly with REv2. Thedifference with REv3 isn't clear and maybe
-// they can be merged.
-//
-// RE is used by RE, allium. The main difference between RE and REv2
-// in the matrix size.
-//
-// Z also needs to support 80 byte input as well as 32 byte, and odd
-// matrix sizes like 330 rows. It is used by lyra2z330, lyra2z, lyra2h.
-
-
-/////////////////////////////////////////////////
-
-// 2 way 256
-// drop salt, salt len arguments, hard code some others.
-// Data is interleaved 2x256.
-
-int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
-                    const void *pwd, uint64_t pwdlen, uint64_t timeCost,
-                    uint64_t nRows, uint64_t nCols )
-
-// hard coded for 32 byte input as well as matrix size.
-// Other required versions include 80 byte input and different block
-// sizez
-
-//int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
-//      const void *pwd, const uint64_t pwdlen, const void *salt,
-//      const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
-//      const uint64_t nCols )
-{
-   //====================== Basic variables ============================//
-   uint64_t _ALIGN(256) state[32];
-   int64_t row = 2; 
-   int64_t prev = 1;
-   int64_t rowa0 = 0;
-   int64_t rowa1 = 0;
-   int64_t tau; 
-   int64_t step = 1;
-   int64_t window = 2;
-   int64_t gap = 1; 
-   uint64_t instance0 = 0;
-   uint64_t instance1 = 0;
-   //====================================================================/
-
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-   const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
-
-   uint64_t *ptrWord = wholeMatrix;
-
-//  2 way 256 rewrite. Salt always == password, and data is interleaved,
-//  need to build in parallel as pw isalready interleaved.
-
-   
-//  {   password,    (64 or 80 bytes)
-//      salt,        (64 or 80 bytes) =  same as password
-//      Klen,        (u64)  = 32 bytes
-//      pwdlen,      (u64)
-//      saltlen,     (u64)
-//      timecost,    (u64)
-//      nrows,       (u64)
-//      ncols,       (u64)
-//      0x80,        (byte)
-//      { 0 .. 0 },
-//      1            (byte)
-//   }
-   
-// input is usually 32 maybe 64, both are aligned to 256 bit vector.
-// 80 byte inpput is not aligned complicating matters for lyra2z.   
-
-   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
-                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
-   
-   uint64_t *ptr = wholeMatrix;
-   uint64_t *pw = (uint64_t*)pwd;
-
-   memcpy( ptr, pw, 2*pwdlen ); // password 
-   ptr += pwdlen>>2;
-   memcpy( ptr, pw, 2*pwdlen ); // password lane 1
-   ptr += pwdlen>>2;
- 
-   // now build the rest interleaving on the fly.
-
-   ptr[0] = ptr[ 4] = kLen;
-   ptr[1] = ptr[ 5] = pwdlen;
-   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
-   ptr[3] = ptr[ 7] = timeCost;
-   ptr[8] = ptr[12] = nRows;
-   ptr[9] = ptr[13] = nCols;
-   ptr[10] = ptr[14] = 0x80;
-   ptr[11] = ptr[15] = 0x0100000000000000;
-
-   ptrWord = wholeMatrix;
-
-   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
-
-   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
-
-   reducedDuplexRow1_2way( state, &wholeMatrix[0],
-                           &wholeMatrix[2*ROW_LEN_INT64],  nCols );
-
-   do
-   {
-
-      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
-                                         &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
-                                         &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
-                                         nCols );
-
-      rowa0 = (rowa0 + step) & (window - 1);
-
-      prev = row;
-      row++;
-
-      if (rowa0 == 0)
-      {
-         step = window + gap; //changes the step: approximately doubles its value
-         window *= 2; //doubles the size of the re-visitation window
-         gap = -gap; //inverts the modifier to the step
-      }
-
-   } while (row < nRows);
-
-   row = 0;
-   for (tau = 1; tau <= timeCost; tau++)
-   {
-      step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
-      do
-      {
-        instance0 = state[ offset_to_index( instance0 ) ];
-        instance1 = (&state[4])[ offset_to_index( instance1 ) ];
-
-        rowa0 = state[ offset_to_index( instance0 )  ]
-                & (unsigned int)(nRows-1);
-        rowa1 = (state+4)[ offset_to_index( instance1 ) ]
-                & (unsigned int)(nRows-1);
-
-        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
-                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
-                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
-                                      &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
-                                      nCols );
-
-        prev = row;
-        row = (row + step) & (unsigned int)(nRows-1); 
-
-       } while ( row != 0 );
-   }
-
-   absorbBlock_2way( state, &wholeMatrix[2*rowa0*ROW_LEN_INT64],
-                            &wholeMatrix[2*rowa1*ROW_LEN_INT64] );
-
-   squeeze_2way( state, K, (unsigned int) kLen );
-
-   return 0;
-}
-
-#endif // AVX512
-
-#if 0
-
-//////////////////////////////////////////////////
-int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
-            const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
-            const uint64_t timeCost, const uint64_t nRows,
-            const uint64_t nCols )
-{
-    //========================== Basic variables ============================//
-    uint64_t _ALIGN(256) state[16];
-    int64_t row = 2; //index of row to be processed
-    int64_t prev = 1; //index of prev (last row ever computed/modified)
-    int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
-    int64_t tau; //Time Loop iterator
-    int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
-    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
-    int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-//    int64_t i; //auxiliary iteration counter
-    //=======================================================================/
-
-    //======= Initializing the Memory Matrix and pointers to it =============//
-    //Tries to allocate enough space for the whole memory matrix
-
-    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-//    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-//    memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
-
-    //==== Getting the password + salt + basil padded with 10*1 ============//
-    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
-    //but this ensures that the password copied locally will be overwritten as soon as possible
-
-    //First, we clean enough blocks for the password, salt, basil and padding
-    uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
-                       sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
-    byte *ptrByte = (byte*) wholeMatrix;
-    memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
-
-    //Prepends the password
-    memcpy(ptrByte, pwd, pwdlen);
-    ptrByte += pwdlen;
-
-    //Concatenates the salt
-    memcpy(ptrByte, salt, saltlen);
-    ptrByte += saltlen;
-    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
-    memcpy(ptrByte, &kLen, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
-    memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
-    memcpy(ptrByte, &saltlen, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
-    memcpy(ptrByte, &timeCost, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
-    memcpy(ptrByte, &nRows, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
-    memcpy(ptrByte, &nCols, sizeof (uint64_t));
-    ptrByte += sizeof (uint64_t);
-
-    //Now comes the padding
-    *ptrByte = 0x80; //first byte of padding: right after the password
-    ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
-    ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
-    *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
-
-    //=================== Initializing the Sponge State ====================//
-    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
-//        uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
-//        if (state == NULL) {
-//                return -1;
-//        }
-//    initState( state );
-
-    //============================== Setup Phase =============================//
-    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
-    uint64_t *ptrWord = wholeMatrix;
-
-    absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
-                           BLOCK_LEN_BLAKE2_SAFE_INT64 );
-/*
-    for ( i = 0; i < nBlocksInput; i++ )
-    {
-      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
-      ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
-    }
-*/
-    //Initializes M[0] and M[1]
-        reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
-        reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
-
-        do {
-                //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
-                reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
-
-                //updates the value of row* (deterministically picked during Setup))
-                rowa = (rowa + step) & (window - 1);
-                //update prev: it now points to the last row ever computed
-                prev = row;
-                //updates row: goes to the next row to be computed
-                row++;
-
-                //Checks if all rows in the window where visited.
-                if (rowa == 0) {
-                        step = window + gap; //changes the step: approximately doubles its value
-                        window *= 2; //doubles the size of the re-visitation window
-                        gap = -gap; //inverts the modifier to the step
-                }
-
-        } while (row < nRows);
-
-    //======================== Wandering Phase =============================//
-    row = 0; //Resets the visitation to the first row of the memory matrix
-    for ( tau = 1; tau <= timeCost; tau++ )
-    {
-        //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
-        step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
-        do {
-        //Selects a pseudorandom index row*
-        //----------------------------------------------------------------------
-        //rowa = ((unsigned int)state[0]) & (nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
-        rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
-        //-----------------------------------------------------------------
-
-        //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
-                reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
-
-        //update prev: it now points to the last row ever computed
-        prev = row;
-
-        //updates row: goes to the next row to be computed
-        //---------------------------------------------------------------
-        //row = (row + step) & (nRows-1);       //(USE THIS IF nRows IS A POWER OF 2)
-        row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
-        //--------------------------------------------------------------------
-
-      } while (row != 0);
-    }
-
-    //========================= Wrap-up Phase ===============================//
-    //Absorbs the last block of the memory matrix
-    absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
-
-    //Squeezes the key
-    squeeze( state, K, kLen );
-
-    return 0;
-}
-
-#endif
-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
-// Lyra2RE doesn't like the new wholeMatrix implementation
-int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
-                  const uint64_t pwdlen, const uint64_t timeCost,
-                  const uint64_t nRows, const uint64_t nCols )
-{
-   //====================== Basic variables ============================//
-   uint64_t _ALIGN(256) state[16];
-   int64_t row = 2; //index of row to be processed
-   int64_t prev = 1; //index of prev (last row ever computed/modified)
-   int64_t rowa0 = 0;
-   int64_t rowa1 = 0;
-   int64_t tau; //Time Loop iterator
-   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
-   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
-   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-   int64_t i; //auxiliary iteration counter
-   //====================================================================/
-
-   //=== Initializing the Memory Matrix and pointers to it =============//
-   //Tries to allocate enough space for the whole memory matrix
-
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-   // for Lyra2REv2, nCols = 4, v1 was using 8
-   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
-                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
-
-   i = (int64_t)ROW_LEN_BYTES * nRows;
-   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
-   if (wholeMatrix == NULL)
-      return -1;
-
-#if defined(__AVX2__)
-   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
-#elif defined(__SSE2__)
-   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );   
-#else
-   memset( wholeMatrix, 0, i );
-#endif
-
-   uint64_t *ptrWord = wholeMatrix;
-   uint64_t *pw = (uint64_t*)pwd;
-
-   //=== Getting the password + salt + basil padded with 10*1 ==========//
-   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
-   //but this ensures that the password copied locally will be overwritten as soon as possible
-
-   //First, we clean enough blocks for the password, salt, basil and padding
-   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
-                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
-
-   uint64_t *ptr = wholeMatrix;
-
-   memcpy( ptr, pw, 2*pwdlen ); // password 
-   ptr += pwdlen>>2;
-   memcpy( ptr, pw, 2*pwdlen ); // password lane 1
-   ptr += pwdlen>>2;
-
-   // now build the rest interleaving on the fly.
-
-   ptr[0] = ptr[ 4] = kLen;
-   ptr[1] = ptr[ 5] = pwdlen;
-   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
-   ptr[3] = ptr[ 7] = timeCost;
-   ptr[8] = ptr[12] = nRows;
-   ptr[9] = ptr[13] = nCols;
-   ptr[10] = ptr[14] = 0x80;
-   ptr[11] = ptr[15] = 0x0100000000000000;
-
-   
-/*   
-   byte *ptrByte = (byte*) wholeMatrix;
-
-   //Prepends the password
-   memcpy(ptrByte, pwd, pwdlen);
-   ptrByte += pwdlen;
-
-   //Concatenates the salt
-   memcpy(ptrByte, salt, saltlen);
-   ptrByte += saltlen;
-
-//   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
-//                       - (saltlen + pwdlen) );
-
-   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
-   memcpy(ptrByte, &kLen, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-   v64 = pwdlen;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-   v64 = saltlen;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-   v64 = timeCost;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-   v64 = nRows;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-   v64 = nCols;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
-   ptrByte += sizeof(uint64_t);
-
-   //Now comes the padding
-   *ptrByte = 0x80; //first byte of padding: right after the password
-   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
-   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
-   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
-
-   //================= Initializing the Sponge State ====================//
-   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
-
-//   initState( state );
-
-   //========================= Setup Phase =============================//
-   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
-
-   ptrWord = wholeMatrix;
-
-*/
-
-   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
-/*
-   for (i = 0; i < nBlocksInput; i++)
-   {
-       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
-       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
-   }
-*/
-   //Initializes M[0] and M[1]
-   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
-
-   reducedDuplexRow1_2way( state, &wholeMatrix[0],
-                                  &wholeMatrix[ 2 * ROW_LEN_INT64], nCols );
-
-   do
-   {
-      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
-
-      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
-                                         &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
-                                         &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
-                                         nCols );
-
-      //updates the value of row* (deterministically picked during Setup))
-      rowa0 = (rowa0 + step) & (window - 1);
-      //update prev: it now points to the last row ever computed
-
-      prev = row;
-      //updates row: goes to the next row to be computed
-      row++;
-
-      //Checks if all rows in the window where visited.
-      if (rowa0 == 0)
-      {
-         step = window + gap; //changes the step: approximately doubles its value
-         window *= 2; //doubles the size of the re-visitation window
-         gap = -gap; //inverts the modifier to the step
-      }
-
-   } while (row < nRows);
-
-   //===================== Wandering Phase =============================//
-   row = 0; //Resets the visitation to the first row of the memory matrix
-   for (tau = 1; tau <= timeCost; tau++)
-   {
-      step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
-      do
-      {
-        rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
-        rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
-
-        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
-                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
-                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
-                                      &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
-                                      nCols );
-
-           //update prev: it now points to the last row ever computed
-           prev = row;
-
-           //updates row: goes to the next row to be computed
-           //----------------------------------------------------
-           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
-           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
-           //----------------------------------------------------
-
-       } while (row != 0);
-   }
-
-   //===================== Wrap-up Phase ===============================//
-   //Absorbs the last block of the memory matrix
-   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
-                            &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
-   //Squeezes the key
-   squeeze_2way( state, K, (unsigned int) kLen );
-
-   //================== Freeing the memory =============================//
-   _mm_free(wholeMatrix);
-
-   return 0;
-}
-
-#endif
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -327,6 +327,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,

   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
                      nCols);
+
   do
   {
      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -62,12 +62,12 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-
-int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
-        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
-
 int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
-        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+               uint64_t pwdlen, const void *salt, uint64_t saltlen,
+               uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
+//int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
+//        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );

 #endif

--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -1,150 +1,13 @@
 #include "lyra2-gate.h"
 #include <memory.h>
+
+#if defined (LYRA2REV2_4WAY)	
+
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 
-#include "algo/cubehash/cube-hash-2way.h"
-
-#if defined (LYRA2REV2_8WAY)
-
-typedef struct {
-   blake256_8way_context     blake;
-   keccak256_8way_context    keccak;
-   cube_4way_context          cube;
-   skein256_8way_context     skein;
-   bmw256_8way_context          bmw;
-} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
-
-static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
-
-bool init_lyra2rev2_8way_ctx()
-{
-   keccak256_8way_init( &l2v2_8way_ctx.keccak );
-   cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 );
-   skein256_8way_init( &l2v2_8way_ctx.skein );
-   bmw256_8way_init( &l2v2_8way_ctx.bmw );
-   return true;
-}
-
-void lyra2rev2_8way_hash( void *state, const void *input )
-{
-   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
-   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
-   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
-   uint32_t hash0[8] __attribute__ ((aligned (64)));
-   uint32_t hash1[8] __attribute__ ((aligned (64)));
-   uint32_t hash2[8] __attribute__ ((aligned (64)));
-   uint32_t hash3[8] __attribute__ ((aligned (64)));
-   uint32_t hash4[8] __attribute__ ((aligned (64)));
-   uint32_t hash5[8] __attribute__ ((aligned (64)));
-   uint32_t hash6[8] __attribute__ ((aligned (64)));
-   uint32_t hash7[8] __attribute__ ((aligned (64)));
-   lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
-   memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
-
-   blake256_8way( &ctx.blake, input + (64<<3), 16 );
-   blake256_8way_close( &ctx.blake, vhash );
-
-   rintrlv_8x32_8x64( vhashA, vhash, 256 );
-
-   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
-   keccak256_8way_close( &ctx.keccak, vhash );
-
-   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
-
-   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
-
-   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
-   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
-
-   intrlv_2x256( vhash, hash0, hash1, 256 );
-   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash0, hash1, vhash, 256 );
-   intrlv_2x256( vhash, hash2, hash3, 256 );
-   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash2, hash3, vhash, 256 );
-   intrlv_2x256( vhash, hash4, hash5, 256 );
-   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash4, hash5, vhash, 256 );
-   intrlv_2x256( vhash, hash6, hash7, 256 );
-   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash6, hash7, vhash, 256 );
-
-   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                hash7, 256 );
-
-   skein256_8way_update( &ctx.skein, vhash, 32 );
-   skein256_8way_close( &ctx.skein, vhash );
-
-   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
-
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
-   
-   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
-   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
-
-   intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, 
-                hash7, 256 );
-
-   bmw256_8way_update( &ctx.bmw, vhash, 32 );
-   bmw256_8way_close( &ctx.bmw, state );
-}
-
-int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
-                             uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t hash[8*8] __attribute__ ((aligned (128)));
-   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<3]);
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 8;
-   uint32_t n = first_nonce;
-   const uint32_t Htarg = ptarget[7];
-   __m256i *noncev = (__m256i*)vdata + 19;   // aligned
-   int thr_id = mythr->id; 
-
-   if ( opt_benchmark )
-      ( (uint32_t*)ptarget )[7] = 0x0000ff;
-
-   mm256_bswap32_intrlv80_8x32( vdata, pdata );
-
-   blake256_8way_init( &l2v2_8way_ctx.blake );
-   blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
-
-   do
-   {
-      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
-                                                  n+3, n+2, n+1, n ) );
-
-      lyra2rev2_8way_hash( hash, vdata );
-      pdata[19] = n;
-
-      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
-      {
-         extr_lane_8x32( lane_hash, hash, lane, 256 );
-         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-         {
-            pdata[19] = n + lane;
-            submit_lane_solution( work, lane_hash, mythr, lane );
-         }
-      }
-      n += 8;
-   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-
-#elif defined (LYRA2REV2_4WAY)

 typedef struct {
   blake256_4way_context     blake;
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -4,180 +4,8 @@
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 
-#include "algo/cubehash/cube-hash-2way.h"

-#if defined (LYRA2REV3_16WAY)
-
-typedef struct {
-   blake256_16way_context     blake;
-   cube_4way_context          cube;
-   bmw256_16way_context       bmw;
-} lyra2v3_16way_ctx_holder;
-
-static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
-
-bool init_lyra2rev3_16way_ctx()
-{
-   blake256_16way_init( &l2v3_16way_ctx.blake );
-   cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
-   bmw256_16way_init( &l2v3_16way_ctx.bmw );
-   return true;
-}
-
-void lyra2rev3_16way_hash( void *state, const void *input )
-{
-   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
-   uint32_t hash0[8] __attribute__ ((aligned (64)));
-   uint32_t hash1[8] __attribute__ ((aligned (64)));
-   uint32_t hash2[8] __attribute__ ((aligned (64)));
-   uint32_t hash3[8] __attribute__ ((aligned (64)));
-   uint32_t hash4[8] __attribute__ ((aligned (64)));
-   uint32_t hash5[8] __attribute__ ((aligned (64)));
-   uint32_t hash6[8] __attribute__ ((aligned (64)));
-   uint32_t hash7[8] __attribute__ ((aligned (64)));
-   uint32_t hash8[8] __attribute__ ((aligned (64)));
-   uint32_t hash9[8] __attribute__ ((aligned (64)));
-   uint32_t hash10[8] __attribute__ ((aligned (64)));
-   uint32_t hash11[8] __attribute__ ((aligned (64)));
-   uint32_t hash12[8] __attribute__ ((aligned (64)));
-   uint32_t hash13[8] __attribute__ ((aligned (64)));
-   uint32_t hash14[8] __attribute__ ((aligned (64)));
-   uint32_t hash15[8] __attribute__ ((aligned (64)));
-   lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
-   memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
-
-   blake256_16way_update( &ctx.blake, input + (64*16), 16 );
-   blake256_16way_close( &ctx.blake, vhash );
-
-   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-           hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
-           vhash, 256 );
-
-   intrlv_2x256( vhash, hash0, hash1, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash0, hash1, vhash, 256 );
-   intrlv_2x256( vhash, hash2, hash3, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash2, hash3, vhash, 256 );
-   intrlv_2x256( vhash, hash4, hash5, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash4, hash5, vhash, 256 );
-   intrlv_2x256( vhash, hash6, hash7, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash6, hash7, vhash, 256 );
-   intrlv_2x256( vhash, hash8, hash9, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash8, hash9, vhash, 256 );
-   intrlv_2x256( vhash, hash10, hash11, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash10, hash11, vhash, 256 );
-   intrlv_2x256( vhash, hash12, hash13, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash12, hash13, vhash, 256 );
-   intrlv_2x256( vhash, hash14, hash15, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash14, hash15, vhash, 256 );
-
-   intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
-   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
-   dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
-   intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
-   dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
-   intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
-   dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
-   intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
-   dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );
-
-   intrlv_2x256( vhash, hash0, hash1, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash0, hash1, vhash, 256 );
-   intrlv_2x256( vhash, hash2, hash3, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash2, hash3, vhash, 256 );
-   intrlv_2x256( vhash, hash4, hash5, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash4, hash5, vhash, 256 );
-   intrlv_2x256( vhash, hash6, hash7, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash6, hash7, vhash, 256 );
-   intrlv_2x256( vhash, hash8, hash9, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash8, hash9, vhash, 256 );
-   intrlv_2x256( vhash, hash10, hash11, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash10, hash11, vhash, 256 );
-   intrlv_2x256( vhash, hash12, hash13, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash12, hash13, vhash, 256 );
-   intrlv_2x256( vhash, hash14, hash15, 256 );
-   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
-   dintrlv_2x256( hash14, hash15, vhash, 256 );
-
-   intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-             hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
-             hash15, 256 );
-
-   bmw256_16way_update( &ctx.bmw, vhash, 32 );
-   bmw256_16way_close( &ctx.bmw, state );
-}
-
-
-int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
-                             uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t hash[8*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &hash[7<<4];
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *pdata = work->data;
-   const uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   uint32_t n = first_nonce;
-   const uint32_t last_nonce = max_nonce - 16;
-   const uint32_t Htarg = ptarget[7];
-   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
-   const int thr_id = mythr->id;
-
-   if ( opt_benchmark )  ( (uint32_t*)ptarget )[7] = 0x0000ff;
-
-   mm512_bswap32_intrlv80_16x32( vdata, pdata );
-
-   blake256_16way_init( &l2v3_16way_ctx.blake );
-   blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
-
-   do
-   {
-      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
-                                                  n+11, n+10, n+ 9, n+ 8,
-                                                  n+ 7, n+ 6, n+ 5, n+ 4,
-                                                  n+ 3, n+ 2, n+ 1, n ) );
-
-      lyra2rev3_16way_hash( hash, vdata );
-      pdata[19] = n;
-
-      for ( int lane = 0; lane < 16; lane++ )
-      if ( unlikely( hash7[lane] <= Htarg ) )
-      {
-         extr_lane_16x32( lane_hash, hash, lane, 256 );
-         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
-         {
-             pdata[19] = n + lane;
-             submit_lane_solution( work, lane_hash, mythr, lane );
-         }
-      }
-      n += 16;
-   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-
-#elif defined (LYRA2REV3_8WAY)
+#if defined (LYRA2REV3_8WAY)

 typedef struct {
   blake256_8way_context     blake;
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -19,7 +19,7 @@
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-//#include "algo-gate.h"
+#include "algo-gate-api.h"
 #include <string.h>
 #include <stdio.h>
 #include <time.h>
@@ -27,7 +27,8 @@
 #include "sponge.h"
 #include "lyra2.h"

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if 0
+//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
 {
@@ -40,26 +41,19 @@ inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
    //Squeezes full blocks
    for ( i = 0; i < fullBlocks; i++ )
    {
-       memcpy_512( out, state, BLOCK_LEN_M256I );
-       LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
-       out += BLOCK_LEN_M256I;
+       memcpy_512( out, state, BLOCK_LEN_M256I*2 );
+       LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] );
+       out += BLOCK_LEN_M256I*2;
    }
    //Squeezes remaining bytes
-    memcpy_512( out, state, len_m256i % BLOCK_LEN_M256I );
+    memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
 }

-inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
-                                               const uint64_t *In1 ) 
+inline void absorbBlock_2way( uint64_t *State, const uint64_t *In ) 
 {
    register __m512i state0, state1, state2, state3;
-    __m512i in[3];
-    casti_m256i( in, 0 ) = casti_m256i( In0, 0 );
-    casti_m256i( in, 1 ) = casti_m256i( In1, 1 );
-    casti_m256i( in, 2 ) = casti_m256i( In0, 2 );
-    casti_m256i( in, 3 ) = casti_m256i( In1, 3 );
-    casti_m256i( in, 4 ) = casti_m256i( In0, 4 );
-    casti_m256i( in, 5 ) = casti_m256i( In1, 5 );
-    
+    __m512i *in = (__m512i*)In;
+
    state0 = _mm512_load_si512( (__m512i*)State     );
    state1 = _mm512_load_si512( (__m512i*)State + 1 );
    state2 = _mm512_load_si512( (__m512i*)State + 2 );
@@ -97,7 +91,7 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
    state1 = _mm512_xor_si512( state1, in[1] );

    LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
-    In += block_len*2;
+    In += block_len * 2;
  }

  _mm512_store_si512( (__m512i*)State,     state0 );
@@ -116,7 +110,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,


    register __m512i state0, state1, state2, state3;
-    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );

    state0 = _mm512_load_si512( (__m512i*)State     );
    state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -133,13 +127,13 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
    {
       _mm_prefetch( out -  9, _MM_HINT_T0 );
       _mm_prefetch( out - 11, _MM_HINT_T0 );
-
+                   
       out[0] = state0;
       out[1] = state1;
       out[2] = state2;

       //Goes to next block (column) that will receive the squeezed data
-       out -= BLOCK_LEN_M256I;
+       out -= BLOCK_LEN_M256I * 2;

       LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
    }
@@ -150,14 +144,15 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
    _mm512_store_si512( (__m512i*)State + 3, state3 );
 }

+// This function has to gather two 256-bit rowIn vectors from
+// non-contiguous memory, which costs extra work and performance.

 inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
                 uint64_t *rowOut, uint64_t nCols )
 {
    int i;
    register __m512i state0, state1, state2, state3;
-    __m512i *in = (__m512i*)rowIn;
-    __m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+    __m512i *in = (__m256i*)rowIn;

    state0 = _mm512_load_si512( (__m512i*)State     );
    state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -177,25 +172,28 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
         out[2] = _mm512_xor_si512( state2, in[2] );

         //Input: next column (i.e., next block in sequence)
-         in += BLOCK_LEN_M256I;
+         in0 += BLOCK_LEN_M256I;
+         in1 += BLOCK_LEN_M256I;
         //Output: goes to previous column
-         out -= BLOCK_LEN_M256I;
+         out -= BLOCK_LEN_M256I * 2;
    }

-    _mm512_store_si512( (__m512i*)State,     state0 );
-    _mm512_store_si512( (__m512i*)State + 1, state1 );
-    _mm512_store_si512( (__m512i*)State + 2, state2 );
-    _mm512_store_si512( (__m512i*)State + 3, state3 );
+    _mm512_store_si256( (__m512i*)State,     state0 );
+    _mm512_store_si256( (__m512i*)State + 1, state1 );
+    _mm512_store_si256( (__m512i*)State + 2, state2 );
+    _mm512_store_si256( (__m512i*)State + 3, state3 );
+   }
 }

 inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
                       uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
 {
    int i;
+
    register __m512i state0, state1, state2, state3;
    __m512i* in    = (__m512i*)rowIn;
    __m512i* inout = (__m512i*)rowInOut;
-    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
    __m512i  t0, t1, t2;

    state0 = _mm512_load_si512( (__m512i*)State     );
@@ -212,7 +210,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
       state2 = _mm512_xor_si512( state2,
                                  _mm512_add_epi64( in[2], inout[2] ) );

-       LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+       LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 );

       out[0] = _mm512_xor_si512( state0, in[0] );
       out[1] = _mm512_xor_si512( state1, in[1] );
@@ -224,18 +222,17 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
       t2 = _mm512_permutex_epi64( state2, 0x93 );

       inout[0] = _mm512_xor_si512( inout[0],
-                                 _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
+                                 _mm512_mask_blend_epi32( t0, t2, 0x03 ) );
       inout[1] = _mm512_xor_si512( inout[1],
-                                 _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
+                                 _mm512_mask_blend_epi32( t1, t0, 0x03 ) );
       inout[2] = _mm512_xor_si512( inout[2],
-                                 _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
-
+                                 _mm512_mask_blend_epi32( t2, t1, 0x03 ) );

       //Inputs: next column (i.e., next block in sequence)
-       in    += BLOCK_LEN_M256I;
-       inout += BLOCK_LEN_M256I;
+       in    += BLOCK_LEN_M256I * 2;
+       inout += BLOCK_LEN_M256I * 2;
       //Output: goes to previous column
-       out   -= BLOCK_LEN_M256I;
+       out   -= BLOCK_LEN_M256I * 2;
    }

    _mm512_store_si512( (__m512i*)State,     state0 );
@@ -244,61 +241,49 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
    _mm512_store_si512( (__m512i*)State + 3, state3 );
 }

-// big ugly workaound for pointer aliasing, use a union of pointers.
-// Access matrix using m512i for in and out, m256i for inout
-
-inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
-                            uint64_t *rowInOut0, uint64_t *rowInOut1,
-                            uint64_t *rowOut, uint64_t nCols)
+inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
+                uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
+                uint64_t nCols )
 {
   int i;
-   register __m512i state0, state1, state2, state3;
-   __m512i *in = (__m512i*)rowIn;
-   __m256i *inout0 = (__m256i*)rowInOut0;
-   __m256i *inout1 = (__m256i*)rowInOut1;
-   __m512i *out = (__m512i*)rowOut;
-   __m512i io[3];
-   povly inout;
-   inout.v512 = &io[0];
-    __m512i t0, t1, t2;

+   register __m512i state0, state1, state2, state3;
+    __m256i *in0 = (__m256i*)rowIn0;
+    __m256i *in0 = (__m256i*)rowIn0;
+    __m2512* in    = (__m512i*)rowIn;
+    __m2512* inout = (__m512i*)rowInOut;
+    __m512i* out   = (__m512i*)rowOut;
+    __m512i  t0, t1, t2;
+
+    _mm_prefetch( in0,     _MM_HINT_T0 );
+    _mm_prefetch( in1,     _MM_HINT_T0 );
+    _mm_prefetch( in0 + 2, _MM_HINT_T0 );
+    _mm_prefetch( in1 + 2, _MM_HINT_T0 );
+    _mm_prefetch( in0 + 4, _MM_HINT_T0 );
+    _mm_prefetch( in1 + 4, _MM_HINT_T0 );
+    _mm_prefetch( in0 + 6, _MM_HINT_T0 );
+    _mm_prefetch( in1 + 6, _MM_HINT_T0 );
+   
   state0 = _mm512_load_si512( (__m512i*)State     );
   state1 = _mm512_load_si512( (__m512i*)State + 1 );
   state2 = _mm512_load_si512( (__m512i*)State + 2 );
   state3 = _mm512_load_si512( (__m512i*)State + 3 );
-    
-    _mm_prefetch( in,     _MM_HINT_T0 );
-    _mm_prefetch( inout0,     _MM_HINT_T0 );
-    _mm_prefetch( inout1,     _MM_HINT_T0 );
-    _mm_prefetch( in     + 2, _MM_HINT_T0 );
-    _mm_prefetch( inout0 + 2, _MM_HINT_T0 );
-    _mm_prefetch( inout1 + 2, _MM_HINT_T0 );
-    _mm_prefetch( in     + 4, _MM_HINT_T0 );
-    _mm_prefetch( inout0 + 4, _MM_HINT_T0 );
-    _mm_prefetch( inout1 + 4, _MM_HINT_T0 );
-    _mm_prefetch( in     + 6, _MM_HINT_T0 );
-    _mm_prefetch( inout0 + 6, _MM_HINT_T0 );
-    _mm_prefetch( inout1 + 6, _MM_HINT_T0 );
-
-    
-    for ( i = 0; i < nCols; i++ )
-    {

      //Absorbing "M[prev] [+] M[row*]"
-      inout.v256[0] = inout0[0];
-      inout.v256[1] = inout1[1];
-      inout.v256[2] = inout0[2];
-      inout.v256[3] = inout1[3];
-      inout.v256[4] = inout0[4];
-      inout.v256[5] = inout1[5];

+//         state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] );
+//         state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] );
+//         state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] );
+      t0 = mm512_concat_256( in1[0], in0[0] );
+      t1 = mm512_concat_256( in1[1], in0[1] );
+      t2 = mm512_concat_256( in1[2], in0[2] );
+      
      state0 = _mm512_xor_si512( state0,
-                                 _mm512_add_epi64( in[0], inout.v512[0] ) );
+                                     _mm512_add_epi64( t0, inout[0] ) );
      state1 = _mm512_xor_si512( state1,
-                                 _mm512_add_epi64( in[1], inout.v512[1] ) );
+                                     _mm512_add_epi64( t1, inout[1] ) );
      state2 = _mm512_xor_si512( state2,
-                                 _mm512_add_epi64( in[2], inout.v512[2] ) );
-
+                                     _mm512_add_epi64( t2, inout[2] ) );

      //Applies the reduced-round transformation f to the sponge's state
      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
@@ -308,44 +293,22 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
      out[1] = _mm512_xor_si512( out[1], state1 );
      out[2] = _mm512_xor_si512( out[2], state2 );

-      // if inout is the same row as out it was just overwritten, reload.
-      if ( rowOut == rowInOut0 )
-      {
-         inout.v256[0] = inout0[0];
-         inout.v256[2] = inout0[2];
-         inout.v256[4] = inout0[4];
-      }
-      if ( rowOut == rowInOut1 )
-      {
-         inout.v256[1] = inout1[1];
-         inout.v256[3] = inout1[3];
-         inout.v256[5] = inout1[5];
-      }
-
      //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
      t0 = _mm512_permutex_epi64( state0, 0x93 );
      t1 = _mm512_permutex_epi64( state1, 0x93 );
      t2 = _mm512_permutex_epi64( state2, 0x93 );

-      inout.v512[0] = _mm512_xor_si512( inout.v512[0],
-                                   _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
-      inout.v512[1] = _mm512_xor_si512( inout.v512[1],
-                                   _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
-      inout.v512[2] = _mm512_xor_si512( inout.v512[2],
-                                   _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
-      
-      inout0[0] = inout.v256[0];
-      inout1[1] = inout.v256[1];
-      inout0[2] = inout.v256[2];
-      inout1[3] = inout.v256[3];
-      inout0[4] = inout.v256[4];
-      inout1[5] = inout.v256[5];
+      inout[0] = _mm512_xor_si512( inout[0],
+                                   _mm512_mask_blend_epi32( t0, t2, 0x03 ) );
+      inout[1] = _mm512_xor_si512( inout[1],
+                                   _mm512_mask_blend_epi32( t1, t0, 0x03 ) );
+      inout[2] = _mm512_xor_si512( inout[2],
+                                   _mm512_mask_blend_epi32( t2, t1, 0x03 ) );

       //Goes to next block
-       in     += BLOCK_LEN_M256I;
-       inout0 += BLOCK_LEN_M256I * 2;
-       inout1 += BLOCK_LEN_M256I * 2;
-       out    += BLOCK_LEN_M256I;
+       in    += BLOCK_LEN_M256I * 2;
+       out   += BLOCK_LEN_M256I * 2;
+       inout += BLOCK_LEN_M256I * 2;
   }

   _mm512_store_si512( (__m512i*)State,     state0 );
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -375,10 +375,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
    {
       _mm_prefetch( out -  9, _MM_HINT_T0 );
       _mm_prefetch( out - 11, _MM_HINT_T0 );
-
-//printf("S RSR0 col= %d, out= %x\n",i,out);
-
-
+                   
       out[0] = state0;
       out[1] = state1;
       out[2] = state2;
@@ -709,34 +706,11 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
       out[1] = _mm256_xor_si256( state1, in[1] );
       out[2] = _mm256_xor_si256( state2, in[2] );

-/*
-printf("s duplexsetup col= %d\n",i); 
-uint64_t * o = (uint64_t*)out;
-printf("S out %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
-printf("S out %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
-printf("S out %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
-printf("S out %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
-printf("S out %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
-printf("S out %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
-*/
-
       //M[row*][col] = M[row*][col] XOR rotW(rand)
       t0 = _mm256_permute4x64_epi64( state0, 0x93 );
       t1 = _mm256_permute4x64_epi64( state1, 0x93 );
       t2 = _mm256_permute4x64_epi64( state2, 0x93 );

-/*
-uint64_t *t = (uint64_t*)&t0;
-printf("S t0 %016lx %016lx %016lx %016lx\n",t[0],t[1],t[2],t[3]);
-
-o = (uint64_t*)inout;
-printf("S inout0 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
-printf("S inout0 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
-printf("S inout0 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
-printf("S inout0 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
-printf("S inout0 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
-printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
-*/       
       inout[0] = _mm256_xor_si256( inout[0],
                                    _mm256_blend_epi32( t0, t2, 0x03 ) );
       inout[1] = _mm256_xor_si256( inout[1],
@@ -744,17 +718,7 @@ printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
       inout[2] = _mm256_xor_si256( inout[2],
                                    _mm256_blend_epi32( t2, t1, 0x03 ) );

-/*
-o = (uint64_t*)inout;
-printf("S inout1 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
-printf("S inout1 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
-printf("S inout1 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
-printf("S inout1 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
-printf("S inout1 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
-printf("S inout1 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
-*/
-
-//Inputs: next column (i.e., next block in sequence)
+       //Inputs: next column (i.e., next block in sequence)
       in    += BLOCK_LEN_M256I;
       inout += BLOCK_LEN_M256I;
       //Output: goes to previous column
@@ -985,22 +949,6 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
      _mm_prefetch( inout +  9, _MM_HINT_T0 );
      _mm_prefetch( inout + 11, _MM_HINT_T0 );

-/*
-uint64_t *io = (uint64_t*)inout;
-uint64_t *ii = (uint64_t*)in;
-
-printf("RDRS1 col= %d\n", i);
-printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
-printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
-printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
-printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
-printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]);
-printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]);
-printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]);
-printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]);
-*/
-
-
      //Absorbing "M[prev] [+] M[row*]"
      state0 = _mm256_xor_si256( state0,
                                     _mm256_add_epi64( in[0], inout[0] ) );
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -203,36 +203,24 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-union _povly
-{
-   __m512i *v512;
-   __m256i *v256;
-   uint64_t *u64;
-};
-typedef union _povly povly;
-
 //---- Housekeeping
-void initState_2way( uint64_t State[/*16*/] );
+void initState_2way( uint64_t state[/*16*/] );

 //---- Squeezes
-void squeeze_2way( uint64_t *State, unsigned char *out, unsigned int len );
+void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len );
 void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );

 //---- Absorbs
-void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
-                       const uint64_t *In1 );
-void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
+void absorbBlock_2way( uint64_t *state, const uint64_t *in );
+void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in,
                            const uint64_t nBlocks, const uint64_t block_len );

 //---- Duplexes
-void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
+void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
                             uint64_t *rowOut, uint64_t nCols);
-void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
+void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
                    uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
-
-void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
-                            uint64_t *rowInOut0, uint64_t *rowInOut1,
-                            uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);

 #endif

--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
--- a/algo/quark/hmq1725-gate.c
+++ b/algo/quark/hmq1725-gate.c
@@ -2,10 +2,7 @@

 bool register_hmq1725_algo( algo_gate_t* gate )
 {
-#if defined(HMQ1725_8WAY)
-  gate->scanhash  = (void*)&scanhash_hmq1725_8way;
-  gate->hash      = (void*)&hmq1725_8way_hash;
-#elif defined(HMQ1725_4WAY)
+#if defined(HMQ1725_4WAY)
  gate->scanhash  = (void*)&scanhash_hmq1725_4way;
  gate->hash      = (void*)&hmq1725_4way_hash;
 #else
@@ -13,7 +10,7 @@ bool register_hmq1725_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_hmq1725;
  gate->hash      = (void*)&hmq1725hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  opt_target_factor = 65536.0;
  return true;
 };
--- a/algo/quark/hmq1725-gate.h
+++ b/algo/quark/hmq1725-gate.h
@@ -4,21 +4,13 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define HMQ1725_8WAY 1
-#elif defined(__AVX2__) && defined(__AES__)
-  #define HMQ1725_4WAY 1
+#if defined(__AVX2__) && defined(__AES__)
+//  #define HMQ1725_4WAY 1
 #endif

 bool register_hmq1725_algo( algo_gate_t* gate );

-#if defined(HMQ1725_8WAY)
-
-void hmq1725_8way_hash( void *state, const void *input );
-int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
-                           uint64_t *hashes_done, struct thr_info *mythr );
-
-#elif defined(HMQ1725_4WAY)
+#if defined(HMQ1725_4WAY)

 void hmq1725_4way_hash( void *state, const void *input );
 int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -333,7 +333,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFFFFF)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
-            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -347,7 +346,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFFFF0)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
-            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -361,7 +359,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFFF00)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
-            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -375,7 +372,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFFF000)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
-            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -390,7 +386,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			if (((hash64[7]&0xFFFF0000)==0) && 
 					fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
-            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
@@ -404,7 +399,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 			hmq1725hash(hash64, endiandata);
 			if (fulltest(hash64, ptarget)) {
 				*hashes_done = n - first_nonce + 1;
-            work_set_target_ratio( work, hash64 );
 				return true;
 			}
 		} while (n < max_nonce && !work_restart[thr_id].restart);	
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -63,6 +63,20 @@ void quark_8way_hash( void *state, const void *input )
    bmw512_8way_update( &ctx.bmw, vhash, 64 );
    bmw512_8way_close( &ctx.bmw, vhash );

+// AVX512 cmpeq returns a bit mask instead of a vector mask.
+// This should simplify things, but the logic doesn't seem to be working.
+// The problem appears to be related to the test that skips a hash when it
+// isn't going to be used. Skipping that test for all 8 way hashes seems to
+// have fixed it. The hash selection blending works if the hash is produced,
+// but the hash wasn't being produced when it should have been.
+// Both decisions are based on the same data, the __mmask8. It works
+// as a blend mask but not in a logical comparison; maybe the type is the
+// problem. Maybe a cast to int or movm is needed to make it work.
+// It's now moot because the hash can only be skipped 1 in 256 iterations
+// when hashing 8 ways in parallel.
+// The performance impact of the workaround should be negligible.
+// It's a problem for another day.
+
    vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
                                       zero );

--- a/algo/sha/sha-hash-4way.h
+++ b/algo/sha/sha-hash-4way.h
@@ -56,7 +56,7 @@ typedef struct {
   __m128i val[8];
   uint32_t count_high, count_low;
   bool initialized;
-} sha256_4way_context __attribute__ ((aligned (64)));
+} sha256_4way_context;

 void sha256_4way_init( sha256_4way_context *sc );
 void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
@@ -71,7 +71,7 @@ typedef struct {
   __m256i val[8];
   uint32_t count_high, count_low;
   bool initialized;
-} sha256_8way_context __attribute__ ((aligned (128)));
+} sha256_8way_context;

 void sha256_8way_init( sha256_8way_context *sc );
 void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
@@ -86,32 +86,30 @@ typedef struct {
   __m256i val[8];
   uint64_t count;
   bool initialized;
-} sha512_4way_context __attribute__ ((aligned (128)));
+} sha512_4way_context;

 void sha512_4way_init( sha512_4way_context *sc);
-void sha512_4way_update( sha512_4way_context *sc, const void *data,
-                         size_t len );
-#define sha512_4way sha512_4way_update
+void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
 void sha512_4way_close( sha512_4way_context *sc, void *dst );

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
-// SHA-512 8 way
-
+// SHA-256 11 way hybrid
+// Combines AVX2 (8 lanes), MMX (2 lanes) and scalar (1 lane) data in parallel.
 typedef struct {
-   __m512i buf[128>>3];
-   __m512i val[8];
-   uint64_t count;
-   bool initialized;
-} sha512_8way_context __attribute__ ((aligned (128)));
+   __m256i  bufx[64>>2];
+   __m256i  valx[8];
+   __m64    bufy[64>>2];
+   __m64    valy[8];
+   uint32_t bufz[64>>2];
+   uint32_t valz[8];
+   uint32_t count_high, count_low;
+} sha256_11way_context;

-void sha512_8way_init( sha512_8way_context *sc);
-void sha512_8way_update( sha512_8way_context *sc, const void *data, 
-                         size_t len );
-void sha512_8way_close( sha512_8way_context *sc, void *dst );
+void sha256_11way_init( sha256_11way_context *ctx );
+void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
+	                 const void *datay, const void *dataz, size_t len );
+void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
+	                 void *dstz  );

-
-#endif  // AVX512
 #endif  // __AVX2__
 #endif  // __SSE2__
 #endif  // SHA256_4WAY_H__
--- a/algo/sha/sha512-hash-4way.c
+++ b/algo/sha/sha512-hash-4way.c
@@ -36,6 +36,8 @@
 #include <string.h>
 #include "sha-hash-4way.h"

+// SHA-512 4 way 64 bit
+
 /*
 static const sph_u64 H512[8] = {
        SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
@@ -88,236 +90,6 @@ static const sph_u64 K512[80] = {
 	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
 };

-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
-// SHA-512 8 way 64 bit
-
-#define CH8W(X, Y, Z) \
-   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) 
-
-#define MAJ8W(X, Y, Z) \
-   _mm512_or_si512( _mm512_and_si512( X, Y ), \
-                    _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
-
-#define BSG8W_5_0(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-        mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
-
-#define BSG8W_5_1(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-        mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
-
-#define SSG8W_5_0(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-        mm512_ror_64(x,  1), mm512_ror_64(x,  8) ), _mm512_srli_epi64(x, 7) ) 
-
-#define SSG8W_5_1(x) \
-   _mm512_xor_si512( _mm512_xor_si512( \
-        mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )
-
-static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
-{
-   __m512i w0a, w1a, w0b, w1b;
-   w0a = mm512_ror_64( w0, 1 );
-   w1a = mm512_ror_64( w1,19 );
-   w0b = mm512_ror_64( w0, 8 );
-   w1b = mm512_ror_64( w1,61 );
-   w0a = _mm512_xor_si512( w0a, w0b );
-   w1a = _mm512_xor_si512( w1a, w1b );
-   w0b = _mm512_srli_epi64( w0, 7 );
-   w1b = _mm512_srli_epi64( w1, 6 );
-   w0a = _mm512_xor_si512( w0a, w0b );
-   w1a = _mm512_xor_si512( w1a, w1b );
-   return _mm512_add_epi64( w0a, w1a );
-}
-
-
-#define SSG8W_512x2_0( w0, w1, i ) do \
-{ \
-   __m512i X0a, X1a, X0b, X1b; \
-  X0a = mm512_ror_64( W[i-15], 1 ); \
-  X1a = mm512_ror_64( W[i-14], 1 ); \
-  X0b = mm512_ror_64( W[i-15], 8 ); \
-  X1b = mm512_ror_64( W[i-14], 8 ); \
-  X0a = _mm512_xor_si512( X0a, X0b ); \
-  X1a = _mm512_xor_si512( X1a, X1b ); \
-  X0b = _mm512_srli_epi64( W[i-15], 7 ); \
-  X1b = _mm512_srli_epi64( W[i-14], 7 ); \
-  w0  = _mm512_xor_si512( X0a, X0b ); \
-  w1  = _mm512_xor_si512( X1a, X1b ); \
-} while(0)
-
-#define SSG8W_512x2_1( w0, w1, i ) do \
-{ \
-   __m512i X0a, X1a, X0b, X1b; \
-  X0a = mm512_ror_64( W[i-2],19 ); \
-  X1a = mm512_ror_64( W[i-1],19 ); \
-  X0b = mm512_ror_64( W[i-2],61 ); \
-  X1b = mm512_ror_64( W[i-1],61 ); \
-  X0a = _mm512_xor_si512( X0a, X0b ); \
-  X1a = _mm512_xor_si512( X1a, X1b ); \
-  X0b = _mm512_srli_epi64( W[i-2], 6 ); \
-  X1b = _mm512_srli_epi64( W[i-1], 6 ); \
-  w0  = _mm512_xor_si512( X0a, X0b ); \
-  w1  = _mm512_xor_si512( X1a, X1b ); \
-} while(0)
-
-#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
-do { \
-  __m512i T1, T2; \
-  __m512i K = _mm512_set1_epi64( K512[ i ] ); \
-  T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
-                                           K, W[i] ) ); \
-  T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
-  D  = _mm512_add_epi64( D, T1 ); \
-  H  = _mm512_add_epi64( T1, T2 ); \
-} while (0)
-
-static void
-sha512_8way_round( sha512_8way_context *ctx,  __m512i *in, __m512i r[8] )
-{
-   int i;
-   register __m512i A, B, C, D, E, F, G, H;
-   __m512i W[80];
-
-   mm512_block_bswap_64( W  , in );
-   mm512_block_bswap_64( W+8, in+8 );
-
-   for ( i = 16; i < 80; i++ )
-      W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
-                               _mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
-
-   if ( ctx->initialized )
-   {
-      A = r[0];
-      B = r[1];
-      C = r[2];
-      D = r[3];
-      E = r[4];
-      F = r[5];
-      G = r[6];
-      H = r[7];
-   }
-   else
-   {
-      A = m512_const1_64( 0x6A09E667F3BCC908 );
-      B = m512_const1_64( 0xBB67AE8584CAA73B );
-      C = m512_const1_64( 0x3C6EF372FE94F82B );
-      D = m512_const1_64( 0xA54FF53A5F1D36F1 );
-      E = m512_const1_64( 0x510E527FADE682D1 );
-      F = m512_const1_64( 0x9B05688C2B3E6C1F );
-      G = m512_const1_64( 0x1F83D9ABFB41BD6B );
-      H = m512_const1_64( 0x5BE0CD19137E2179 );
-   }
-
-   for ( i = 0; i < 80; i += 8 )
-   {
-      SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
-      SHA3_8WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
-      SHA3_8WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
-      SHA3_8WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
-      SHA3_8WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
-      SHA3_8WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
-      SHA3_8WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
-      SHA3_8WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
-   }
-
-   if ( ctx->initialized )
-   {
-      r[0] = _mm512_add_epi64( r[0], A );
-      r[1] = _mm512_add_epi64( r[1], B );
-      r[2] = _mm512_add_epi64( r[2], C );
-      r[3] = _mm512_add_epi64( r[3], D );
-      r[4] = _mm512_add_epi64( r[4], E );
-      r[5] = _mm512_add_epi64( r[5], F );
-      r[6] = _mm512_add_epi64( r[6], G );
-      r[7] = _mm512_add_epi64( r[7], H );
-   }
-   else
-   {
-      ctx->initialized = true;
-      r[0] = _mm512_add_epi64( A, m512_const1_64( 0x6A09E667F3BCC908 ) );
-      r[1] = _mm512_add_epi64( B, m512_const1_64( 0xBB67AE8584CAA73B ) );
-      r[2] = _mm512_add_epi64( C, m512_const1_64( 0x3C6EF372FE94F82B ) );
-      r[3] = _mm512_add_epi64( D, m512_const1_64( 0xA54FF53A5F1D36F1 ) );
-      r[4] = _mm512_add_epi64( E, m512_const1_64( 0x510E527FADE682D1 ) );
-      r[5] = _mm512_add_epi64( F, m512_const1_64( 0x9B05688C2B3E6C1F ) );
-      r[6] = _mm512_add_epi64( G, m512_const1_64( 0x1F83D9ABFB41BD6B ) );
-      r[7] = _mm512_add_epi64( H, m512_const1_64( 0x5BE0CD19137E2179 ) );
-   }
-}
-
-void sha512_8way_init( sha512_8way_context *sc )
-{
-   sc->initialized = false;
-   sc->count = 0;
-}
-
-void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len )
-{
-   __m512i *vdata = (__m512i*)data;
-   size_t ptr;
-   const int buf_size = 128;
-
-   ptr = (unsigned)sc->count & (buf_size - 1U);
-   while ( len > 0 )
-   {
-      size_t clen;
-      clen = buf_size - ptr;
-      if ( clen > len )
-         clen = len;
-      memcpy_512( sc->buf + (ptr>>3), vdata, clen>>3 );
-      vdata = vdata + (clen>>3);
-      ptr += clen;
-      len -= clen;
-      if ( ptr == buf_size )
-      {
-         sha512_8way_round( sc, sc->buf, sc->val );
-         ptr = 0;
-      }
-      sc->count += clen;
-   }
-}
-
-void sha512_8way_close( sha512_8way_context *sc, void *dst )
-{
-    unsigned ptr;
-    const int buf_size = 128;
-    const int pad = buf_size - 16;
-    const __m512i shuff_bswap64 = m512_const_64(
-                                    0x38393a3b3c3d3e3f, 0x3031323334353637,
-                                    0x28292a2b2c2d2e2f, 0x2021222324252627,
-                                    0x18191a1b1c1d1e1f, 0x1011121314151617,
-                                    0x08090a0b0c0d0e0f, 0x0001020304050607 );
-
-    ptr = (unsigned)sc->count & (buf_size - 1U);
-    sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
-    ptr += 8;
-    if ( ptr > pad )
-    {
-         memset_zero_512( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
-         sha512_8way_round( sc, sc->buf, sc->val );
-         memset_zero_512( sc->buf, pad >> 3 );
-    }
-    else
-         memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
-
-    sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
-                       _mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
-    sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
-                       _mm512_set1_epi64( sc->count <<  3 ), shuff_bswap64 );
-    sha512_8way_round( sc, sc->buf, sc->val );
-
-    mm512_block_bswap_64( dst, sc->val );
-}
-
-
-#endif   // AVX512
-
-// SHA-512 4 way 64 bit
-
-
 #define CH(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 

@@ -482,7 +254,7 @@ void sha512_4way_init( sha512_4way_context *sc )
   sc->count = 0;
 }

-void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len )
+void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   size_t ptr;
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -3,12 +3,6 @@

 #include <stdio.h>

-// This implementation is deprecated, superseded by VAES in Icelake
-// which provides HW based 4 way aes.
-// It was created for AVX2 to eliminate interleaving between the 
-// preceding and following function.
-// This code can be removed when current users have reverted to one way.
-
 #if defined(__AVX2__)


--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -51,8 +51,6 @@ void init_c11_8way_ctx()
 void c11_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
-     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));     
-     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -109,18 +107,21 @@ void c11_8way_hash( void *state, const void *input )
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );

-     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
+     // Serial
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );

-     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     // 7 Luffa + 8 cube
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
-
-     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
-
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

     // 9 Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -51,8 +51,6 @@ void init_x11_8way_ctx()
 void x11_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
-     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
-     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -110,18 +108,20 @@ void x11_8way_hash( void *state, const void *input )
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );

-     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );

-     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     // Luffa + Cube
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
-
-     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
-
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
--- a/algo/x12/x12-4way.c
+++ b/algo/x12/x12-4way.c
@@ -56,8 +56,6 @@ void init_x12_8way_ctx()
 void x12_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
-     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
-     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -75,18 +73,20 @@ void x12_8way_hash( void *state, const void *input )
     bmw512_8way_update( &ctx.bmw, vhash, 64 );
     bmw512_8way_close( &ctx.bmw, vhash );

-     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
-
-     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+                      hash7, vhash );
+     
+     // Luffa + Cube
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
-
-     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
-
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
--- a/algo/x13/x13-4way.c
+++ b/algo/x13/x13-4way.c
@@ -58,8 +58,6 @@ void init_x13_8way_ctx()
 void x13_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
-     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
-     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -115,18 +113,17 @@ void x13_8way_hash( void *state, const void *input )
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );

-     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
-
-     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     // Luffa + Cube
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
-
-     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
-
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -62,8 +62,6 @@ void init_x14_8way_ctx()
 void x14_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
-     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
-     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -117,18 +115,20 @@ void x14_8way_hash( void *state, const void *input )
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );

-     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                   vhash );

-     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     // Luffa + Cube
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
-
-     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
-     
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -65,9 +65,6 @@ void init_x15_8way_ctx()

 void x15_8way_hash( void *state, const void *input )
 {
-     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
-     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
-     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -76,6 +73,7 @@ void x15_8way_hash( void *state, const void *input )
     uint64_t hash5[8] __attribute__ ((aligned (64)));
     uint64_t hash6[8] __attribute__ ((aligned (64)));
     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*8] __attribute__ ((aligned (64)));
     x15_8way_ctx_holder ctx;
     memcpy( &ctx, &x15_8way_ctx, sizeof(x15_8way_ctx) );

@@ -121,18 +119,17 @@ void x15_8way_hash( void *state, const void *input )
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );

-     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
-
-     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
+     // Luffa + Cube
+     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
+     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
-
-     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
-
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
+     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

     // 9 Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -5,6 +5,9 @@
 * Optimized by JayDDee@github Jan 2018
 */
 #include "x16r-gate.h"
+
+#if defined (X16R_4WAY)
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -17,7 +20,6 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -30,392 +32,6 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

-#if defined (X16R_8WAY)
-
-union _x16r_8way_context_overlay
-{
-    blake512_8way_context   blake;
-    bmw512_8way_context     bmw;
-    hashState_groestl       groestl;
-    skein512_8way_context   skein;
-    jh512_8way_context      jh;
-    keccak512_8way_context  keccak;
-    luffa_4way_context      luffa;
-    cube_4way_context       cube;
-    sph_shavite512_context  shavite;
-    simd_4way_context       simd;
-    hashState_echo          echo;
-    hamsi512_8way_context   hamsi;
-    sph_fugue512_context    fugue;
-    shabal512_8way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_8way_context     sha512;
-} __attribute__ ((aligned (64)));
-
-typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
-
-void x16r_8way_hash( void* output, const void* input )
-{
-   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
-   uint32_t hash0[24] __attribute__ ((aligned (64)));
-   uint32_t hash1[24] __attribute__ ((aligned (64)));
-   uint32_t hash2[24] __attribute__ ((aligned (64)));
-   uint32_t hash3[24] __attribute__ ((aligned (64)));
-   uint32_t hash4[24] __attribute__ ((aligned (64)));
-   uint32_t hash5[24] __attribute__ ((aligned (64)));
-   uint32_t hash6[24] __attribute__ ((aligned (64)));
-   uint32_t hash7[24] __attribute__ ((aligned (64)));
-   x16r_8way_context_overlay ctx;
-   void *in0 = (void*) hash0;
-   void *in1 = (void*) hash1;
-   void *in2 = (void*) hash2;
-   void *in3 = (void*) hash3;
-   void *in4 = (void*) hash4;
-   void *in5 = (void*) hash5;
-   void *in6 = (void*) hash6;
-   void *in7 = (void*) hash7;
-   int size = 80;
-
-   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                 input, 640 );
-
-   for ( int i = 0; i < 16; i++ )
-   {
-      const char elem = hashOrder[i];
-      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
-      switch ( algo )
-      {
-         case BLAKE:
-            blake512_8way_init( &ctx.blake );
-            if ( i == 0 )
-               blake512_8way_update( &ctx.blake, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, 
-                            size<<3 );
-               blake512_8way_update( &ctx.blake, vhash, size );
-            }
-            blake512_8way_close( &ctx.blake, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case BMW:
-            bmw512_8way_init( &ctx.bmw );
-            if ( i == 0 )
-               bmw512_8way_update( &ctx.bmw, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-            bmw512_8way_update( &ctx.bmw, vhash, size );
-            }
-            bmw512_8way_close( &ctx.bmw, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case GROESTL:
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                                 (const char*)in0, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                                 (const char*)in1, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                                 (const char*)in2, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                                 (const char*)in3, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash4,
-                                                 (const char*)in4, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash5,
-                                                 (const char*)in5, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash6,
-                                                 (const char*)in6, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash7,
-                                                 (const char*)in7, size<<3 );
-         break;
-         case SKEIN:
-            skein512_8way_init( &ctx.skein );
-            if ( i == 0 )
-               skein512_8way_update( &ctx.skein, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               skein512_8way_update( &ctx.skein, vhash, size );
-            }
-            skein512_8way_close( &ctx.skein, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case JH:
-            jh512_8way_init( &ctx.jh );
-            if ( i == 0 )
-               jh512_8way_update( &ctx.jh, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, 
-                            size<<3 );
-               jh512_8way_update( &ctx.jh, vhash, size );
-            }
-            jh512_8way_close( &ctx.jh, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case KECCAK:
-            keccak512_8way_init( &ctx.keccak );
-            if ( i == 0 )
-               keccak512_8way_update( &ctx.keccak, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, 
-                            size<<3 );
-               keccak512_8way_update( &ctx.keccak, vhash, size );
-            }
-            keccak512_8way_close( &ctx.keccak, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case LUFFA:
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            luffa_4way_init( &ctx.luffa, 512 );
-            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            luffa_4way_init( &ctx.luffa, 512 );
-            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-         break;
-         case CUBEHASH:
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-         break;
-         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in0, size );
-            sph_shavite512_close( &ctx.shavite, hash0 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in1, size );
-            sph_shavite512_close( &ctx.shavite, hash1 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in2, size );
-            sph_shavite512_close( &ctx.shavite, hash2 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in3, size );
-            sph_shavite512_close( &ctx.shavite, hash3 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in4, size );
-            sph_shavite512_close( &ctx.shavite, hash4 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in5, size );
-            sph_shavite512_close( &ctx.shavite, hash5 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in6, size );
-            sph_shavite512_close( &ctx.shavite, hash6 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in7, size );
-            sph_shavite512_close( &ctx.shavite, hash7 );
-         break;
-         case SIMD:
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            simd_4way_init( &ctx.simd, 512 );
-            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            simd_4way_init( &ctx.simd, 512 );
-            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-         break;
-         case ECHO:
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
-                                (const BitSequence*)in0, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
-                                (const BitSequence*)in1, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
-                                (const BitSequence*)in2, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
-                                (const BitSequence*)in3, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
-                                (const BitSequence*)in4, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
-                                (const BitSequence*)in5, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
-                                (const BitSequence*)in6, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
-                                (const BitSequence*)in7, size<<3 );
-         break;
-         case HAMSI:
-             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-
-             hamsi512_8way_init( &ctx.hamsi );
-             hamsi512_8way_update( &ctx.hamsi, vhash, size );
-             hamsi512_8way_close( &ctx.hamsi, vhash );
-             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-             break;
-         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in0, size );
-             sph_fugue512_close( &ctx.fugue, hash0 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in1, size );
-             sph_fugue512_close( &ctx.fugue, hash1 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in2, size );
-             sph_fugue512_close( &ctx.fugue, hash2 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in3, size );
-             sph_fugue512_close( &ctx.fugue, hash3 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in4, size );
-             sph_fugue512_close( &ctx.fugue, hash4 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in5, size );
-             sph_fugue512_close( &ctx.fugue, hash5 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in6, size );
-             sph_fugue512_close( &ctx.fugue, hash6 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in7, size );
-             sph_fugue512_close( &ctx.fugue, hash7 );
-         break;
-         case SHABAL:
-             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                          size<<3 );
-             shabal512_8way_init( &ctx.shabal );
-             shabal512_8way_update( &ctx.shabal, vhash, size );
-             shabal512_8way_close( &ctx.shabal, vhash );
-             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case WHIRLPOOL:
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in0, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash0 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in1, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash1 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in2, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash2 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in3, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash3 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in4, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash4 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in5, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash5 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in6, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash6 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in7, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash7 );
-         break;
-         case SHA_512:
-             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-             sha512_8way_init( &ctx.sha512 );
-             sha512_8way_update( &ctx.sha512, vhash, size );
-             sha512_8way_close( &ctx.sha512, vhash );
-             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-      }
-      size = 64;
-   }
-
-   memcpy( output,     hash0, 32 );
-   memcpy( output+32,  hash1, 32 );
-   memcpy( output+64,  hash2, 32 );
-   memcpy( output+96,  hash3, 32 );
-   memcpy( output+128, hash4, 32 );
-   memcpy( output+160, hash5, 32 );
-   memcpy( output+192, hash6, 32 );
-   memcpy( output+224, hash7, 32 );
-}
-
-int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr)
-{
-   uint32_t hash[8*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
-   uint32_t bedata1[2] __attribute__((aligned(64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 8;
-   uint32_t n = first_nonce;
-    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-   int thr_id = mythr->id;
-   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-
-   if ( opt_benchmark )
-      ptarget[7] = 0x0cff;
-
-   mm512_bswap32_intrlv80_8x64( vdata, pdata );
-
-   bedata1[0] = bswap_32( pdata[1] );
-   bedata1[1] = bswap_32( pdata[2] );
-   const uint32_t ntime = bswap_32( pdata[17] );
-   if ( s_ntime != ntime )
-   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
-      s_ntime = ntime;
-      if ( opt_debug && !thr_id )
-              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
-   }
-
-   do
-   {
-      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
-
-      x16r_8way_hash( hash, vdata );
-      pdata[19] = n;
-
-      for ( int i = 0; i < 8; i++ )
-      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
-      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
-      {
-         pdata[19] = n+i;
-         submit_lane_solution( work, hash+(i<<3), mythr, i );
-      }
-      n += 8;
-   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
-
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-
-
-#elif defined (X16R_4WAY)
-
 union _x16r_4way_context_overlay
 {
    blake512_4way_context   blake;
@@ -434,16 +50,16 @@ union _x16r_4way_context_overlay
    shabal512_4way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
-} __attribute__ ((aligned (64)));
+};
 typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;

 void x16r_4way_hash( void* output, const void* input )
 {
-   uint32_t vhash[24*4] __attribute__ ((aligned (128)));
   uint32_t hash0[24] __attribute__ ((aligned (64)));
   uint32_t hash1[24] __attribute__ ((aligned (64)));
   uint32_t hash2[24] __attribute__ ((aligned (64)));
   uint32_t hash3[24] __attribute__ ((aligned (64)));
+   uint32_t vhash[24*4] __attribute__ ((aligned (64)));
   x16r_4way_context_overlay ctx;
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
@@ -470,7 +86,7 @@ void x16r_4way_hash( void* output, const void* input )
               blake512_4way( &ctx.blake, vhash, size );
            }
            blake512_4way_close( &ctx.blake, vhash );
-            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case BMW:
            bmw512_4way_init( &ctx.bmw );
@@ -482,7 +98,7 @@ void x16r_4way_hash( void* output, const void* input )
               bmw512_4way( &ctx.bmw, vhash, size );
            }
            bmw512_4way_close( &ctx.bmw, vhash );
-            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case GROESTL:
               init_groestl( &ctx.groestl, 64 );
@@ -508,7 +124,7 @@ void x16r_4way_hash( void* output, const void* input )
               skein512_4way( &ctx.skein, vhash, size );
            }
            skein512_4way_close( &ctx.skein, vhash );
-            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case JH:
            jh512_4way_init( &ctx.jh );
@@ -520,7 +136,7 @@ void x16r_4way_hash( void* output, const void* input )
               jh512_4way( &ctx.jh, vhash, size );
            }
            jh512_4way_close( &ctx.jh, vhash );
-            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case KECCAK:
            keccak512_4way_init( &ctx.keccak );
@@ -532,17 +148,17 @@ void x16r_4way_hash( void* output, const void* input )
               keccak512_4way( &ctx.keccak, vhash, size );
            }
            keccak512_4way_close( &ctx.keccak, vhash );
-            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case LUFFA:
            intrlv_2x128( vhash, in0, in1, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
-            dintrlv_2x128_512( hash0, hash1, vhash );
+            dintrlv_2x128( hash0, hash1, vhash, 512 );
            intrlv_2x128( vhash, in2, in3, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
-            dintrlv_2x128_512( hash2, hash3, vhash );
+            dintrlv_2x128( hash2, hash3, vhash, 512 );
         break;
         case CUBEHASH:
            cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -576,11 +192,11 @@ void x16r_4way_hash( void* output, const void* input )
            intrlv_2x128( vhash, in0, in1, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128_512( hash0, hash1, vhash );
+            dintrlv_2x128( hash0, hash1, vhash, 512 );
            intrlv_2x128( vhash, in2, in3, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128_512( hash2, hash3, vhash );
+            dintrlv_2x128( hash2, hash3, vhash, 512 );
         break;
         case ECHO:
             init_echo( &ctx.echo, 512 );
@@ -601,7 +217,7 @@ void x16r_4way_hash( void* output, const void* input )
             hamsi512_4way_init( &ctx.hamsi );
             hamsi512_4way( &ctx.hamsi, vhash, size );
             hamsi512_4way_close( &ctx.hamsi, vhash );
-             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
+             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
@@ -622,7 +238,7 @@ void x16r_4way_hash( void* output, const void* input )
             shabal512_4way_init( &ctx.shabal );
             shabal512_4way( &ctx.shabal, vhash, size );
             shabal512_4way_close( &ctx.shabal, vhash );
-             dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
+             dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
@@ -643,7 +259,7 @@ void x16r_4way_hash( void* output, const void* input )
             sha512_4way_init( &ctx.sha512 );
             sha512_4way( &ctx.sha512, vhash, size );
             sha512_4way_close( &ctx.sha512, vhash );
-             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
+             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
      }
      size = 64;
@@ -664,7 +280,6 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 4;
   uint32_t n = first_nonce;
    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   int thr_id = mythr->id;
@@ -702,9 +317,9 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;
-   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
+   } while ( likely( ( n < max_nonce ) && !(*restart) ) );

-   *hashes_done = n - first_nonce;
+   *hashes_done = n - first_nonce + 1;
   return 0;
 }

--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -34,17 +34,14 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )

 bool register_x16r_algo( algo_gate_t* gate )
 {
-#if defined (X16R_8WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->hash      = (void*)&x16r_8way_hash;
-#elif defined (X16R_4WAY)
+#if defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -52,17 +49,14 @@ bool register_x16r_algo( algo_gate_t* gate )

 bool register_x16rv2_algo( algo_gate_t* gate )
 {
-#if defined (X16R_8WAY)
-  gate->scanhash  = (void*)&scanhash_x16rv2_8way;
-  gate->hash      = (void*)&x16rv2_8way_hash;
-#elif defined (X16R_4WAY)
+#if defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_4way;
  gate->hash      = (void*)&x16rv2_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rv2;
  gate->hash      = (void*)&x16rv2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -70,17 +64,14 @@ bool register_x16rv2_algo( algo_gate_t* gate )

 bool register_x16s_algo( algo_gate_t* gate )
 {
-#if defined (X16R_8WAY)
-  gate->scanhash  = (void*)&scanhash_x16r_8way;
-  gate->hash      = (void*)&x16r_8way_hash;
-#elif defined (X16R_4WAY)
+#if defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -205,34 +196,28 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )

 bool register_x16rt_algo( algo_gate_t* gate )
 {
-#if defined (X16R_8WAY)
-  gate->scanhash  = (void*)&scanhash_x16rt_8way;
-  gate->hash      = (void*)&x16rt_8way_hash;
-#elif defined (X16R_4WAY)
+#if defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16rt_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16rt_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  opt_target_factor = 256.0;
  return true;
 };

 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
-#if defined (X16R_8WAY)
-  gate->scanhash  = (void*)&scanhash_x16rt_8way;
-  gate->hash      = (void*)&x16rt_8way_hash;
-#elif defined (X16R_4WAY)
+#if defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16rt_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16rt_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->build_extraheader = (void*)&veil_build_extraheader;
  opt_target_factor = 256.0;
  return true;
@@ -246,7 +231,7 @@ bool register_hex_algo( algo_gate_t* gate )
 {
  gate->scanhash        = (void*)&scanhash_hex;
  gate->hash            = (void*)&hex_hash;
-  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  opt_target_factor = 128.0;
  return true;
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -6,10 +6,8 @@
 #include <stdint.h>
 #include <unistd.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X16R_8WAY 1
-#elif defined(__AVX2__) && defined(__AES__)
-  #define X16R_4WAY 1
+#if defined(__AVX2__) && defined(__AES__)
+  #define X16R_4WAY
 #endif

 enum x16r_Algo {
@@ -46,20 +44,7 @@ bool register_x16rt_algo( algo_gate_t* gate );
 bool register_hex__algo( algo_gate_t* gate );
 bool register_x21s__algo( algo_gate_t* gate );

-#if defined(X16R_8WAY)
-
-void x16r_8way_hash( void *state, const void *input );
-int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-
-void x16rv2_8way_hash( void *state, const void *input );
-int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-void x16rt_8way_hash( void *state, const void *input );
-int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-
-#elif defined(X16R_4WAY)
+#if defined(X16R_4WAY)

 void x16r_4way_hash( void *state, const void *input );
 int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
@@ -73,7 +58,12 @@ void x16rt_4way_hash( void *state, const void *input );
 int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

-#else
+void x21s_4way_hash( void *state, const void *input );
+int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+bool x21s_4way_thread_init();
+
+#endif

 void x16r_hash( void *state, const void *input );
 int scanhash_x16r( struct work *work, uint32_t max_nonce,
@@ -87,16 +77,9 @@ void x16rt_hash( void *state, const void *input );
 int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

-#endif
-
-#if defined(X16R_4WAY)
-
-void x21s_4way_hash( void *state, const void *input );
-int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-bool x21s_4way_thread_init();
-
-#else
+void hex_hash( void *state, const void *input );
+int scanhash_hex( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr );

 void x21s_hash( void *state, const void *input );
 int scanhash_x21s( struct work *work, uint32_t max_nonce,
@@ -105,9 +88,3 @@ bool x21s_thread_init();

 #endif

-void hex_hash( void *state, const void *input );
-int scanhash_hex( struct work *work, uint32_t max_nonce,
-                  uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
-
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -1,4 +1,7 @@
 #include "x16r-gate.h"
+
+#if defined (X16R_4WAY)
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -12,7 +15,6 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -24,391 +26,6 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

-#if defined (X16R_8WAY)
-
-union _x16rt_8way_context_overlay
-{
-    blake512_8way_context   blake;
-    bmw512_8way_context     bmw;
-    hashState_groestl       groestl;
-    skein512_8way_context   skein;
-    jh512_8way_context      jh;
-    keccak512_8way_context  keccak;
-    luffa_4way_context      luffa;
-    cube_4way_context       cube;
-    sph_shavite512_context  shavite;
-    simd_4way_context       simd;
-    hashState_echo          echo;
-    hamsi512_8way_context   hamsi;
-    sph_fugue512_context    fugue;
-    shabal512_8way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_8way_context     sha512;
-} __attribute__ ((aligned (64)));
-
-typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay;
-
-void x16rt_8way_hash( void* output, const void* input )
-{
-   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
-   uint32_t hash0[24] __attribute__ ((aligned (64)));
-   uint32_t hash1[24] __attribute__ ((aligned (64)));
-   uint32_t hash2[24] __attribute__ ((aligned (64)));
-   uint32_t hash3[24] __attribute__ ((aligned (64)));
-   uint32_t hash4[24] __attribute__ ((aligned (64)));
-   uint32_t hash5[24] __attribute__ ((aligned (64)));
-   uint32_t hash6[24] __attribute__ ((aligned (64)));
-   uint32_t hash7[24] __attribute__ ((aligned (64)));
-   x16rt_8way_context_overlay ctx;
-   void *in0 = (void*) hash0;
-   void *in1 = (void*) hash1;
-   void *in2 = (void*) hash2;
-   void *in3 = (void*) hash3;
-   void *in4 = (void*) hash4;
-   void *in5 = (void*) hash5;
-   void *in6 = (void*) hash6;
-   void *in7 = (void*) hash7;
-   int size = 80;
-
-   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                 input, 640 );
-
-   for ( int i = 0; i < 16; i++ )
-   {
-      const char elem = hashOrder[i];
-      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
-      switch ( algo )
-      {
-         case BLAKE:
-            blake512_8way_init( &ctx.blake );
-            if ( i == 0 )
-               blake512_8way_update( &ctx.blake, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               blake512_8way_update( &ctx.blake, vhash, size );
-            }
-            blake512_8way_close( &ctx.blake, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case BMW:
-            bmw512_8way_init( &ctx.bmw );
-            if ( i == 0 )
-               bmw512_8way_update( &ctx.bmw, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-            bmw512_8way_update( &ctx.bmw, vhash, size );
-            }
-            bmw512_8way_close( &ctx.bmw, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case GROESTL:
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                                 (const char*)in0, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                                 (const char*)in1, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                                 (const char*)in2, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                                 (const char*)in3, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash4,
-                                                 (const char*)in4, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash5,
-                                                 (const char*)in5, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash6,
-                                                 (const char*)in6, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash7,
-                                                 (const char*)in7, size<<3 );
-         break;
-         case SKEIN:
-            skein512_8way_init( &ctx.skein );
-            if ( i == 0 )
-               skein512_8way_update( &ctx.skein, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               skein512_8way_update( &ctx.skein, vhash, size );
-            }
-            skein512_8way_close( &ctx.skein, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case JH:
-            jh512_8way_init( &ctx.jh );
-            if ( i == 0 )
-               jh512_8way_update( &ctx.jh, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               jh512_8way_update( &ctx.jh, vhash, size );
-            }
-            jh512_8way_close( &ctx.jh, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case KECCAK:
-            keccak512_8way_init( &ctx.keccak );
-            if ( i == 0 )
-               keccak512_8way_update( &ctx.keccak, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               keccak512_8way_update( &ctx.keccak, vhash, size );
-            }
-            keccak512_8way_close( &ctx.keccak, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case LUFFA:
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            luffa_4way_init( &ctx.luffa, 512 );
-            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            luffa_4way_init( &ctx.luffa, 512 );
-            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-         break;
-         case CUBEHASH:
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-         break;
-         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in0, size );
-            sph_shavite512_close( &ctx.shavite, hash0 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in1, size );
-            sph_shavite512_close( &ctx.shavite, hash1 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in2, size );
-            sph_shavite512_close( &ctx.shavite, hash2 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in3, size );
-            sph_shavite512_close( &ctx.shavite, hash3 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in4, size );
-            sph_shavite512_close( &ctx.shavite, hash4 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in5, size );
-            sph_shavite512_close( &ctx.shavite, hash5 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in6, size );
-            sph_shavite512_close( &ctx.shavite, hash6 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in7, size );
-            sph_shavite512_close( &ctx.shavite, hash7 );
-         break;
-         case SIMD:
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            simd_4way_init( &ctx.simd, 512 );
-            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            simd_4way_init( &ctx.simd, 512 );
-            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-         break;
-         case ECHO:
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
-                                (const BitSequence*)in0, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
-                                (const BitSequence*)in1, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
-                                (const BitSequence*)in2, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
-                                (const BitSequence*)in3, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
-                                (const BitSequence*)in4, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
-                                (const BitSequence*)in5, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
-                                (const BitSequence*)in6, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
-                                (const BitSequence*)in7, size<<3 );
-         break;
-         case HAMSI:
-             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-
-             hamsi512_8way_init( &ctx.hamsi );
-             hamsi512_8way_update( &ctx.hamsi, vhash, size );
-             hamsi512_8way_close( &ctx.hamsi, vhash );
-             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-             break;
-         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in0, size );
-             sph_fugue512_close( &ctx.fugue, hash0 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in1, size );
-             sph_fugue512_close( &ctx.fugue, hash1 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in2, size );
-             sph_fugue512_close( &ctx.fugue, hash2 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in3, size );
-             sph_fugue512_close( &ctx.fugue, hash3 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in4, size );
-             sph_fugue512_close( &ctx.fugue, hash4 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in5, size );
-             sph_fugue512_close( &ctx.fugue, hash5 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in6, size );
-             sph_fugue512_close( &ctx.fugue, hash6 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in7, size );
-             sph_fugue512_close( &ctx.fugue, hash7 );
-         break;
-         case SHABAL:
-             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                          size<<3 );
-             shabal512_8way_init( &ctx.shabal );
-             shabal512_8way_update( &ctx.shabal, vhash, size );
-             shabal512_8way_close( &ctx.shabal, vhash );
-             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case WHIRLPOOL:
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in0, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash0 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in1, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash1 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in2, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash2 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in3, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash3 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in4, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash4 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in5, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash5 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in6, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash6 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in7, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash7 );
-         break;
-         case SHA_512:
-             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-             sha512_8way_init( &ctx.sha512 );
-             sha512_8way_update( &ctx.sha512, vhash, size );
-             sha512_8way_close( &ctx.sha512, vhash );
-             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-      }
-      size = 64;
-   }
-
-   memcpy( output,     hash0, 32 );
-   memcpy( output+32,  hash1, 32 );
-   memcpy( output+64,  hash2, 32 );
-   memcpy( output+96,  hash3, 32 );
-   memcpy( output+128, hash4, 32 );
-   memcpy( output+160, hash5, 32 );
-   memcpy( output+192, hash6, 32 );
-   memcpy( output+224, hash7, 32 );
-}
-
-int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr)
-{
-   uint32_t hash[8*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) timeHash[8*8];
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 8;
-   uint32_t n = first_nonce;
-    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-   int thr_id = mythr->id;
-   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-
-   if ( opt_benchmark )
-      ptarget[7] = 0x0cff;
-
-   mm512_bswap32_intrlv80_8x64( vdata, pdata );
-
-   uint32_t ntime = bswap_32( pdata[17] );
-   if ( s_ntime != ntime )
-   {
-      x16rt_getTimeHash( ntime, &timeHash );
-      x16rt_getAlgoString( &timeHash[0], hashOrder );
-      s_ntime = ntime;
-      if ( opt_debug && !thr_id )
-          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
-                               hashOrder, ntime, timeHash );
-   }
-
-   do
-   {
-      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
-
-      x16rt_8way_hash( hash, vdata );
-      pdata[19] = n;
-
-      for ( int i = 0; i < 8; i++ )
-      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
-      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
-      {
-         pdata[19] = n+i;
-         submit_lane_solution( work, hash+(i<<3), mythr, i );
-      }
-      n += 8;
-   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
-
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-
-#elif defined (X16R_4WAY)
-
 union _x16rt_4way_context_overlay
 {
    blake512_4way_context   blake;
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -5,6 +5,9 @@
 * Optimized by JayDDee@github Jan 2018
 */
 #include "x16r-gate.h"
+
+#if defined (X16R_4WAY)
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -18,7 +21,6 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -31,477 +33,6 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };

-#if defined (X16R_8WAY)
-
-union _x16rv2_8way_context_overlay
-{
-    blake512_8way_context   blake;
-    bmw512_8way_context     bmw;
-    hashState_groestl       groestl;
-    skein512_8way_context   skein;
-    jh512_8way_context      jh;
-    keccak512_8way_context  keccak;
-    luffa_4way_context      luffa;
-    cube_4way_context       cube;
-    sph_shavite512_context  shavite;
-    simd_4way_context       simd;
-    hashState_echo          echo;
-    hamsi512_8way_context   hamsi;
-    sph_fugue512_context    fugue;
-    shabal512_8way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_8way_context     sha512;
-    sph_tiger_context       tiger;
-} __attribute__ ((aligned (64)));
-
-typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
-
-void x16rv2_8way_hash( void* output, const void* input )
-{
-   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
-   uint32_t hash0[24] __attribute__ ((aligned (64)));
-   uint32_t hash1[24] __attribute__ ((aligned (64)));
-   uint32_t hash2[24] __attribute__ ((aligned (64)));
-   uint32_t hash3[24] __attribute__ ((aligned (64)));
-   uint32_t hash4[24] __attribute__ ((aligned (64)));
-   uint32_t hash5[24] __attribute__ ((aligned (64)));
-   uint32_t hash6[24] __attribute__ ((aligned (64)));
-   uint32_t hash7[24] __attribute__ ((aligned (64)));
-   x16rv2_8way_context_overlay ctx;
-   void *in0 = (void*) hash0;
-   void *in1 = (void*) hash1;
-   void *in2 = (void*) hash2;
-   void *in3 = (void*) hash3;
-   void *in4 = (void*) hash4;
-   void *in5 = (void*) hash5;
-   void *in6 = (void*) hash6;
-   void *in7 = (void*) hash7;
-   int size = 80;
-
-   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                 input, 640 );
-
-   for ( int i = 0; i < 16; i++ )
-   {
-      const char elem = hashOrder[i];
-      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
-      switch ( algo )
-      {
-         case BLAKE:
-            blake512_8way_init( &ctx.blake );
-            if ( i == 0 )
-               blake512_8way_update( &ctx.blake, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               blake512_8way_update( &ctx.blake, vhash, size );
-            }
-            blake512_8way_close( &ctx.blake, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case BMW:
-            bmw512_8way_init( &ctx.bmw );
-            if ( i == 0 )
-               bmw512_8way_update( &ctx.bmw, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-            bmw512_8way_update( &ctx.bmw, vhash, size );
-            }
-            bmw512_8way_close( &ctx.bmw, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case GROESTL:
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                                 (const char*)in0, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                                 (const char*)in1, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                                 (const char*)in2, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                                 (const char*)in3, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash4,
-                                                 (const char*)in4, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash5,
-                                                 (const char*)in5, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash6,
-                                                 (const char*)in6, size<<3 );
-               init_groestl( &ctx.groestl, 64 );
-               update_and_final_groestl( &ctx.groestl, (char*)hash7,
-                                                 (const char*)in7, size<<3 );
-         break;
-         case SKEIN:
-            skein512_8way_init( &ctx.skein );
-            if ( i == 0 )
-               skein512_8way_update( &ctx.skein, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               skein512_8way_update( &ctx.skein, vhash, size );
-            }
-            skein512_8way_close( &ctx.skein, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case JH:
-            jh512_8way_init( &ctx.jh );
-            if ( i == 0 )
-               jh512_8way_update( &ctx.jh, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               jh512_8way_update( &ctx.jh, vhash, size );
-            }
-            jh512_8way_close( &ctx.jh, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case KECCAK:
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in0, size );
-             sph_tiger_close( &ctx.tiger, hash0 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in1, size );
-             sph_tiger_close( &ctx.tiger, hash1 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in2, size );
-             sph_tiger_close( &ctx.tiger, hash2 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in3, size );
-             sph_tiger_close( &ctx.tiger, hash3 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in4, size );
-             sph_tiger_close( &ctx.tiger, hash4 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in5, size );
-             sph_tiger_close( &ctx.tiger, hash5 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in6, size );
-             sph_tiger_close( &ctx.tiger, hash6 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in7, size );
-             sph_tiger_close( &ctx.tiger, hash7 );
-
-             for ( int i = (24/4); i < (64/4); i++ )
-                hash0[i] = hash1[i] = hash2[i] = hash3[i] =
-                hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
-
-             intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
-                          hash6, hash7 );
-             keccak512_8way_init( &ctx.keccak );
-             keccak512_8way_update( &ctx.keccak, vhash, 64 );
-             keccak512_8way_close( &ctx.keccak, vhash );
-             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case LUFFA:
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in0, size );
-             sph_tiger_close( &ctx.tiger, hash0 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in1, size );
-             sph_tiger_close( &ctx.tiger, hash1 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in2, size );
-             sph_tiger_close( &ctx.tiger, hash2 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in3, size );
-             sph_tiger_close( &ctx.tiger, hash3 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in4, size );
-             sph_tiger_close( &ctx.tiger, hash4 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in5, size );
-             sph_tiger_close( &ctx.tiger, hash5 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in6, size );
-             sph_tiger_close( &ctx.tiger, hash6 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in7, size );
-             sph_tiger_close( &ctx.tiger, hash7 );
-
-             for ( int i = (24/4); i < (64/4); i++ )
-                hash0[i] = hash1[i] = hash2[i] = hash3[i] = 
-                hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
-
-            intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3);
-            luffa_4way_init( &ctx.luffa, 512 );
-            luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7);
-            luffa_4way_init( &ctx.luffa, 512 );
-            luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-         break;
-         case CUBEHASH:
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            cube_4way_init( &ctx.cube, 512, 16, 32 );
-            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-         break;
-         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in0, size );
-            sph_shavite512_close( &ctx.shavite, hash0 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in1, size );
-            sph_shavite512_close( &ctx.shavite, hash1 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in2, size );
-            sph_shavite512_close( &ctx.shavite, hash2 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in3, size );
-            sph_shavite512_close( &ctx.shavite, hash3 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in4, size );
-            sph_shavite512_close( &ctx.shavite, hash4 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in5, size );
-            sph_shavite512_close( &ctx.shavite, hash5 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in6, size );
-            sph_shavite512_close( &ctx.shavite, hash6 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in7, size );
-            sph_shavite512_close( &ctx.shavite, hash7 );
-         break;
-         case SIMD:
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            simd_4way_init( &ctx.simd, 512 );
-            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            simd_4way_init( &ctx.simd, 512 );
-            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-         break;
-         case ECHO:
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
-                                (const BitSequence*)in0, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
-                                (const BitSequence*)in1, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
-                                (const BitSequence*)in2, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
-                                (const BitSequence*)in3, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
-                                (const BitSequence*)in4, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
-                                (const BitSequence*)in5, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
-                                (const BitSequence*)in6, size<<3 );
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
-                                (const BitSequence*)in7, size<<3 );
-         break;
-         case HAMSI:
-             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-
-             hamsi512_8way_init( &ctx.hamsi );
-             hamsi512_8way_update( &ctx.hamsi, vhash, size );
-             hamsi512_8way_close( &ctx.hamsi, vhash );
-             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-             break;
-         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in0, size );
-             sph_fugue512_close( &ctx.fugue, hash0 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in1, size );
-             sph_fugue512_close( &ctx.fugue, hash1 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in2, size );
-             sph_fugue512_close( &ctx.fugue, hash2 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in3, size );
-             sph_fugue512_close( &ctx.fugue, hash3 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in4, size );
-             sph_fugue512_close( &ctx.fugue, hash4 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in5, size );
-             sph_fugue512_close( &ctx.fugue, hash5 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in6, size );
-             sph_fugue512_close( &ctx.fugue, hash6 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in7, size );
-             sph_fugue512_close( &ctx.fugue, hash7 );
-         break;
-         case SHABAL:
-             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                          size<<3 );
-             shabal512_8way_init( &ctx.shabal );
-             shabal512_8way_update( &ctx.shabal, vhash, size );
-             shabal512_8way_close( &ctx.shabal, vhash );
-             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case WHIRLPOOL:
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in0, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash0 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in1, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash1 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in2, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash2 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in3, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash3 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in4, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash4 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in5, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash5 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in6, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash6 );
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in7, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash7 );
-         break;
-         case SHA_512:
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in0, size );
-             sph_tiger_close( &ctx.tiger, hash0 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in1, size );
-             sph_tiger_close( &ctx.tiger, hash1 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in2, size );
-             sph_tiger_close( &ctx.tiger, hash2 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in3, size );
-             sph_tiger_close( &ctx.tiger, hash3 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in4, size );
-             sph_tiger_close( &ctx.tiger, hash4 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in5, size );
-             sph_tiger_close( &ctx.tiger, hash5 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in6, size );
-             sph_tiger_close( &ctx.tiger, hash6 );
-             sph_tiger_init( &ctx.tiger );
-             sph_tiger( &ctx.tiger, in7, size );
-             sph_tiger_close( &ctx.tiger, hash7 );
-
-             for ( int i = (24/4); i < (64/4); i++ )
-                hash0[i] = hash1[i] = hash2[i] = hash3[i] =
-                hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
-
-             intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
-                          hash6, hash7 );
-             sha512_8way_init( &ctx.sha512 );
-             sha512_8way_update( &ctx.sha512, vhash, 64 );
-             sha512_8way_close( &ctx.sha512, vhash );
-             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-      }
-      size = 64;
-   }
-
-   memcpy( output,     hash0, 32 );
-   memcpy( output+32,  hash1, 32 );
-   memcpy( output+64,  hash2, 32 );
-   memcpy( output+96,  hash3, 32 );
-   memcpy( output+128, hash4, 32 );
-   memcpy( output+160, hash5, 32 );
-   memcpy( output+192, hash6, 32 );
-   memcpy( output+224, hash7, 32 );
-}
-
-int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr)
-{
-   uint32_t hash[8*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
-   uint32_t bedata1[2] __attribute__((aligned(64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 8;
-   uint32_t n = first_nonce;
-    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-   int thr_id = mythr->id;
-   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-
-   if ( opt_benchmark )
-      ptarget[7] = 0x0cff;
-
-   mm512_bswap32_intrlv80_8x64( vdata, pdata );
-
-   bedata1[0] = bswap_32( pdata[1] );
-   bedata1[1] = bswap_32( pdata[2] );
-   const uint32_t ntime = bswap_32( pdata[17] );
-   if ( s_ntime != ntime )
-   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
-      s_ntime = ntime;
-      if ( opt_debug && !thr_id )
-              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
-   }
-
-   do
-   {
-      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
-
-      x16rv2_8way_hash( hash, vdata );
-      pdata[19] = n;
-
-      for ( int i = 0; i < 8; i++ )
-      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
-      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
-      {
-         pdata[19] = n+i;
-         submit_lane_solution( work, hash+(i<<3), mythr, i );
-      }
-      n += 8;
-   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
-
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-
-
-#elif defined (X16R_4WAY)
-
-
-
 union _x16rv2_4way_context_overlay
 {
    blake512_4way_context   blake;
--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
--- a/algo/x17/sonoa-gate.c
+++ b/algo/x17/sonoa-gate.c
@@ -2,10 +2,8 @@

 bool register_sonoa_algo( algo_gate_t* gate )
 {
-#if defined (SONOA_8WAY)
-  gate->scanhash  = (void*)&scanhash_sonoa_8way;
-  gate->hash      = (void*)&sonoa_8way_hash;
-#elif defined (SONOA_4WAY)
+#if defined (SONOA_4WAY)
+//  init_sonoa_4way_ctx();
  gate->scanhash  = (void*)&scanhash_sonoa_4way;
  gate->hash      = (void*)&sonoa_4way_hash;
 #else
@@ -13,7 +11,7 @@ bool register_sonoa_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_sonoa;
  gate->hash      = (void*)&sonoa_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  return true;
 };

--- a/algo/x17/sonoa-gate.h
+++ b/algo/x17/sonoa-gate.h
@@ -4,33 +4,29 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define SONOA_8WAY 1
-#elif defined(__AVX2__) && defined(__AES__)
-  #define SONOA_4WAY 1
+#if defined(__AVX2__) && defined(__AES__)
+  #define SONOA_4WAY
 #endif

 bool register_sonoa_algo( algo_gate_t* gate );

-#if defined(SONOA_8WAY)
-
-void sonoa_8way_hash( void *state, const void *input );
-int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done, struct thr_info *mythr );
-
-#elif defined(SONOA_4WAY)
+#if defined(SONOA_4WAY)

 void sonoa_4way_hash( void *state, const void *input );
+
 int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

-#else
+//void init_sonoa_4way_ctx();
+
+#endif

 void sonoa_hash( void *state, const void *input );
+
 int scanhash_sonoa( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
+
 void init_sonoa_ctx();

 #endif

-#endif
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -1,4 +1,7 @@
 #include "x17-gate.h"
+
+#if defined(X17_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -11,7 +14,6 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
-#include "algo/shavite/sph_shavite.h"
 #include "algo/shavite/shavite-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -22,309 +24,6 @@
 #include "algo/haval/haval-hash-4way.h"
 #include "algo/sha/sha-hash-4way.h"

-#if defined(X17_8WAY)
-
-union _x17_8way_context_overlay
-{
-    blake512_8way_context   blake;
-    bmw512_8way_context     bmw;
-    hashState_groestl       groestl;
-    skein512_8way_context   skein;
-    jh512_8way_context      jh;
-    keccak512_8way_context  keccak;
-    luffa_4way_context      luffa;
-    cube_4way_context       cube;
-    sph_shavite512_context  shavite;
-    simd_4way_context       simd;
-    hashState_echo          echo;
-    hamsi512_8way_context   hamsi;
-    sph_fugue512_context    fugue;
-    shabal512_8way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_8way_context     sha512;
-    haval256_5_8way_context haval;
-} __attribute__ ((aligned (64)));
-typedef union _x17_8way_context_overlay x17_8way_context_overlay;
-
-void x17_8way_hash( void *state, const void *input )
-{
-     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
-     uint64_t vhash0[8*8] __attribute__ ((aligned (64)));
-     uint64_t vhash1[8*8] __attribute__ ((aligned (64)));
-     uint64_t hash0[8] __attribute__ ((aligned (64)));
-     uint64_t hash1[8] __attribute__ ((aligned (64)));
-     uint64_t hash2[8] __attribute__ ((aligned (64)));
-     uint64_t hash3[8] __attribute__ ((aligned (64)));
-     uint64_t hash4[8] __attribute__ ((aligned (64)));
-     uint64_t hash5[8] __attribute__ ((aligned (64)));
-     uint64_t hash6[8] __attribute__ ((aligned (64)));
-     uint64_t hash7[8] __attribute__ ((aligned (64)));
-     x17_8way_context_overlay ctx;
-
-     // 1 Blake parallel 4 way 64 bit
-     blake512_8way_init( &ctx.blake );
-     blake512_8way_update( &ctx.blake, input, 80 );
-     blake512_8way_close( &ctx.blake, vhash );
-
-     // 2 Bmw
-     bmw512_8way_init( &ctx.bmw );
-     bmw512_8way_update( &ctx.bmw, vhash, 64 );
-     bmw512_8way_close( &ctx.bmw, vhash );
-
-     // Serialize
-     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                   vhash );
-
-     // 3 Groestl
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
-
-     // Parallellize
-     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                  hash7 );
-
-     // 4 Skein parallel 4 way 64 bit 
-     skein512_8way_init( &ctx.skein );
-     skein512_8way_update( &ctx.skein, vhash, 64 );
-     skein512_8way_close( &ctx.skein, vhash );
-
-     // 5 JH
-     jh512_8way_init( &ctx.jh );
-     jh512_8way_update( &ctx.jh, vhash, 64 );
-     jh512_8way_close( &ctx.jh, vhash );
-
-     // 6 Keccak
-     keccak512_8way_init( &ctx.keccak );
-     keccak512_8way_update( &ctx.keccak, vhash, 64 );
-     keccak512_8way_close( &ctx.keccak, vhash );
-
-     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
-
-     // 7 Luffa  
-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
-
-     // 8 Cubehash
-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
-
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
-
-     // 9 Shavite
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash0, 64 );
-     sph_shavite512_close( &ctx.shavite, hash0 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash1, 64 );
-     sph_shavite512_close( &ctx.shavite, hash1 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash2, 64 );
-     sph_shavite512_close( &ctx.shavite, hash2 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash3, 64 );
-     sph_shavite512_close( &ctx.shavite, hash3 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash4, 64 );
-     sph_shavite512_close( &ctx.shavite, hash4 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash5, 64 );
-     sph_shavite512_close( &ctx.shavite, hash5 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash6, 64 );
-     sph_shavite512_close( &ctx.shavite, hash6 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash7, 64 );
-     sph_shavite512_close( &ctx.shavite, hash7 );
-
-     // 10 Simd
-     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
-     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-
-
-     // 11 Echo serial
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                            (const BitSequence *) hash0, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                            (const BitSequence *) hash1, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                            (const BitSequence *) hash2, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                            (const BitSequence *) hash3, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash4,
-                            (const BitSequence *) hash4, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash5,
-                            (const BitSequence *) hash5, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash6,
-                            (const BitSequence *) hash6, 512 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash7,
-                            (const BitSequence *) hash7, 512 );
-
-     // 12 Hamsi parallel 4 way 64 bit
-     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                      hash7 );
-
-     hamsi512_8way_init( &ctx.hamsi );
-     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
-     hamsi512_8way_close( &ctx.hamsi, vhash );
-
-     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                       vhash );
-
-     // 13 Fugue serial
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash0, 64 );
-     sph_fugue512_close( &ctx.fugue, hash0 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash1, 64 );
-     sph_fugue512_close( &ctx.fugue, hash1 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash2, 64 );
-     sph_fugue512_close( &ctx.fugue, hash2 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash3, 64 );
-     sph_fugue512_close( &ctx.fugue, hash3 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash4, 64 );
-     sph_fugue512_close( &ctx.fugue, hash4 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash5, 64 );
-     sph_fugue512_close( &ctx.fugue, hash5 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash6, 64 );
-     sph_fugue512_close( &ctx.fugue, hash6 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash7, 64 );
-     sph_fugue512_close( &ctx.fugue, hash7 );
-
-     // 14 Shabal, parallel 4 way 32 bit
-     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                      hash7 );
-
-     shabal512_8way_init( &ctx.shabal );
-     shabal512_8way_update( &ctx.shabal, vhash, 64 );
-     shabal512_8way_close( &ctx.shabal, vhash );
-
-     dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                       vhash );
-
-     // 15 Whirlpool serial
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash0 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash1 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash2 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash3 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash4, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash4 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash5, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash5 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash6, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash6 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash7, 64 );
-     sph_whirlpool_close( &ctx.whirlpool, hash7 );
-
-     // 16 SHA512 parallel 64 bit 
-     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                      hash7 );
-
-     sha512_8way_init( &ctx.sha512 );
-     sha512_8way_update( &ctx.sha512, vhash, 64 );
-     sha512_8way_close( &ctx.sha512, vhash );
-
-     // 17 Haval parallel 32 bit
-     rintrlv_8x64_8x32( vhash0, vhash,  512 );
-
-     haval256_5_8way_init( &ctx.haval );
-     haval256_5_8way_update( &ctx.haval, vhash0, 64 );
-     haval256_5_8way_close( &ctx.haval, state );
-}
-
-int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t hash[8*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<3]);
-   uint32_t *pdata = work->data;
-   const uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-     const uint32_t last_nonce = max_nonce - 8;
-   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-   uint32_t n = first_nonce;
-   const int thr_id = mythr->id;
-   const uint32_t Htarg = ptarget[7];
-
-   mm512_bswap32_intrlv80_8x64( vdata, pdata );
-   do
-   {
-      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
-      x17_8way_hash( hash, vdata );
-
-      for ( int lane = 0; lane < 8; lane++ )
-      if unlikely( ( hash7[ lane ] <= Htarg ) )
-      {
-         extr_lane_8x32( lane_hash, hash, lane, 256 );
-         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
-         {
-            pdata[19] = n + lane;
-            submit_lane_solution( work, lane_hash, mythr, lane );
-         }
-      }
-      n += 8;
-   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
-
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-
-#elif defined(X17_4WAY)
-
 union _x17_4way_context_overlay
 {
    blake512_4way_context   blake;
--- a/algo/x17/x17-gate.c
+++ b/algo/x17/x17-gate.c
@@ -2,17 +2,14 @@

 bool register_x17_algo( algo_gate_t* gate )
 {
-#if defined (X17_8WAY)
-  gate->scanhash  = (void*)&scanhash_x17_8way;
-  gate->hash      = (void*)&x17_8way_hash;
-#elif defined (X17_4WAY)
+#if defined (X17_4WAY)
  gate->scanhash  = (void*)&scanhash_x17_4way;
  gate->hash      = (void*)&x17_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x17;
  gate->hash      = (void*)&x17_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  return true;
 };

--- a/algo/x17/x17-gate.h
+++ b/algo/x17/x17-gate.h
@@ -4,20 +4,13 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X17_8WAY 1
-#elif defined(__AVX2__) && defined(__AES__)
-  #define X17_4WAY 1
+#if defined(__AVX2__) && defined(__AES__)
+  #define X17_4WAY
 #endif

 bool register_x17_algo( algo_gate_t* gate );

-#if defined(X17_8WAY)
-
-void x17_8way_hash( void *state, const void *input );
-int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done, struct thr_info *mythr );
-#elif defined(X17_4WAY)
+#if defined(X17_4WAY)

 void x17_4way_hash( void *state, const void *input );
 int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -1,4 +1,7 @@
 #include "xevan-gate.h"
+
+#if defined(XEVAN_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -12,7 +15,6 @@
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/shavite-hash-2way.h"
-#include "algo/shavite/sph_shavite.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -23,515 +25,6 @@
 #include "algo/sha/sha-hash-4way.h"
 #include "algo/haval/haval-hash-4way.h"

-#if defined(XEVAN_8WAY)
-
-union _xevan_8way_context_overlay
-{
-   blake512_8way_context   blake;
-   bmw512_8way_context     bmw;
-   hashState_groestl       groestl;
-   skein512_8way_context   skein;
-   jh512_8way_context      jh;
-   keccak512_8way_context  keccak;
-   luffa_4way_context      luffa;
-   cube_4way_context       cube;
-   sph_shavite512_context  shavite;
-   simd_4way_context       simd;
-   hashState_echo          echo;
-   hamsi512_8way_context   hamsi;
-   sph_fugue512_context    fugue;
-   shabal512_8way_context  shabal;
-   sph_whirlpool_context   whirlpool;
-   sha512_8way_context     sha512;
-   haval256_5_8way_context haval;
-} __attribute__ ((aligned (64)));
-typedef union _xevan_8way_context_overlay xevan_8way_context_overlay;
-
-void xevan_8way_hash( void *output, const void *input )
-{
-     uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
-     uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
-     uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
-     uint64_t hash0[16] __attribute__ ((aligned (64)));
-     uint64_t hash1[16] __attribute__ ((aligned (64)));
-     uint64_t hash2[16] __attribute__ ((aligned (64)));
-     uint64_t hash3[16] __attribute__ ((aligned (64)));
-     uint64_t hash4[16] __attribute__ ((aligned (64)));
-     uint64_t hash5[16] __attribute__ ((aligned (64)));
-     uint64_t hash6[16] __attribute__ ((aligned (64)));
-     uint64_t hash7[16] __attribute__ ((aligned (64)));
-     const int dataLen = 128;
-     xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
-
-     blake512_8way_init( &ctx.blake );
-     blake512_8way_update( &ctx.blake, input, 80 );
-     blake512_8way_close( &ctx.blake, vhash );
-     memset( &vhash[8<<3], 0, 64<<3 );
-
-     bmw512_8way_init( &ctx.bmw );
-     bmw512_8way_update( &ctx.bmw, vhash, dataLen );
-     bmw512_8way_close( &ctx.bmw, vhash );
-
-     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                   vhash, dataLen<<3 );
-
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
-                               dataLen<<3 );
-
-     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                  hash7, dataLen<<3 );
-
-     skein512_8way_init( &ctx.skein );
-     skein512_8way_update( &ctx.skein, vhash, dataLen );
-     skein512_8way_close( &ctx.skein, vhash );
-
-     jh512_8way_init( &ctx.jh );
-     jh512_8way_update( &ctx.jh, vhash, dataLen );
-     jh512_8way_close( &ctx.jh, vhash );
-
-     keccak512_8way_init( &ctx.keccak );
-     keccak512_8way_update( &ctx.keccak, vhash, dataLen );
-     keccak512_8way_close( &ctx.keccak, vhash );
-
-     rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
-
-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
-
-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
-
-     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
-     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
-
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash0, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash0 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash1, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash1 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash2, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash2 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash3, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash3 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash4, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash4 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash5, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash5 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash6, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash6 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash7, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash7 );
-
-     intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
-     intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
-
-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
-
-     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
-     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
-
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                       (const BitSequence *) hash0, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                       (const BitSequence *) hash1, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                       (const BitSequence *) hash2, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                       (const BitSequence *) hash3, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash4,
-                       (const BitSequence *) hash4, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash5,
-                       (const BitSequence *) hash5, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash6,
-                       (const BitSequence *) hash6, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash7,
-                       (const BitSequence *) hash7, dataLen<<3 );
-
-     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                  hash7, dataLen<<3 );
-
-     hamsi512_8way_init( &ctx.hamsi );
-     hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
-     hamsi512_8way_close( &ctx.hamsi, vhash );
-
-     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                   vhash, dataLen<<3 );
-
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash0, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash0 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash1, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash1 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash2, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash2 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash3, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash3 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash4, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash4 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash5, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash5 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash6, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash6 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash7, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash7 );
-
-     intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                  hash7, dataLen<<3 );
-
-     shabal512_8way_init( &ctx.shabal );
-     shabal512_8way_update( &ctx.shabal, vhash, dataLen );
-     shabal512_8way_close( &ctx.shabal, vhash );
-
-     dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                   vhash, dataLen<<3 );
-
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash0 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash1 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash2 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash3 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash4 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash5 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash6 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash7 );
-
-     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                  hash7, dataLen<<3 );
-
-     sha512_8way_init( &ctx.sha512 );
-     sha512_8way_update( &ctx.sha512, vhash, dataLen );
-     sha512_8way_close( &ctx.sha512, vhash );
-
-     rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
-
-     haval256_5_8way_init( &ctx.haval );
-     haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
-     haval256_5_8way_close( &ctx.haval, vhashA );
-
-     rintrlv_8x32_8x64( vhash, vhashA, dataLen<<3 );
-
-     memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 );
-
-     blake512_8way_init( &ctx.blake );
-     blake512_8way_update( &ctx.blake, vhash, dataLen );
-     blake512_8way_close(&ctx.blake, vhash);
-
-     bmw512_8way_init( &ctx.bmw );
-     bmw512_8way_update( &ctx.bmw, vhash, dataLen );
-     bmw512_8way_close( &ctx.bmw, vhash );
-
-     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                   vhash, dataLen<<3 );
-
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
-                               dataLen<<3 );
-     init_groestl( &ctx.groestl, 64 );
-     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
-                               dataLen<<3 );
-
-     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                  hash7, dataLen<<3 );
-
-     skein512_8way_init( &ctx.skein );
-     skein512_8way_update( &ctx.skein, vhash, dataLen );
-     skein512_8way_close( &ctx.skein, vhash );
-
-     jh512_8way_init( &ctx.jh );
-     jh512_8way_update( &ctx.jh, vhash, dataLen );
-     jh512_8way_close( &ctx.jh, vhash );
-
-     keccak512_8way_init( &ctx.keccak );
-     keccak512_8way_update( &ctx.keccak, vhash, dataLen );
-     keccak512_8way_close( &ctx.keccak, vhash );
-
-     rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
-
-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
-     luffa_4way_init( &ctx.luffa, 512 );
-     luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
-
-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
-     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
-
-     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
-     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
-
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash0, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash0 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash1, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash1 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash2, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash2 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash3, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash3 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash4, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash4 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash5, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash5 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash6, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash6 );
-     sph_shavite512_init( &ctx.shavite );
-     sph_shavite512( &ctx.shavite, hash7, dataLen );
-     sph_shavite512_close( &ctx.shavite, hash7 );
-
-     intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
-     intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
-
-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
-     simd_4way_init( &ctx.simd, 512 );
-     simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
-
-     dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
-     dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
-
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                       (const BitSequence *) hash0, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                       (const BitSequence *) hash1, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                       (const BitSequence *) hash2, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                       (const BitSequence *) hash3, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash4,
-                       (const BitSequence *) hash4, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash5,
-                       (const BitSequence *) hash5, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash6,
-                       (const BitSequence *) hash6, dataLen<<3 );
-     init_echo( &ctx.echo, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash7,
-                       (const BitSequence *) hash7, dataLen<<3 );
-
-     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                  hash7, dataLen<<3 );
-
-     hamsi512_8way_init( &ctx.hamsi );
-     hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
-     hamsi512_8way_close( &ctx.hamsi, vhash );
-
-     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                   vhash, dataLen<<3 );
-
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash0, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash0 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash1, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash1 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash2, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash2 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash3, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash3 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash4, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash4 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash5, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash5 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash6, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash6 );
-     sph_fugue512_init( &ctx.fugue );
-     sph_fugue512( &ctx.fugue, hash7, dataLen );
-     sph_fugue512_close( &ctx.fugue, hash7 );
-
-     intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                  hash7, dataLen<<3 );
-
-     shabal512_8way_init( &ctx.shabal );
-     shabal512_8way_update( &ctx.shabal, vhash, dataLen );
-     shabal512_8way_close( &ctx.shabal, vhash );
-
-     dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                   vhash, dataLen<<3 );
-
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash0 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash1 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash2 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash3 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash4 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash5 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash6 );
-     sph_whirlpool_init( &ctx.whirlpool );
-     sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
-     sph_whirlpool_close( &ctx.whirlpool, hash7 );
-
-     intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                  hash7, dataLen<<3 );
-
-     sha512_8way_init( &ctx.sha512 );
-     sha512_8way_update( &ctx.sha512, vhash, dataLen );
-     sha512_8way_close( &ctx.sha512, vhash );
-
-     rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
-
-     haval256_5_8way_init( &ctx.haval );
-     haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
-     haval256_5_8way_close( &ctx.haval, output );
-}
-
-int scanhash_xevan_8way( struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t hash[8*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<3]);
-   uint32_t *pdata = work->data;
-   const uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 8;
-   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-   uint32_t n = first_nonce;
-   const int thr_id = mythr->id;
-   const uint32_t Htarg = ptarget[7];
-
-   mm512_bswap32_intrlv80_8x64( vdata, pdata );
-   do
-   {
-      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
-      xevan_8way_hash( hash, vdata );
-
-      for ( int lane = 0; lane < 8; lane++ )
-      if unlikely( ( hash7[ lane ] <= Htarg ) )
-      {
-         extr_lane_8x32( lane_hash, hash, lane, 256 );
-         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
-         {
-            pdata[19] = n + lane;
-            submit_lane_solution( work, lane_hash, mythr, lane );
-         }
-      }
-      n += 8;
-   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
-
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-
-#elif defined(XEVAN_4WAY)
-
 union _xevan_4way_context_overlay
 {
 	blake512_4way_context   blake;
--- a/algo/x17/xevan-gate.c
+++ b/algo/x17/xevan-gate.c
@@ -2,10 +2,8 @@

 bool register_xevan_algo( algo_gate_t* gate )
 {
-#if defined (XEVAN_8WAY)
-  gate->scanhash  = (void*)&scanhash_xevan_8way;
-  gate->hash      = (void*)&xevan_8way_hash;
-#elif defined (XEVAN_4WAY)
+#if defined (XEVAN_4WAY)
+//  init_xevan_4way_ctx();
  gate->scanhash  = (void*)&scanhash_xevan_4way;
  gate->hash      = (void*)&xevan_4way_hash;
 #else
--- a/algo/x17/xevan-gate.h
+++ b/algo/x17/xevan-gate.h
@@ -4,21 +4,13 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define XEVAN_8WAY 1
-#elif defined(__AVX2__) && defined(__AES__)
-  #define XEVAN_4WAY 1
+#if defined(__AVX2__) && defined(__AES__)
+  #define XEVAN_4WAY
 #endif

 bool register_xevan_algo( algo_gate_t* gate );

-#if defined(XEVAN_8WAY)
-
-void xevan_8way_hash( void *state, const void *input );
-
-int scanhash_xevan_8way( struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done, struct thr_info *mythr );
-#elif defined(XEVAN_4WAY)
+#if defined(XEVAN_4WAY)

 void xevan_4way_hash( void *state, const void *input );

@@ -27,7 +19,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,

 //void init_xevan_4way_ctx();

-#else
+#endif

 void xevan_hash( void *state, const void *input );

@@ -38,4 +30,3 @@ void init_xevan_ctx();

 #endif

-#endif
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.5.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.3.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.10.5'
-PACKAGE_STRING='cpuminer-opt 3.10.5'
+PACKAGE_VERSION='3.10.3'
+PACKAGE_STRING='cpuminer-opt 3.10.3'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.10.5 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.10.3 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.10.5:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.10.3:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.10.5
+cpuminer-opt configure 3.10.3
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.10.5, which was
+It was created by cpuminer-opt $as_me 3.10.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.10.5'
+ VERSION='3.10.3'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.10.5, which was
+This file was extended by cpuminer-opt $as_me 3.10.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.10.5
+cpuminer-opt config.status 3.10.3
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.10.5])
+AC_INIT([cpuminer-opt], [3.10.3])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/miner.h
+++ b/miner.h
@@ -874,9 +874,9 @@ Options:\n\
                          x16rt-veil    Veil (VEIL)\n\
                          x16s\n\
                          x17\n\
-                          x21s\n\
+                          x21s          Pigeoncoin (PGN)\n\
                          x22i\n\
-                          x25x\n\
+                          x25x          Sinovative (SIN)\n\
                          xevan         Bitsend (BSD)\n\
                          yescrypt      Globalboost-Y (BSTY)\n\
                          yescryptr8    BitZeny (ZNY)\n\
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -897,7 +897,7 @@ static inline void intrlv_16x32_512( void *dst, const void *s00,
   *( (uint32_t*)(d06) +(i) ) = s[ 6]; \
   *( (uint32_t*)(d07) +(i) ) = s[ 7]; \
   *( (uint32_t*)(d08) +(i) ) = s[ 8]; \
-   *( (uint32_t*)(d09) +(i) ) = s[ 9]; \
+   *( (uint32_t*)(d09) +(i) ) = s[ 9]; \
   *( (uint32_t*)(d10) +(i) ) = s[10]; \
   *( (uint32_t*)(d11) +(i) ) = s[11]; \
   *( (uint32_t*)(d12) +(i) ) = s[12]; \
@@ -2055,7 +2055,7 @@ static inline void intrlv_2x256( void *dst, const void *src0,
   if ( bit_len <= 512 ) return;
   d[4] = s0[2];
   if ( bit_len <= 640 ) return;
-                      d[5] = s1[2];
+   d[5] = s1[2];
   d[6] = s0[3];      d[7] = s1[3];
 }

@@ -2075,6 +2075,9 @@ static inline void dintrlv_2x256( void *dst0, void *dst1,
   d0[3] = s[6];      d1[3] = s[7];
 }

+
+
+
 #endif // AVX

 ///////////////////////////
@@ -2162,9 +2165,7 @@ static inline void rintrlv_4x32_4x64( void *dst,
   d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 5] );
   d[ 6] = _mm_unpacklo_epi32( s[ 6], s[ 7] );
   d[ 7] = _mm_unpackhi_epi32( s[ 6], s[ 7] );
-
   if ( bit_len <= 256 ) return;
-
   d[ 8] = _mm_unpacklo_epi32( s[ 8], s[ 9] );
   d[ 9] = _mm_unpackhi_epi32( s[ 8], s[ 9] );
   d[10] = _mm_unpacklo_epi32( s[10], s[11] );
@@ -2173,21 +2174,16 @@ static inline void rintrlv_4x32_4x64( void *dst,
   d[13] = _mm_unpackhi_epi32( s[12], s[13] );
   d[14] = _mm_unpacklo_epi32( s[14], s[15] );
   d[15] = _mm_unpackhi_epi32( s[14], s[15] );
-
   if ( bit_len <= 512 ) return;
-
   d[16] = _mm_unpacklo_epi32( s[16], s[17] );
   d[17] = _mm_unpackhi_epi32( s[16], s[17] );
   d[18] = _mm_unpacklo_epi32( s[18], s[19] );
   d[19] = _mm_unpackhi_epi32( s[18], s[19] );
-
   if ( bit_len <= 640 ) return;
-
   d[20] = _mm_unpacklo_epi32( s[20], s[21] );
   d[21] = _mm_unpackhi_epi32( s[20], s[21] );
   d[22] = _mm_unpacklo_epi32( s[22], s[23] );
   d[23] = _mm_unpackhi_epi32( s[22], s[23] );
-
   d[24] = _mm_unpacklo_epi32( s[24], s[25] );
   d[25] = _mm_unpackhi_epi32( s[24], s[25] );
   d[26] = _mm_unpacklo_epi32( s[26], s[27] );
@@ -2198,93 +2194,6 @@ static inline void rintrlv_4x32_4x64( void *dst,
   d[31] = _mm_unpackhi_epi32( s[30], s[31] );
 }

-// 8x32 -> 8x64
-
-static inline void rintrlv_8x32_8x64( void *dst,
-                                      const void *src, const int bit_len )
-{
-   __m128i *d = (__m128i*)dst;
-   const __m128i *s = (const __m128i*)src;
-
-   d[ 0] = _mm_unpacklo_epi32( s[ 0], s[ 2] );
-   d[ 1] = _mm_unpackhi_epi32( s[ 0], s[ 2] );
-   d[ 2] = _mm_unpacklo_epi32( s[ 1], s[ 3] );
-   d[ 3] = _mm_unpackhi_epi32( s[ 1], s[ 3] );
-   d[ 4] = _mm_unpacklo_epi32( s[ 4], s[ 6] );
-   d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 6] );
-   d[ 6] = _mm_unpacklo_epi32( s[ 5], s[ 7] );
-   d[ 7] = _mm_unpackhi_epi32( s[ 5], s[ 7] );
-
-   d[ 8] = _mm_unpacklo_epi32( s[ 8], s[10] );
-   d[ 9] = _mm_unpackhi_epi32( s[ 8], s[10] );
-   d[10] = _mm_unpacklo_epi32( s[ 9], s[11] );
-   d[11] = _mm_unpackhi_epi32( s[ 9], s[11] );
-   d[12] = _mm_unpacklo_epi32( s[12], s[14] );
-   d[13] = _mm_unpackhi_epi32( s[12], s[14] );
-   d[14] = _mm_unpacklo_epi32( s[13], s[15] );
-   d[15] = _mm_unpackhi_epi32( s[13], s[15] );
-
-   if ( bit_len <= 256 ) return;
-
-   d[16] = _mm_unpacklo_epi32( s[16], s[18] );
-   d[17] = _mm_unpackhi_epi32( s[16], s[18] );
-   d[18] = _mm_unpacklo_epi32( s[17], s[19] );
-   d[19] = _mm_unpackhi_epi32( s[17], s[19] );
-   d[20] = _mm_unpacklo_epi32( s[20], s[22] );
-   d[21] = _mm_unpackhi_epi32( s[20], s[22] );
-   d[22] = _mm_unpacklo_epi32( s[21], s[23] );
-   d[23] = _mm_unpackhi_epi32( s[21], s[23] );
-
-   d[24] = _mm_unpacklo_epi32( s[24], s[26] );
-   d[25] = _mm_unpackhi_epi32( s[24], s[26] );
-   d[26] = _mm_unpacklo_epi32( s[25], s[27] );
-   d[27] = _mm_unpackhi_epi32( s[25], s[27] );
-   d[28] = _mm_unpacklo_epi32( s[28], s[30] );
-   d[29] = _mm_unpackhi_epi32( s[28], s[30] );
-   d[30] = _mm_unpacklo_epi32( s[29], s[31] );
-   d[31] = _mm_unpackhi_epi32( s[29], s[31] );
-
-   if ( bit_len <= 512 ) return;
-
-   d[32] = _mm_unpacklo_epi32( s[32], s[34] );
-   d[33] = _mm_unpackhi_epi32( s[32], s[34] );
-   d[34] = _mm_unpacklo_epi32( s[33], s[35] );
-   d[35] = _mm_unpackhi_epi32( s[33], s[35] );
-   d[36] = _mm_unpacklo_epi32( s[36], s[38] );
-   d[37] = _mm_unpackhi_epi32( s[36], s[38] );
-   d[38] = _mm_unpacklo_epi32( s[37], s[39] );
-   d[39] = _mm_unpackhi_epi32( s[37], s[39] );
-
-   d[40] = _mm_unpacklo_epi32( s[40], s[42] );
-   d[41] = _mm_unpackhi_epi32( s[40], s[42] );
-   d[42] = _mm_unpacklo_epi32( s[41], s[43] );
-   d[43] = _mm_unpackhi_epi32( s[41], s[43] );
-   d[44] = _mm_unpacklo_epi32( s[44], s[46] );
-   d[45] = _mm_unpackhi_epi32( s[44], s[46] );
-   d[46] = _mm_unpacklo_epi32( s[45], s[47] );
-   d[47] = _mm_unpackhi_epi32( s[45], s[47] );
-
-   d[48] = _mm_unpacklo_epi32( s[48], s[50] );
-   d[49] = _mm_unpackhi_epi32( s[48], s[50] );
-   d[50] = _mm_unpacklo_epi32( s[49], s[51] );
-   d[51] = _mm_unpackhi_epi32( s[49], s[51] );
-   d[52] = _mm_unpacklo_epi32( s[52], s[54] );
-   d[53] = _mm_unpackhi_epi32( s[52], s[54] );
-   d[54] = _mm_unpacklo_epi32( s[53], s[55] );
-   d[55] = _mm_unpackhi_epi32( s[53], s[55] );
-
-   d[56] = _mm_unpacklo_epi32( s[56], s[58] );
-   d[57] = _mm_unpackhi_epi32( s[56], s[58] );
-   d[58] = _mm_unpacklo_epi32( s[57], s[59] );
-   d[59] = _mm_unpackhi_epi32( s[57], s[59] );
-   d[60] = _mm_unpacklo_epi32( s[60], s[62] );
-   d[61] = _mm_unpackhi_epi32( s[60], s[62] );
-   d[62] = _mm_unpacklo_epi32( s[61], s[63] );
-   d[63] = _mm_unpackhi_epi32( s[61], s[63] );
-}
-
-
-
 /*
 #define RLEAVE_4x32_4x64(i) do \
 { \
@@ -2316,6 +2225,7 @@ static inline void rintrlv_4x32_4x64( void *dst,

 // 2x128 -> 4x64

+
 static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
                                       const void *src1, const int bit_len )
 {
@@ -2358,6 +2268,7 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
   d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
 }

+
 /*
 #define RLEAVE_2x128_4x64( i ) do \
 { \
@@ -2428,6 +2339,7 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
   d1[15] = _mm_unpackhi_epi64( s[29], s[31] );
 }

+
 /*
 #define RLEAVE_4x64_2x128( i ) do \
 { \
@@ -2452,354 +2364,6 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
 }
 */

-// 2x128 -> 8x64
-
-static inline void rintrlv_4x128_8x64( void *dst, const void *src0,
-                                       const void *src1, const int bit_len )
-{
-   __m128i *d = (__m128i*)dst;
-   const __m128i *s0 = (const __m128i*)src0;
-   const __m128i *s1 = (const __m128i*)src1;
-
-   d[ 0] = _mm_unpacklo_epi64( s0[ 0], s0[ 1] );
-   d[ 1] = _mm_unpacklo_epi64( s0[ 2], s0[ 3] );
-   d[ 2] = _mm_unpacklo_epi64( s1[ 0], s1[ 1] );
-   d[ 3] = _mm_unpacklo_epi64( s1[ 2], s1[ 3] );
-   d[ 4] = _mm_unpackhi_epi64( s0[ 0], s0[ 1] );
-   d[ 5] = _mm_unpackhi_epi64( s0[ 2], s0[ 3] );
-   d[ 6] = _mm_unpackhi_epi64( s1[ 0], s1[ 1] );
-   d[ 7] = _mm_unpackhi_epi64( s1[ 2], s1[ 3] );
-
-   d[ 8] = _mm_unpacklo_epi64( s0[ 4], s0[ 5] );
-   d[ 9] = _mm_unpacklo_epi64( s0[ 6], s0[ 7] );
-   d[10] = _mm_unpacklo_epi64( s1[ 4], s1[ 5] );
-   d[11] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] );
-   d[12] = _mm_unpackhi_epi64( s0[ 4], s0[ 5] );
-   d[13] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] );
-   d[14] = _mm_unpackhi_epi64( s1[ 4], s1[ 5] );
-   d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] );
-
-   if ( bit_len <= 256 ) return;
-
-   d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] );
-   d[17] = _mm_unpacklo_epi64( s0[10], s0[11] );
-   d[18] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] );
-   d[19] = _mm_unpacklo_epi64( s1[10], s1[11] );
-   d[20] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] );
-   d[21] = _mm_unpackhi_epi64( s0[10], s0[11] );
-   d[22] = _mm_unpackhi_epi64( s1[ 8], s1[ 9] );
-   d[23] = _mm_unpackhi_epi64( s1[10], s1[11] );
-
-   d[24] = _mm_unpacklo_epi64( s0[12], s0[13] );
-   d[25] = _mm_unpacklo_epi64( s0[14], s0[15] );
-   d[26] = _mm_unpacklo_epi64( s1[12], s1[13] );
-   d[27] = _mm_unpacklo_epi64( s1[14], s1[15] );
-   d[28] = _mm_unpackhi_epi64( s0[12], s0[13] );
-   d[29] = _mm_unpackhi_epi64( s0[14], s0[15] );
-   d[30] = _mm_unpackhi_epi64( s1[12], s1[13] );
-   d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
-
-   if ( bit_len <= 512 ) return;
-
-   d[32] = _mm_unpacklo_epi64( s0[16], s0[17] );
-   d[33] = _mm_unpacklo_epi64( s0[18], s0[19] );
-   d[34] = _mm_unpacklo_epi64( s1[16], s1[17] );
-   d[35] = _mm_unpacklo_epi64( s1[18], s1[19] );
-   d[36] = _mm_unpackhi_epi64( s0[16], s0[17] );
-   d[37] = _mm_unpackhi_epi64( s0[18], s0[19] );
-   d[38] = _mm_unpackhi_epi64( s1[16], s1[17] );
-   d[39] = _mm_unpackhi_epi64( s1[18], s1[19] );
-
-   d[40] = _mm_unpacklo_epi64( s0[20], s0[21] );
-   d[41] = _mm_unpacklo_epi64( s0[22], s0[23] );
-   d[42] = _mm_unpacklo_epi64( s1[20], s1[21] );
-   d[43] = _mm_unpacklo_epi64( s1[22], s1[23] );
-   d[44] = _mm_unpackhi_epi64( s0[20], s0[21] );
-   d[45] = _mm_unpackhi_epi64( s0[22], s0[23] );
-   d[46] = _mm_unpackhi_epi64( s1[20], s1[21] );
-   d[47] = _mm_unpackhi_epi64( s1[22], s1[23] );
-
-   d[48] = _mm_unpacklo_epi64( s0[24], s0[25] );
-   d[49] = _mm_unpacklo_epi64( s0[26], s0[27] );
-   d[50] = _mm_unpacklo_epi64( s1[24], s1[25] );
-   d[51] = _mm_unpacklo_epi64( s1[26], s1[27] );
-   d[52] = _mm_unpackhi_epi64( s0[24], s0[25] );
-   d[53] = _mm_unpackhi_epi64( s0[26], s0[27] );
-   d[54] = _mm_unpackhi_epi64( s1[24], s1[25] );
-   d[55] = _mm_unpackhi_epi64( s1[26], s1[27] );
-
-   d[56] = _mm_unpacklo_epi64( s0[28], s0[29] );
-   d[57] = _mm_unpacklo_epi64( s0[30], s0[31] );
-   d[58] = _mm_unpacklo_epi64( s1[28], s1[29] );
-   d[59] = _mm_unpacklo_epi64( s1[30], s1[31] );
-   d[60] = _mm_unpackhi_epi64( s0[28], s0[29] );
-   d[61] = _mm_unpackhi_epi64( s0[30], s0[31] );
-   d[62] = _mm_unpackhi_epi64( s1[28], s1[29] );
-   d[63] = _mm_unpackhi_epi64( s1[30], s1[31] );
-}
-
-// 8x64 -> 4x128
-
-static inline void rintrlv_8x64_4x128( void *dst0, void *dst1,
-                                       const void *src, const int bit_len )
-{
-   __m128i *d0 = (__m128i*)dst0;
-   __m128i *d1 = (__m128i*)dst1;
-   const __m128i* s = (const __m128i*)src;
-
-   d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );
-   d0[ 1] = _mm_unpackhi_epi64( s[ 0], s[ 4] );
-   d1[ 0] = _mm_unpacklo_epi64( s[ 2], s[ 6] );
-   d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
-   d0[ 2] = _mm_unpacklo_epi64( s[ 1], s[ 5] );
-   d0[ 3] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
-   d1[ 2] = _mm_unpacklo_epi64( s[ 3], s[ 7] );
-   d1[ 3] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
-
-   d0[ 4] = _mm_unpacklo_epi64( s[ 8], s[12] );
-   d0[ 5] = _mm_unpackhi_epi64( s[ 8], s[12] );
-   d1[ 4] = _mm_unpacklo_epi64( s[10], s[14] );
-   d1[ 5] = _mm_unpackhi_epi64( s[10], s[14] );
-   d0[ 6] = _mm_unpacklo_epi64( s[ 9], s[13] );
-   d0[ 7] = _mm_unpackhi_epi64( s[ 9], s[13] );
-   d1[ 6] = _mm_unpacklo_epi64( s[11], s[15] );
-   d1[ 7] = _mm_unpackhi_epi64( s[11], s[15] );
-
-   if ( bit_len <= 256 ) return;
-
-   d0[ 8] = _mm_unpacklo_epi64( s[16], s[20] );
-   d0[ 9] = _mm_unpackhi_epi64( s[16], s[20] );
-   d1[ 8] = _mm_unpacklo_epi64( s[18], s[22] );
-   d1[ 9] = _mm_unpackhi_epi64( s[18], s[22] );
-   d0[10] = _mm_unpacklo_epi64( s[17], s[21] );
-   d0[11] = _mm_unpackhi_epi64( s[17], s[21] );
-   d1[10] = _mm_unpacklo_epi64( s[19], s[23] );
-   d1[11] = _mm_unpackhi_epi64( s[19], s[23] );
-
-   d0[12] = _mm_unpacklo_epi64( s[24], s[28] );
-   d0[13] = _mm_unpackhi_epi64( s[24], s[28] );
-   d1[12] = _mm_unpacklo_epi64( s[26], s[30] );
-   d1[13] = _mm_unpackhi_epi64( s[26], s[30] );
-   d0[14] = _mm_unpacklo_epi64( s[25], s[29] );
-   d0[15] = _mm_unpackhi_epi64( s[25], s[29] );
-   d1[14] = _mm_unpacklo_epi64( s[27], s[31] );
-   d1[15] = _mm_unpackhi_epi64( s[27], s[31] );
-
-   if ( bit_len <= 512 ) return;
-
-   d0[16] = _mm_unpacklo_epi64( s[32], s[36] );
-   d0[17] = _mm_unpackhi_epi64( s[32], s[36] );
-   d1[16] = _mm_unpacklo_epi64( s[34], s[38] );
-   d1[17] = _mm_unpackhi_epi64( s[34], s[38] );
-   d0[18] = _mm_unpacklo_epi64( s[33], s[37] );
-   d0[19] = _mm_unpackhi_epi64( s[33], s[37] );
-   d1[18] = _mm_unpacklo_epi64( s[35], s[39] );
-   d1[19] = _mm_unpackhi_epi64( s[35], s[39] );
-
-   d0[20] = _mm_unpacklo_epi64( s[40], s[44] );
-   d0[21] = _mm_unpackhi_epi64( s[40], s[44] );
-   d1[20] = _mm_unpacklo_epi64( s[42], s[46] );
-   d1[21] = _mm_unpackhi_epi64( s[42], s[46] );
-   d0[22] = _mm_unpacklo_epi64( s[41], s[45] );
-   d0[23] = _mm_unpackhi_epi64( s[41], s[45] );
-   d1[22] = _mm_unpacklo_epi64( s[43], s[47] );
-   d1[23] = _mm_unpackhi_epi64( s[43], s[47] );
-
-   d0[24] = _mm_unpacklo_epi64( s[48], s[52] );
-   d0[25] = _mm_unpackhi_epi64( s[48], s[52] );
-   d1[24] = _mm_unpacklo_epi64( s[50], s[54] );
-   d1[25] = _mm_unpackhi_epi64( s[50], s[54] );
-   d0[26] = _mm_unpacklo_epi64( s[49], s[53] );
-   d0[27] = _mm_unpackhi_epi64( s[49], s[53] );
-   d1[26] = _mm_unpacklo_epi64( s[51], s[55] );
-   d1[27] = _mm_unpackhi_epi64( s[51], s[55] );
-
-   d0[28] = _mm_unpacklo_epi64( s[56], s[60] );
-   d0[29] = _mm_unpackhi_epi64( s[56], s[60] );
-   d1[28] = _mm_unpacklo_epi64( s[58], s[62] );
-   d1[29] = _mm_unpackhi_epi64( s[58], s[62] );
-   d0[30] = _mm_unpacklo_epi64( s[57], s[61] );
-   d0[31] = _mm_unpackhi_epi64( s[57], s[61] );
-   d1[30] = _mm_unpacklo_epi64( s[59], s[63] );
-   d1[31] = _mm_unpackhi_epi64( s[59], s[63] );
-}
-
-// 8x64 -> 2x256
-
-static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2,
-                          void *dst3,  const void *src, const int bit_len )
-{
-   __m128i *d0 = (__m128i*)dst0;
-   __m128i *d1 = (__m128i*)dst1;
-   __m128i *d2 = (__m128i*)dst2;
-   __m128i *d3 = (__m128i*)dst3;
-   const __m128i* s = (const __m128i*)src;
-
-   d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );
-   d1[ 0] = _mm_unpackhi_epi64( s[ 0], s[ 4] );
-   d2[ 0] = _mm_unpacklo_epi64( s[ 1], s[ 5] );   
-   d3[ 0] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
-   d0[ 1] = _mm_unpacklo_epi64( s[ 2], s[ 6] ); 
-   d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
-   d2[ 1] = _mm_unpacklo_epi64( s[ 3], s[ 7] ); 
-   d3[ 1] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
-   
-   d0[ 2] = _mm_unpacklo_epi64( s[ 8], s[12] ); 
-   d1[ 2] = _mm_unpackhi_epi64( s[ 8], s[12] );
-   d2[ 2] = _mm_unpacklo_epi64( s[ 9], s[13] ); 
-   d3[ 2] = _mm_unpackhi_epi64( s[ 9], s[13] );
-   d0[ 3] = _mm_unpacklo_epi64( s[10], s[14] );
-   d1[ 3] = _mm_unpackhi_epi64( s[10], s[14] );
-   d2[ 3] = _mm_unpacklo_epi64( s[11], s[15] );
-   d3[ 3] = _mm_unpackhi_epi64( s[11], s[15] );
-
-   if ( bit_len <= 256 ) return;
-
-   d0[ 4] = _mm_unpacklo_epi64( s[16], s[20] );
-   d1[ 4] = _mm_unpackhi_epi64( s[16], s[20] );
-   d2[ 4] = _mm_unpacklo_epi64( s[17], s[21] );
-   d3[ 4] = _mm_unpackhi_epi64( s[17], s[21] );
-   d0[ 5] = _mm_unpacklo_epi64( s[18], s[22] );
-   d1[ 5] = _mm_unpackhi_epi64( s[18], s[22] );
-   d2[ 5] = _mm_unpacklo_epi64( s[19], s[23] );
-   d3[ 5] = _mm_unpackhi_epi64( s[19], s[23] );
-   
-   d0[ 6] = _mm_unpacklo_epi64( s[24], s[28] );
-   d1[ 6] = _mm_unpackhi_epi64( s[24], s[28] );
-   d2[ 6] = _mm_unpacklo_epi64( s[25], s[29] );
-   d3[ 6] = _mm_unpackhi_epi64( s[25], s[29] );
-   d0[ 7] = _mm_unpacklo_epi64( s[26], s[30] );
-   d1[ 7] = _mm_unpackhi_epi64( s[26], s[30] );
-   d2[ 7] = _mm_unpacklo_epi64( s[27], s[31] );
-   d3[ 7] = _mm_unpackhi_epi64( s[27], s[31] );
-
-   if ( bit_len <= 512 ) return;
-
-   d0[ 8] = _mm_unpacklo_epi64( s[32], s[36] );
-   d1[ 8] = _mm_unpackhi_epi64( s[32], s[36] );
-   d2[ 8] = _mm_unpacklo_epi64( s[33], s[37] );
-   d3[ 8] = _mm_unpackhi_epi64( s[33], s[37] );
-   d0[ 9] = _mm_unpacklo_epi64( s[34], s[38] );
-   d1[ 9] = _mm_unpackhi_epi64( s[34], s[38] );
-   d2[ 9] = _mm_unpacklo_epi64( s[35], s[39] );
-   d3[ 9] = _mm_unpackhi_epi64( s[35], s[39] );
-
-   d0[10] = _mm_unpacklo_epi64( s[40], s[44] );
-   d1[10] = _mm_unpackhi_epi64( s[40], s[44] );
-   d2[10] = _mm_unpacklo_epi64( s[41], s[45] );
-   d3[10] = _mm_unpackhi_epi64( s[41], s[45] );
-   d0[11] = _mm_unpacklo_epi64( s[42], s[46] );
-   d1[11] = _mm_unpackhi_epi64( s[42], s[46] );
-   d2[11] = _mm_unpacklo_epi64( s[43], s[47] );
-   d3[11] = _mm_unpackhi_epi64( s[43], s[47] );
-
-   d0[12] = _mm_unpacklo_epi64( s[48], s[52] );
-   d1[12] = _mm_unpackhi_epi64( s[48], s[52] );
-   d2[12] = _mm_unpacklo_epi64( s[49], s[53] );
-   d3[12] = _mm_unpackhi_epi64( s[49], s[53] );
-   d0[13] = _mm_unpacklo_epi64( s[50], s[54] );
-   d1[13] = _mm_unpackhi_epi64( s[50], s[54] );
-   d2[13] = _mm_unpacklo_epi64( s[51], s[55] );
-   d3[13] = _mm_unpackhi_epi64( s[51], s[55] );
-
-   d0[14] = _mm_unpacklo_epi64( s[56], s[60] );
-   d1[14] = _mm_unpackhi_epi64( s[56], s[60] );
-   d2[14] = _mm_unpacklo_epi64( s[57], s[61] );
-   d3[14] = _mm_unpackhi_epi64( s[57], s[61] );
-   d0[15] = _mm_unpacklo_epi64( s[58], s[62] );
-   d1[15] = _mm_unpackhi_epi64( s[58], s[62] );
-   d2[15] = _mm_unpacklo_epi64( s[59], s[63] );
-   d3[15] = _mm_unpackhi_epi64( s[59], s[63] );
-}
-
-// 4x128 -> 8x64
-
-static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
-      const void *src1, const void *src2, const void *src3, const int bit_len )
-{
-   __m128i *d = (__m128i*)dst;
-   __m128i *s0 = (__m128i*)src0;
-   __m128i *s1 = (__m128i*)src1;
-   __m128i *s2 = (__m128i*)src2;
-   __m128i *s3 = (__m128i*)src3;
-
-   d[ 0] = _mm_unpacklo_epi64( s0[0], s0[2] );
-   d[ 1] = _mm_unpacklo_epi64( s1[0], s1[2] );
-   d[ 2] = _mm_unpacklo_epi64( s2[0], s2[2] );
-   d[ 3] = _mm_unpacklo_epi64( s3[0], s3[2] );
-   d[ 4] = _mm_unpackhi_epi64( s0[0], s0[2] );
-   d[ 5] = _mm_unpackhi_epi64( s1[0], s1[2] );
-   d[ 6] = _mm_unpackhi_epi64( s2[0], s2[2] );
-   d[ 7] = _mm_unpackhi_epi64( s3[0], s3[2] );
-
-   d[ 8] = _mm_unpacklo_epi64( s0[1], s0[3] );
-   d[ 9] = _mm_unpacklo_epi64( s1[1], s1[3] );
-   d[10] = _mm_unpacklo_epi64( s2[1], s2[3] );
-   d[11] = _mm_unpacklo_epi64( s3[1], s3[3] );
-   d[12] = _mm_unpackhi_epi64( s0[1], s0[3] );
-   d[13] = _mm_unpackhi_epi64( s1[1], s1[3] );
-   d[14] = _mm_unpackhi_epi64( s2[1], s2[3] );
-   d[15] = _mm_unpackhi_epi64( s3[1], s3[3] );
-
-   if ( bit_len <= 256 ) return;
-
-   d[16] = _mm_unpacklo_epi64( s0[4], s0[6] );
-   d[17] = _mm_unpacklo_epi64( s1[4], s1[6] );
-   d[18] = _mm_unpacklo_epi64( s2[4], s2[6] );
-   d[19] = _mm_unpacklo_epi64( s3[4], s3[6] );
-   d[20] = _mm_unpackhi_epi64( s0[4], s0[6] );
-   d[21] = _mm_unpackhi_epi64( s1[4], s1[6] );
-   d[22] = _mm_unpackhi_epi64( s2[4], s2[6] );
-   d[23] = _mm_unpackhi_epi64( s3[4], s3[6] );
-
-   d[24] = _mm_unpacklo_epi64( s0[5], s0[7] );
-   d[25] = _mm_unpacklo_epi64( s1[5], s1[7] );
-   d[26] = _mm_unpacklo_epi64( s2[5], s2[7] );
-   d[27] = _mm_unpacklo_epi64( s3[5], s3[7] );
-   d[28] = _mm_unpackhi_epi64( s0[5], s0[7] );
-   d[29] = _mm_unpackhi_epi64( s1[5], s1[7] );
-   d[30] = _mm_unpackhi_epi64( s2[5], s2[7] );
-   d[31] = _mm_unpackhi_epi64( s3[5], s3[7] );
-
-   if ( bit_len <= 512 ) return;
-
-   d[32] = _mm_unpacklo_epi64( s0[8], s0[10] );
-   d[33] = _mm_unpacklo_epi64( s1[8], s1[10] );
-   d[34] = _mm_unpacklo_epi64( s2[8], s2[10] );
-   d[35] = _mm_unpacklo_epi64( s3[8], s3[10] );
-   d[36] = _mm_unpackhi_epi64( s0[8], s0[10] );
-   d[37] = _mm_unpackhi_epi64( s1[8], s1[10] );
-   d[38] = _mm_unpackhi_epi64( s2[8], s2[10] );
-   d[39] = _mm_unpackhi_epi64( s3[8], s3[10] );
-
-   d[40] = _mm_unpacklo_epi64( s0[9], s0[11] );
-   d[41] = _mm_unpacklo_epi64( s1[9], s1[11] );
-   d[42] = _mm_unpacklo_epi64( s2[9], s2[11] );
-   d[43] = _mm_unpacklo_epi64( s3[9], s3[11] );
-   d[44] = _mm_unpackhi_epi64( s0[9], s0[11] );
-   d[45] = _mm_unpackhi_epi64( s1[9], s1[11] );
-   d[46] = _mm_unpackhi_epi64( s2[9], s2[11] );
-   d[47] = _mm_unpackhi_epi64( s3[9], s3[11] );
-
-   d[48] = _mm_unpacklo_epi64( s0[12], s0[14] );
-   d[49] = _mm_unpacklo_epi64( s1[12], s1[14] );
-   d[50] = _mm_unpacklo_epi64( s2[12], s2[14] );
-   d[51] = _mm_unpacklo_epi64( s3[12], s3[14] );
-   d[52] = _mm_unpackhi_epi64( s0[12], s0[14] );
-   d[53] = _mm_unpackhi_epi64( s1[12], s1[14] );
-   d[54] = _mm_unpackhi_epi64( s2[12], s2[14] );
-   d[55] = _mm_unpackhi_epi64( s3[12], s3[14] );
-
-   d[56] = _mm_unpacklo_epi64( s0[13], s0[15] );
-   d[57] = _mm_unpacklo_epi64( s1[13], s1[15] );
-   d[58] = _mm_unpacklo_epi64( s2[13], s2[15] );
-   d[59] = _mm_unpacklo_epi64( s3[13], s3[15] );
-   d[60] = _mm_unpackhi_epi64( s0[13], s0[15] );
-   d[61] = _mm_unpackhi_epi64( s1[13], s1[15] );
-   d[62] = _mm_unpackhi_epi64( s2[13], s2[15] );
-   d[63] = _mm_unpackhi_epi64( s3[13], s3[15] );
-}
-
 //
 // Some functions customized for mining.

--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -270,7 +270,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
               m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
                              0x28292a2b2c2d2e2f, 0x2021222324252627, \
                              0x18191a1b1c1d1e1f, 0x1011121314151617, \
-                              0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
+                              0x08090a0b0c0d0e0f, 0x0001020304050607 ))

 #define mm512_bswap_32( v ) \
   _mm512_shuffle_epi8( v, \