v3.19.9

2025-09-17 23:44:27 +00:00 · 2022-07-10 11:04:00 -04:00
parent 26b8927632
commit f552f2b1e8
27 changed files with 883 additions and 396 deletions
--- a/.RELEASE_NOTES.swp
+++ b/.RELEASE_NOTES.swp
--- a/9
+++ b/9
@@ -22,7 +22,7 @@ required.
 Compile Instructions
 --------------------

-See INSTALL_LINUX or INSTALL_WINDOWS for compile instruuctions
+See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions

 Requirements
 ------------
@@ -65,6 +65,11 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v3.19.9
+
+More Blake256, Blake512, Luffa & Cubehash prehash optimizations.
+Relaxed some excessively strict data alignment that was negatively affecting performance.
+
 v3.19.8

 #370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
@@ -75,7 +80,7 @@ log and the periodic summary log.

 Small optimizations to Cubehash, AVX2 & AVX512.

-Byte order and prehash optimizations for blake256 & blake512, AVX2 & AVX512.
+Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.

 v3.19.7

--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -49,6 +49,20 @@ extern "C"{

 #define SPH_SIZE_blake512   512

+/////////////////////////
+//
+//  Blake-256 1 way SSE2
+
+void  blake256_transform_le( uint32_t *H, const uint32_t *buf,
+                             const uint32_t T0, const uint32_t T1 );
+
+/////////////////////////
+//
+//  Blake-512 1 way SSE2
+
+void  blake512_transform_le( uint64_t *H, const uint64_t *buf,
+                             const uint64_t T0, const uint64_t T1 );
+
 //////////////////////////
 //
 //   Blake-256 4 way SSE2
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -5,6 +5,7 @@
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *               2016-2022  JayDDee246@gmail.com
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
@@ -304,6 +305,98 @@ static const sph_u32 CS[16] = {

 #endif

+/////////////////////////////////////////
+//
+// Blake-256 1 way SIMD
+
+#define BLAKE256_ROUND( r ) /* one Blake-256 round; uses V0..V3 and the Mx/CSx lookups of the expansion site */ \
+{ \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
+                                          CSx( r, 5 ) ^ Mx( r, 4 ), \
+                                          CSx( r, 3 ) ^ Mx( r, 2 ), \
+                                          CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); /* V0 += V1 + (const ^ msg), entries 0..7 of round r */ \
+   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
+                                          CSx( r, 4 ) ^ Mx( r, 5 ), \
+                                          CSx( r, 2 ) ^ Mx( r, 3 ), \
+                                          CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); /* same add with const/msg indices of each pair swapped */ \
+   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
+   V3 = mm128_shufll_32( V3 ); /* NOTE(review): these three shuffles presumably rotate lanes for the diagonal half - confirm in helper defs */ \
+   V2 = mm128_swap_64( V2 ); \
+   V1 = mm128_shuflr_32( V1 ); \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, F ) ^ Mx( r, E ), \
+                                          CSx( r, D ) ^ Mx( r, C ), \
+                                          CSx( r, B ) ^ Mx( r, A ), \
+                                          CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); /* second half: entries 8..F of round r */ \
+   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, E ) ^ Mx( r, F ), \
+                                          CSx( r, C ) ^ Mx( r, D ), \
+                                          CSx( r, A ) ^ Mx( r, B ), \
+                                          CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
+   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
+   V3 = mm128_shuflr_32( V3 ); /* shuffle directions on V3 and V1 are swapped relative to the first half, presumably restoring lane order - confirm */ \
+   V2 = mm128_swap_64( V2 ); \
+   V1 = mm128_shufll_32( V1 ); \
+}
+
+void blake256_transform_le( uint32_t *H, const uint32_t *buf,   // H: state, updated in place; buf: 16 message words
+                            const uint32_t T0, const uint32_t T1 )   // T0, T1: counter words mixed into V3
+{   // Runs 14 BLAKE256_ROUND steps over one block and folds the result back into H.
+   __m128i V0, V1, V2, V3;
+   uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
+   V0 = casti_m128i( H, 0 );   // NOTE(review): casti_m128i presumably accesses H as __m128i, implying 16-byte alignment - confirm
+   V1 = casti_m128i( H, 1 );
+   V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );   // fixed constants, highest lane listed first
+   V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,   // upper two lanes carry T1
+                       T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );   // lower two lanes carry T0
+   M0 = buf[ 0];   // message words are used exactly as stored, no byte swap
+   M1 = buf[ 1];
+   M2 = buf[ 2];
+   M3 = buf[ 3];
+   M4 = buf[ 4];
+   M5 = buf[ 5];
+   M6 = buf[ 6];
+   M7 = buf[ 7];
+   M8 = buf[ 8];
+   M9 = buf[ 9];
+   MA = buf[10];
+   MB = buf[11];
+   MC = buf[12];
+   MD = buf[13];
+   ME = buf[14];
+   MF = buf[15];
+   BLAKE256_ROUND( 0 );
+   BLAKE256_ROUND( 1 );
+   BLAKE256_ROUND( 2 );
+   BLAKE256_ROUND( 3 );
+   BLAKE256_ROUND( 4 );
+   BLAKE256_ROUND( 5 );
+   BLAKE256_ROUND( 6 );
+   BLAKE256_ROUND( 7 );
+   BLAKE256_ROUND( 8 );
+   BLAKE256_ROUND( 9 );
+   BLAKE256_ROUND( 0 );   // rounds 10..13 reuse the index sets of rounds 0..3
+   BLAKE256_ROUND( 1 );
+   BLAKE256_ROUND( 2 );
+   BLAKE256_ROUND( 3 );
+   casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );   // assuming mm128_xor3 is a 3-way XOR: H[0..3] ^= V0 ^ V2
+   casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );   // likewise H[4..7] ^= V1 ^ V3; no salt term appears here
+}
+
+////////////////////////////////////////////
+//
 // Blake-256 4 way

 #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
@@ -544,6 +637,8 @@ do { \

 #if defined (__AVX2__)

+/////////////////////////////////
+//
 // Blake-256 8 way

 #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
@@ -778,6 +873,17 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
   // G4   
   V[ 0] = _mm256_add_epi32( V[ 0],
                         _mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) );
+
+   // G6   
+   V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 7] ),
+                         _mm256_xor_si256( _mm256_set1_epi32( CSD ), M[12] ) );
+
+   // G7   
+   V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
+                         _mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
+   V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 );
+   V[ 3] = _mm256_add_epi32( V[ 3],
+                         _mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
 }

 void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
@@ -844,10 +950,26 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
   VA = _mm256_add_epi32( VA, VF );
   V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );

-   // G5,G6,G7
+   // G5
   GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
-   GS_8WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
-   GS_8WAY( ME, MF, CSE, CSF, V3, V4, V9, VE );
+
+   // G6
+   VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 );
+   V8 = _mm256_add_epi32( V8, VD );
+   V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
+   V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
+                         _mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
+   VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 );
+   V8 = _mm256_add_epi32( V8, VD );
+   V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
+
+   // G7
+   V9 = _mm256_add_epi32( V9, VE );
+   V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 );
+   V3 = _mm256_add_epi32( V3, V4 );
+   VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 );
+   V9 = _mm256_add_epi32( V9, VE );
+   V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );

   // Remaining rounds   
   ROUND_S_8WAY( 1 );
@@ -878,12 +1000,12 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
   H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
 }

-
 #endif

-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+///////////////////////////////////////
+//
 // Blake-256 16 way AVX512

 #define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
@@ -1078,10 +1200,10 @@ do { \
   H7 = mm512_xor3( VF, V7, H7 ); \
 } while (0)

-
-// data points to a prefilled final block containing the last 16 bytes of the
-// blockheader plus padding. midhash is the hash from the first block.
-// Prehash as much as possible without the nonce.
+// Blake-256 prehash of the second block is split into 2 parts. The first part
+// is constant for every nonce and only needs to be run once per job. The
+// second part is run for each nonce using the precalculated midstate and the
+// hash from the first block.
 void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
                                       const void *data )
 {
@@ -1106,11 +1228,11 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
   V[14] = m512_const1_32( CS6 );
   V[15] = m512_const1_32( CS7 );

-// G0   
+   // G0   
   GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );

-// G1   
-//  GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD);
+   // G1, nonce is in M[3]   
+   // GS_16WAY( M[ 2], M[ 3], CS2, CS3, V1, V5, V9, VD );
   V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ),
                         _mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) );
   V[13] = mm512_ror_32( _mm512_xor_si512( V[13], V[ 1] ), 16 );
@@ -1118,21 +1240,29 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
   V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 );
   V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] );

-
-// G2,G3
+   // G2,G3
   GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
   GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );

-// G4   
-//   GS_16WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF);
+   // G4   
+   // GS_16WAY( M[ 8], M[ 9], CS8, CS9, V0, V5, VA, VF );
   V[ 0] = _mm512_add_epi32( V[ 0],
                         _mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) ); 
   
-// G5,G6,G7
-//   GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC);
-//   GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD);
-//   GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE);
+   // G5
+   // GS_16WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );

+   // G6   
+   // GS_16WAY( M[12], M[13], CSC, CSD, V2, V7, V8, VD );
+   V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 7] ),
+                         _mm512_xor_si512( _mm512_set1_epi32( CSD ), M[12] ) );
+   // G7   
+   // GS_16WAY( M[14], M[15], CSE, CSF, V3, V4, V9, VE );
+   V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 4] ),
+                         _mm512_xor_si512( _mm512_set1_epi32( CSF ), M[14] ) );
+   V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 3] ), 16 );
+   V[ 3] = _mm512_add_epi32( V[ 3],
+                         _mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
 }

 void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
@@ -1180,13 +1310,12 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
   ME = casti_m512i( data, 14 ); 
   MF = casti_m512i( data, 15 ); 

-   // Finish round 0   
+   // Finish round 0 with the nonce (M3) now available
   // G0   
-   // GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
+   // GS_16WAY( M0, M1, CS0, CS1, V0, V4, V8, VC );

   // G1   
   // GS_16WAY( M2, M3, CS2, CS3, V1, V5, V9, VD );
-
   V1 = _mm512_add_epi32( V1, 
                         _mm512_xor_si512( _mm512_set1_epi32( CS2 ), M3 ) );
   VD = mm512_ror_32( _mm512_xor_si512( VD, V1 ), 8 );
@@ -1199,7 +1328,6 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,

   // G4
   // GS_16WAY( M8, M9, CS8, CS9, V0, V5, VA, VF );
-
   V0 = _mm512_add_epi32( V0, V5 );
   VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 16 );
   VA = _mm512_add_epi32( VA, VF );
@@ -1210,10 +1338,28 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
   VA = _mm512_add_epi32( VA, VF );
   V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 );

-   // G5,G6,G7
+   // G5
   GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
-   GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
-   GS_16WAY( ME, MF, CSE, CSF, V3, V4, V9, VE );
+
+   // G6
+   // GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
+   VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 16 );
+   V8 = _mm512_add_epi32( V8, VD );
+   V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 12 );
+   V2 = _mm512_add_epi32( _mm512_add_epi32( V2, V7 ),
+                         _mm512_xor_si512( _mm512_set1_epi32( CSC ), MD ) );
+   VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 8 );
+   V8 = _mm512_add_epi32( V8, VD );
+   V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 7 );
+
+   // G7
+   // GS_16WAY( ME, MF, CSE, CSF, V3, V4, V9, VE );
+   V9 = _mm512_add_epi32( V9, VE );
+   V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 12 );
+   V3 = _mm512_add_epi32( V3, V4 );
+   VE = mm512_ror_32( _mm512_xor_si512( VE, V3 ), 8 );
+   V9 = _mm512_add_epi32( V9, VE );
+   V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 7 );

   // Remaining rounds   
   ROUND_S_16WAY( 1 );
@@ -1230,6 +1376,7 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
   ROUND_S_16WAY( 2 );
   ROUND_S_16WAY( 3 );

+   // Byte swap final hash
   const __m512i shuf_bswap32 =
                  m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233,
                                 0x2c2d2e2f28292a2b, 0x2425262720212223,
--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -599,13 +599,13 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
   VE = _mm512_set1_epi64( CB6 );
   VF = _mm512_set1_epi64( CB7 );

-   // skip the nonce
+   // round 0
   GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
   GB_8WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
   GB_8WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
   GB_8WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );

-   // Do half of G4
+   // Do half of G4, skip the nonce
   // GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF );

   V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( 
@@ -619,6 +619,21 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
   GB_8WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
   GB_8WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
   
+   // round 1
+   // G1   
+//   GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD);
+   V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( CB8 ),
+           sc->buf[ 4] ) );
+
+   // G2
+//   GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
+   V2 = _mm512_add_epi64( V2, V6 ); 
+
+   // G3
+//   GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF);
+   V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
+                 _mm512_set1_epi64( CB6 ), sc->buf[13] ), V7 ) );
+
   // save midstate for second part
   midstate[ 0] = V0;
   midstate[ 1] = V1;
@@ -689,9 +704,61 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
   VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 );
   VA = _mm512_add_epi64( VA, VF );
   V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 );
-   
+  
+   // Round 1
+   // G0
+   GB_8WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
+
+   // G1
+//   GB_8WAY(Mx(1, 2), Mx(1, 3), CBx(1, 2), CBx(1, 3), V1, V5, V9, VD);
+//   V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
+
+   V1 = _mm512_add_epi64( V1, V5 );   
+   VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 );
+   V9 = _mm512_add_epi64( V9, VD );
+   V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
+   V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
+                 _mm512_set1_epi64( CBx(1,2) ), Mx(1,3) ), V5 ) );   
+   VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 16 );
+   V9 = _mm512_add_epi64( V9, VD );
+   V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 11 );
+
+   // G2
+//   GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
+//   V2 = _mm512_add_epi64( V2, V6 );
+   V2 = _mm512_add_epi64( V2, _mm512_xor_si512( 
+                 _mm512_set1_epi64( CBF ), M9 ) );
+   VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 );
+   VA = _mm512_add_epi64( VA, VE );
+   V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
+   V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
+                 _mm512_set1_epi64( CB9 ), MF ), V6 ) );
+   VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 16 );
+   VA = _mm512_add_epi64( VA, VE );
+   V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 11 );
+
+   // G3
+//   GB_8WAY(Mx(1, 6), Mx(1, 7), CBx(1, 6), CBx(1, 7), V3, V7, VB, VF);
+//   V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512( 
+//                 _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) ); 
+
+   VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 ); 
+   VB = _mm512_add_epi64( VB, VF ); 
+   V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
+   V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
+                 _mm512_set1_epi64( CBx(1, 6) ), Mx(1, 7) ), V7 ) ); 
+   VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 16 ); 
+   VB = _mm512_add_epi64( VB, VF ); 
+   V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 11 );
+
+   // G4, G5, G6, G7
+   GB_8WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
+   GB_8WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
+   GB_8WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
+   GB_8WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
+
+
   // remaining rounds  
-   ROUND_B_8WAY(1);
   ROUND_B_8WAY(2);
   ROUND_B_8WAY(3);
   ROUND_B_8WAY(4);
@@ -1202,12 +1269,13 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
   VE = _mm256_set1_epi64x( CB6 );
   VF = _mm256_set1_epi64x( CB7 );

+   // round 0
   GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
   GB_4WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
   GB_4WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
   GB_4WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );

-   // skip nonce
+   // G4 skip nonce
   V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
                       _mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
   VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 );
@@ -1218,7 +1286,19 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
   GB_4WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
   GB_4WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
   GB_4WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
-   
+
+   // round 1
+   // G1   
+   V1 = _mm256_add_epi64( V1, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ),
+           sc->buf[ 4] ) );
+
+   // G2
+   V2 = _mm256_add_epi64( V2, V6 );
+
+   // G3
+   V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
+                 _mm256_set1_epi64x( CB6 ), sc->buf[13] ), V7 ) );
+
   // save midstate for second part
   midstate[ 0] = V0;
   midstate[ 1] = V1;
@@ -1289,7 +1369,49 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
   VA = _mm256_add_epi64( VA, VF );
   V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );

-   ROUND_B_4WAY(1);
+   // Round 1
+   // G0
+   GB_4WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
+
+   // G1
+   V1 = _mm256_add_epi64( V1, V5 );
+   VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 );
+   V9 = _mm256_add_epi64( V9, VD );
+   V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
+   V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
+                 _mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
+   VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 );
+   V9 = _mm256_add_epi64( V9, VD );
+   V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
+
+   // G2
+   V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
+                 _mm256_set1_epi64x( CBF ), M9 ) );
+   VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 );
+   VA = _mm256_add_epi64( VA, VE );
+   V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
+   V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
+                 _mm256_set1_epi64x( CB9 ), MF ), V6 ) );
+   VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 );
+   VA = _mm256_add_epi64( VA, VE );
+   V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
+
+   // G3
+   VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 );
+   VB = _mm256_add_epi64( VB, VF );
+   V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
+   V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
+                 _mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
+   VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 );
+   VB = _mm256_add_epi64( VB, VF );
+   V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );
+
+   // G4, G5, G6, G7
+   GB_4WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
+   GB_4WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
+   GB_4WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
+   GB_4WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
+
   ROUND_B_4WAY(2);
   ROUND_B_4WAY(3);
   ROUND_B_4WAY(4);
--- a/algo/blake/sph_blake.c
+++ b/algo/blake/sph_blake.c
@@ -630,6 +630,69 @@ static const sph_u64 CB[16] = {
 		H7 ^= S3 ^ V7 ^ VF; \
 	} while (0)

+#define COMPRESS32_LE   do { /* compress the 64-byte block at buf; message words are loaded without byte swapping */ \
+      sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
+      sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
+      sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
+      sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
+      V0 = H0; /* H0..H7, S0..S3, T0, T1 and buf must already exist at the expansion site */ \
+      V1 = H1; \
+      V2 = H2; \
+      V3 = H3; \
+      V4 = H4; \
+      V5 = H5; \
+      V6 = H6; \
+      V7 = H7; \
+      V8 = S0 ^ CS0; /* V8..VB = S0..S3 ^ CS0..CS3 */ \
+      V9 = S1 ^ CS1; \
+      VA = S2 ^ CS2; \
+      VB = S3 ^ CS3; \
+      VC = T0 ^ CS4; /* VC, VD take T0; VE, VF take T1 */ \
+      VD = T0 ^ CS5; \
+      VE = T1 ^ CS6; \
+      VF = T1 ^ CS7; \
+      M0 = *((uint32_t*)(buf +  0)); /* direct 32-bit loads; NOTE(review): the cast assumes buf is suitably aligned - confirm */ \
+      M1 = *((uint32_t*)(buf +  4)); \
+      M2 = *((uint32_t*)(buf +  8)); \
+      M3 = *((uint32_t*)(buf + 12)); \
+      M4 = *((uint32_t*)(buf + 16)); \
+      M5 = *((uint32_t*)(buf + 20)); \
+      M6 = *((uint32_t*)(buf + 24)); \
+      M7 = *((uint32_t*)(buf + 28)); \
+      M8 = *((uint32_t*)(buf + 32)); \
+      M9 = *((uint32_t*)(buf + 36)); \
+      MA = *((uint32_t*)(buf + 40)); \
+      MB = *((uint32_t*)(buf + 44)); \
+      MC = *((uint32_t*)(buf + 48)); \
+      MD = *((uint32_t*)(buf + 52)); \
+      ME = *((uint32_t*)(buf + 56)); \
+      MF = *((uint32_t*)(buf + 60)); \
+      ROUND_S(0); \
+      ROUND_S(1); \
+      ROUND_S(2); \
+      ROUND_S(3); \
+      ROUND_S(4); \
+      ROUND_S(5); \
+      ROUND_S(6); \
+      ROUND_S(7); \
+      if (BLAKE32_ROUNDS == 14) { /* six further rounds only in the 14-round configuration */ \
+      ROUND_S(8); \
+      ROUND_S(9); \
+      ROUND_S(0); \
+      ROUND_S(1); \
+      ROUND_S(2); \
+      ROUND_S(3); \
+      } \
+      H0 ^= S0 ^ V0 ^ V8; /* fold S and both halves of V back into H */ \
+      H1 ^= S1 ^ V1 ^ V9; \
+      H2 ^= S2 ^ V2 ^ VA; \
+      H3 ^= S3 ^ V3 ^ VB; \
+      H4 ^= S0 ^ V4 ^ VC; \
+      H5 ^= S1 ^ V5 ^ VD; \
+      H6 ^= S2 ^ V6 ^ VE; \
+      H7 ^= S3 ^ V7 ^ VF; \
+   } while (0)
+
 #endif

 #if SPH_64
@@ -843,6 +906,45 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)
 	sc->ptr = ptr;
 }

+static void
+blake32_le(sph_blake_small_context *sc, const void *data, size_t len)   // absorb len bytes, compressing each full buffer with COMPRESS32_LE
+{
+   unsigned char *buf;
+   size_t ptr;
+   DECL_STATE32   // presumably declares the H/S/T locals that COMPRESS32_LE uses - confirm at the macro definition
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+
+   if (len < (sizeof sc->buf) - ptr) {   // input does not fill the buffer: stash it and return
+      memcpy(buf + ptr, data, len);
+      ptr += len;
+      sc->ptr = ptr;
+      return;
+   }
+
+   READ_STATE32(sc);   // presumably loads the state from sc into the locals - confirm
+   while (len > 0) {
+      size_t clen;
+
+      clen = (sizeof sc->buf) - ptr;   // room left in the block buffer
+      if (clen > len)
+         clen = len;
+      memcpy(buf + ptr, data, clen);
+      ptr += clen;
+      data = (const unsigned char *)data + clen;
+      len -= clen;
+      if (ptr == sizeof sc->buf) {   // buffer full: count it and compress
+         if ((T0 = SPH_T32(T0 + 512)) < 512)   // add 512 to T0; a result below 512 is treated as wrap-around
+            T1 = SPH_T32(T1 + 1);   // carry into T1
+         COMPRESS32_LE;
+         ptr = 0;
+      }
+   }
+   WRITE_STATE32(sc);   // presumably stores the locals back into sc - confirm
+   sc->ptr = ptr;   // remember how many bytes remain buffered
+}
+
 static void
 blake32_close(sph_blake_small_context *sc,
 	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
@@ -1050,6 +1152,12 @@ sph_blake256(void *cc, const void *data, size_t len)
 	blake32(cc, data, len);
 }

+void
+sph_blake256_update_le(void *cc, const void *data, size_t len)   // public entry point: forwards unchanged to blake32_le
+{
+   blake32_le(cc, data, len);
+}
+
 /* see sph_blake.h */
 void
 sph_blake256_close(void *cc, void *dst)
--- a/algo/blake/sph_blake.h
+++ b/algo/blake/sph_blake.h
@@ -198,6 +198,7 @@ void sph_blake256_init(void *cc);
 * @param len    the input data length (in bytes)
 */
 void sph_blake256(void *cc, const void *data, size_t len);
+void sph_blake256_update_le(void *cc, const void *data, size_t len);

 /**
 * Terminate the current BLAKE-256 computation and output the result into
--- a/algo/cubehash/cubehash_sse2.h
+++ b/algo/cubehash/cubehash_sse2.h
@@ -15,11 +15,11 @@

 struct _cubehashParam
 {
+    __m128i _ALIGN(64) x[8];  // aligned for __m512i
    int hashlen;           // __m128i
    int rounds;
    int blocksize;         // __m128i
    int pos;	           // number of __m128i read into x from current block
-    __m128i _ALIGN(64) x[8];  // aligned for __m256i
 };

 typedef struct _cubehashParam cubehashParam;
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -13,8 +13,7 @@

 #if defined (ALLIUM_16WAY)  

-typedef struct {
-   blake256_16way_context     blake;
+typedef union {
   keccak256_8way_context    keccak;
   cube_4way_2buf_context    cube;
   skein256_8way_context     skein;
@@ -25,43 +24,31 @@ typedef struct {
 #endif
 } allium_16way_ctx_holder;

-static __thread allium_16way_ctx_holder allium_16way_ctx;
-static __thread __m512i blake256_16way_midstate[16];
-
-bool init_allium_16way_ctx()
-{
-   keccak256_8way_init( &allium_16way_ctx.keccak );
-   skein256_8way_init( &allium_16way_ctx.skein );
-   return true;
-}
-
-void allium_16way_hash( void *state, const void *input )
+static void allium_16way_hash( void *state, const void *midstate_vars, 
+                               const void *midhash, const void *block )
 {
   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
   uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
   uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
-   uint32_t hash0[8] __attribute__ ((aligned (64)));
-   uint32_t hash1[8] __attribute__ ((aligned (64)));
-   uint32_t hash2[8] __attribute__ ((aligned (64)));
-   uint32_t hash3[8] __attribute__ ((aligned (64)));
-   uint32_t hash4[8] __attribute__ ((aligned (64)));
-   uint32_t hash5[8] __attribute__ ((aligned (64)));
-   uint32_t hash6[8] __attribute__ ((aligned (64)));
-   uint32_t hash7[8] __attribute__ ((aligned (64)));
-   uint32_t hash8[8] __attribute__ ((aligned (64)));
-   uint32_t hash9[8] __attribute__ ((aligned (64)));
-   uint32_t hash10[8] __attribute__ ((aligned (64)));
-   uint32_t hash11[8] __attribute__ ((aligned (64)));
-   uint32_t hash12[8] __attribute__ ((aligned (64)));
-   uint32_t hash13[8] __attribute__ ((aligned (64)));
-   uint32_t hash14[8] __attribute__ ((aligned (64)));
-   uint32_t hash15[8] __attribute__ ((aligned (64)));
+   uint32_t hash0[8] __attribute__ ((aligned (32)));
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   uint32_t hash4[8] __attribute__ ((aligned (32)));
+   uint32_t hash5[8] __attribute__ ((aligned (32)));
+   uint32_t hash6[8] __attribute__ ((aligned (32)));
+   uint32_t hash7[8] __attribute__ ((aligned (32)));
+   uint32_t hash8[8] __attribute__ ((aligned (32)));
+   uint32_t hash9[8] __attribute__ ((aligned (32)));
+   uint32_t hash10[8] __attribute__ ((aligned (32)));
+   uint32_t hash11[8] __attribute__ ((aligned (32)));
+   uint32_t hash12[8] __attribute__ ((aligned (32)));
+   uint32_t hash13[8] __attribute__ ((aligned (32)));
+   uint32_t hash14[8] __attribute__ ((aligned (32)));
+   uint32_t hash15[8] __attribute__ ((aligned (32)));
   allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));

-   memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
-   ctx.blake.buf[3] = casti_m512i( input, 19 ); // grab nonce from input
-   blake256_16way_final_rounds_le( vhash, blake256_16way_midstate, ctx.blake.H,
-                                   ctx.blake.buf );    
+   blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );

   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -71,6 +58,7 @@ void allium_16way_hash( void *state, const void *input )
   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
                hash15, 256 );
   
+   keccak256_8way_init( &ctx.keccak );
   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
   keccak256_8way_close( &ctx.keccak, vhashA);
   keccak256_8way_init( &ctx.keccak );
@@ -153,6 +141,7 @@ void allium_16way_hash( void *state, const void *input )
   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
                hash15, 256 );

+   skein256_8way_init( &ctx.skein );
   skein256_8way_update( &ctx.skein, vhashA, 32 );
   skein256_8way_close( &ctx.skein, vhashA );
   skein256_8way_init( &ctx.skein );
@@ -208,41 +197,64 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
+   uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
+   __m512i block0_hash[8] __attribute__ ((aligned (64)));
+   __m512i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) = 
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t last_nonce = max_nonce - 16;
-   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   const __m512i sixteen = m512_const1_32( 16 );

   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   for ( int i = 0; i < 19; i++ )
-      casti_m512i( vdata, i ) = _mm512_set1_epi32( pdata[i] );
-   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
-                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
+   // Prehash first block.
+   blake256_transform_le( phash, pdata, 512, 0 );

-   // Prehash first block
-   blake256_16way_init( &allium_16way_ctx.blake );
-   blake256_16way_update_le( &allium_16way_ctx.blake, vdata, 64 );
-   
-   // Prehash second block, fill buf with last 16 bytes and add padding.
-   memcpy_512( allium_16way_ctx.blake.buf, (__m512i*)vdata + 16, 4 );
-   allium_16way_ctx.blake.buf[ 4] = m512_const1_32( 0x80000000 );
-   memset_zero_512( allium_16way_ctx.blake.buf + 5, 8 );
-   allium_16way_ctx.blake.buf[13] = m512_one_32;
-   allium_16way_ctx.blake.buf[14] = m512_zero;
-   allium_16way_ctx.blake.buf[15] = m512_const1_32( 80*8 );
+   // Interleave hash for second block prehash.
+   block0_hash[0] = _mm512_set1_epi32( phash[0] );
+   block0_hash[1] = _mm512_set1_epi32( phash[1] );
+   block0_hash[2] = _mm512_set1_epi32( phash[2] );
+   block0_hash[3] = _mm512_set1_epi32( phash[3] );
+   block0_hash[4] = _mm512_set1_epi32( phash[4] );
+   block0_hash[5] = _mm512_set1_epi32( phash[5] );
+   block0_hash[6] = _mm512_set1_epi32( phash[6] );
+   block0_hash[7] = _mm512_set1_epi32( phash[7] );

-   blake256_16way_round0_prehash_le( blake256_16way_midstate,
-                      allium_16way_ctx.blake.H, allium_16way_ctx.blake.buf );
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces, add padding.
+   block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
+   block_buf[ 3] =
+             _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
+   block_buf[ 4] = m512_const1_32( 0x80000000 );
+   block_buf[ 5] =
+   block_buf[ 6] = 
+   block_buf[ 7] = 
+   block_buf[ 8] = 
+   block_buf[ 9] = 
+   block_buf[10] = 
+   block_buf[11] = 
+   block_buf[12] = m512_zero;
+   block_buf[13] = m512_one_32;
+   block_buf[14] = m512_zero;
+   block_buf[15] = m512_const1_32( 80*8 );
+
+   // Partially prehash second block without touching nonces in block_buf[3].
+   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-     allium_16way_hash( hash, vdata );
+     allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );

     for ( int lane = 0; lane < 16; lane++ ) 
     if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
@@ -250,7 +262,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
        pdata[19] = n + lane;
        submit_solution( work, hash+(lane<<3), mythr );
     }
-     *noncev = _mm512_add_epi32( *noncev, sixteen );
+     block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen ); 
     n += 16;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
   pdata[19] = n;
@@ -260,8 +272,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,

 #elif defined (ALLIUM_8WAY)  

-typedef struct {
-   blake256_8way_context     blake;
+typedef union {
   keccak256_4way_context    keccak;
   cube_2way_context         cube;
   skein256_4way_context     skein;
@@ -272,20 +283,11 @@ typedef struct {
 #endif
 } allium_8way_ctx_holder;

-static __thread allium_8way_ctx_holder allium_8way_ctx;
-static __thread __m256i blake256_8way_midstate[16];
-
-bool init_allium_8way_ctx()
-{
-   keccak256_4way_init( &allium_8way_ctx.keccak );
-   skein256_4way_init( &allium_8way_ctx.skein );
-   return true;
-}
-
-void allium_8way_hash( void *hash, const void *input )
+static void allium_8way_hash( void *hash, const void *midstate_vars,
+                               const void *midhash, const void *block )
 {
   uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
-   uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
+   uint64_t vhashB[4*8] __attribute__ ((aligned (32)));
   uint64_t *hash0 = (uint64_t*)hash;
   uint64_t *hash1 = (uint64_t*)hash+ 4;
   uint64_t *hash2 = (uint64_t*)hash+ 8;
@@ -296,16 +298,14 @@ void allium_8way_hash( void *hash, const void *input )
   uint64_t *hash7 = (uint64_t*)hash+28;
   allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); 

-   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
-   ctx.blake.buf[3] = casti_m256i( input, 19 ); // grab nonce from input
-   blake256_8way_final_rounds_le( vhashA, blake256_8way_midstate, ctx.blake.H,
-                                   ctx.blake.buf );
+   blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );

   dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhashA, 256 );
   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

+   keccak256_4way_init( &ctx.keccak );
   keccak256_4way_update( &ctx.keccak, vhashA, 32 );
   keccak256_4way_close( &ctx.keccak, vhashA );
   keccak256_4way_init( &ctx.keccak );
@@ -324,7 +324,6 @@ void allium_8way_hash( void *hash, const void *input )
   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );

-
   intrlv_2x128( vhashA, hash0, hash1, 256 );
   intrlv_2x128( vhashB, hash2, hash3, 256 );
   cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
@@ -351,6 +350,7 @@ void allium_8way_hash( void *hash, const void *input )
   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

+   skein256_4way_init( &ctx.skein );
   skein256_4way_update( &ctx.skein, vhashA, 32 );
   skein256_4way_close( &ctx.skein, vhashA );
   skein256_4way_init( &ctx.skein );
@@ -359,8 +359,8 @@ void allium_8way_hash( void *hash, const void *input )

 #if defined(__VAES__)

-   uint64_t vhashC[4*2] __attribute__ ((aligned (64)));
-   uint64_t vhashD[4*2] __attribute__ ((aligned (64)));
+   uint64_t vhashC[4*2] __attribute__ ((aligned (32)));
+   uint64_t vhashD[4*2] __attribute__ ((aligned (32)));
   
   rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 );
   groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
@@ -395,37 +395,60 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint64_t hash[4*8] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
+   __m256i block0_hash[8] __attribute__ ((aligned (64)));
+   __m256i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint64_t *ptarget = (uint64_t*)work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
-   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;  
   const bool bench = opt_benchmark;
-
-   for ( int i = 0; i < 19; i++ )
-      casti_m256i( vdata, i ) = _mm256_set1_epi32( pdata[i] );
-   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
+   const __m256i eight = m256_const1_32( 8 );

   // Prehash first block
-   blake256_8way_init( &allium_8way_ctx.blake );
-   blake256_8way_update_le( &allium_8way_ctx.blake, vdata, 64 );
+   blake256_transform_le( phash, pdata, 512, 0 );

-   // Prehash second block, fill buf with last 16 bytes and add padding.
-   memcpy_256( allium_8way_ctx.blake.buf, (__m256i*)vdata + 16, 4 );
-   allium_8way_ctx.blake.buf[ 4] = m256_const1_32( 0x80000000 );
-   memset_zero_256( allium_8way_ctx.blake.buf + 5, 8 );
-   allium_8way_ctx.blake.buf[13] = m256_one_32;
-   allium_8way_ctx.blake.buf[14] = m256_zero;
-   allium_8way_ctx.blake.buf[15] = m256_const1_32( 80*8 );
+   block0_hash[0] = _mm256_set1_epi32( phash[0] );
+   block0_hash[1] = _mm256_set1_epi32( phash[1] );
+   block0_hash[2] = _mm256_set1_epi32( phash[2] );
+   block0_hash[3] = _mm256_set1_epi32( phash[3] );
+   block0_hash[4] = _mm256_set1_epi32( phash[4] );
+   block0_hash[5] = _mm256_set1_epi32( phash[5] );
+   block0_hash[6] = _mm256_set1_epi32( phash[6] );
+   block0_hash[7] = _mm256_set1_epi32( phash[7] );

-   blake256_8way_round0_prehash_le( blake256_8way_midstate,
-                      allium_8way_ctx.blake.H, allium_8way_ctx.blake.buf );
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces and add padding.
+   block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
+   block_buf[ 3] =
+            _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
+   block_buf[ 4] = m256_const1_32( 0x80000000 );
+   block_buf[ 5] =
+   block_buf[ 6] =
+   block_buf[ 7] =
+   block_buf[ 8] =
+   block_buf[ 9] =
+   block_buf[10] =
+   block_buf[11] =
+   block_buf[12] = m256_zero;
+   block_buf[13] = m256_one_32;
+   block_buf[14] = m256_zero;
+   block_buf[15] = m256_const1_32( 80*8 );
+
+   // Partially prehash second block without touching nonces
+   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-     allium_8way_hash( hash, vdata );
+     allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );

     for ( int lane = 0; lane < 8; lane++ )
     {
@@ -437,7 +460,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
        }
     }
     n += 8;
-     *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
+     block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
   } while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -132,11 +132,11 @@ bool register_lyra2z_algo( algo_gate_t* gate )
 #if defined(LYRA2Z_16WAY)
  gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_16way;
-  gate->hash       = (void*)&lyra2z_16way_hash;
+//  gate->hash       = (void*)&lyra2z_16way_hash;
 #elif defined(LYRA2Z_8WAY)
  gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_8way;
-  gate->hash       = (void*)&lyra2z_8way_hash;
+//  gate->hash       = (void*)&lyra2z_8way_hash;
 #elif defined(LYRA2Z_4WAY)
  gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_4way;
@@ -175,13 +175,9 @@ bool register_lyra2h_algo( algo_gate_t* gate )
 bool register_allium_algo( algo_gate_t* gate )
 {
 #if defined (ALLIUM_16WAY)
-  gate->miner_thread_init = (void*)&init_allium_16way_ctx;
  gate->scanhash  = (void*)&scanhash_allium_16way;
-  gate->hash      = (void*)&allium_16way_hash;
 #elif defined (ALLIUM_8WAY)
-  gate->miner_thread_init = (void*)&init_allium_8way_ctx;
  gate->scanhash  = (void*)&scanhash_allium_8way;
-  gate->hash      = (void*)&allium_8way_hash;
 #else
  gate->miner_thread_init = (void*)&init_allium_ctx;
  gate->scanhash  = (void*)&scanhash_allium;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -99,14 +99,14 @@ bool init_lyra2rev2_ctx();

 #if defined(LYRA2Z_16WAY)

-void lyra2z_16way_hash( void *state, const void *input );
+//void lyra2z_16way_hash( void *state, const void *input );
 int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_16way_thread_init();

 #elif defined(LYRA2Z_8WAY)

-void lyra2z_8way_hash( void *state, const void *input );
+//void lyra2z_8way_hash( void *state, const void *input );
 int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_8way_thread_init();
@@ -163,17 +163,13 @@ bool register_allium_algo( algo_gate_t* gate );

 #if defined(ALLIUM_16WAY)

-void allium_16way_hash( void *state, const void *input );
 int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-bool init_allium_16way_ctx();

 #elif defined(ALLIUM_8WAY)

-void allium_8way_hash( void *state, const void *input );
 int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-bool init_allium_8way_ctx();

 #else

--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -14,53 +14,28 @@ bool lyra2z_16way_thread_init()
 return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
 }

-static __thread blake256_16way_context l2z_16way_blake_ctx;
-static __thread __m512i blake256_16way_midstate[16];
-
-void lyra2z_16way_midstate( const void* input )
-{
-   // First block
-   blake256_16way_init( &l2z_16way_blake_ctx );
-   blake256_16way_update_le( &l2z_16way_blake_ctx, input, 64 );
-
-   // Second block
-   memcpy_512( l2z_16way_blake_ctx.buf, (__m512i*)input + 16, 4 );
-   l2z_16way_blake_ctx.buf[ 4] = m512_const1_32( 0x80000000 );
-   memset_zero_512( l2z_16way_blake_ctx.buf + 5, 8 );
-   l2z_16way_blake_ctx.buf[13] = m512_one_32;
-   l2z_16way_blake_ctx.buf[14] = m512_zero;
-   l2z_16way_blake_ctx.buf[15] = m512_const1_32( 80*8 );
-
-   blake256_16way_round0_prehash_le( blake256_16way_midstate,
-                      l2z_16way_blake_ctx.H, l2z_16way_blake_ctx.buf );
-}
-
-void lyra2z_16way_hash( void *state, const void *input )
+static void lyra2z_16way_hash( void *state, const void *midstate_vars,
+                        const void *midhash, const void *block )
 {
    uint32_t vhash[8*16] __attribute__ ((aligned (128)));
-    uint32_t hash0[8] __attribute__ ((aligned (64)));
-    uint32_t hash1[8] __attribute__ ((aligned (64)));
-    uint32_t hash2[8] __attribute__ ((aligned (64)));
-    uint32_t hash3[8] __attribute__ ((aligned (64)));
-    uint32_t hash4[8] __attribute__ ((aligned (64)));
-    uint32_t hash5[8] __attribute__ ((aligned (64)));
-    uint32_t hash6[8] __attribute__ ((aligned (64)));
-    uint32_t hash7[8] __attribute__ ((aligned (64)));
-    uint32_t hash8[8] __attribute__ ((aligned (64)));
-    uint32_t hash9[8] __attribute__ ((aligned (64)));
-    uint32_t hash10[8] __attribute__ ((aligned (64)));
-    uint32_t hash11[8] __attribute__ ((aligned (64)));
-    uint32_t hash12[8] __attribute__ ((aligned (64)));
-    uint32_t hash13[8] __attribute__ ((aligned (64)));
-    uint32_t hash14[8] __attribute__ ((aligned (64)));
-    uint32_t hash15[8] __attribute__ ((aligned (64)));
-    blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
+    uint32_t hash0[8] __attribute__ ((aligned (32)));
+    uint32_t hash1[8] __attribute__ ((aligned (32)));
+    uint32_t hash2[8] __attribute__ ((aligned (32)));
+    uint32_t hash3[8] __attribute__ ((aligned (32)));
+    uint32_t hash4[8] __attribute__ ((aligned (32)));
+    uint32_t hash5[8] __attribute__ ((aligned (32)));
+    uint32_t hash6[8] __attribute__ ((aligned (32)));
+    uint32_t hash7[8] __attribute__ ((aligned (32)));
+    uint32_t hash8[8] __attribute__ ((aligned (32)));
+    uint32_t hash9[8] __attribute__ ((aligned (32)));
+    uint32_t hash10[8] __attribute__ ((aligned (32)));
+    uint32_t hash11[8] __attribute__ ((aligned (32)));
+    uint32_t hash12[8] __attribute__ ((aligned (32)));
+    uint32_t hash13[8] __attribute__ ((aligned (32)));
+    uint32_t hash14[8] __attribute__ ((aligned (32)));
+    uint32_t hash15[8] __attribute__ ((aligned (32)));

-    memcpy( &ctx_blake, &l2z_16way_blake_ctx, sizeof l2z_16way_blake_ctx );
-
-    ctx_blake.buf[3] = casti_m512i( input, 19 ); // grab nonce from input
-    blake256_16way_final_rounds_le( vhash, blake256_16way_midstate, ctx_blake.H,
-                                    ctx_blake.buf );
+    blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );

    dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
              hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -112,42 +87,74 @@ void lyra2z_16way_hash( void *state, const void *input )
 int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint64_t hash[4*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
+   __m512i block0_hash[8] __attribute__ ((aligned (64)));
+   __m512i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (64))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t last_nonce = max_nonce - 16;
-   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   const __m512i sixteen = m512_const1_32( 16 );

-   if ( bench )   ptarget[7] = 0x0000ff;
+   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   for ( int i = 0; i < 19; i++ )
-      casti_m512i( vdata, i ) = _mm512_set1_epi32( pdata[i] );
-   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0 );
+
+   block0_hash[0] = _mm512_set1_epi32( phash[0] );
+   block0_hash[1] = _mm512_set1_epi32( phash[1] );
+   block0_hash[2] = _mm512_set1_epi32( phash[2] );
+   block0_hash[3] = _mm512_set1_epi32( phash[3] );
+   block0_hash[4] = _mm512_set1_epi32( phash[4] );
+   block0_hash[5] = _mm512_set1_epi32( phash[5] );
+   block0_hash[6] = _mm512_set1_epi32( phash[6] );
+   block0_hash[7] = _mm512_set1_epi32( phash[7] );
+
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces and add padding.
+   block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
+   block_buf[ 3] =
+             _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
-   lyra2z_16way_midstate( vdata );
+   block_buf[ 4] = m512_const1_32( 0x80000000 );
+   block_buf[ 5] =
+   block_buf[ 6] =
+   block_buf[ 7] =
+   block_buf[ 8] =
+   block_buf[ 9] =
+   block_buf[10] =
+   block_buf[11] =
+   block_buf[12] = m512_zero;
+   block_buf[13] = m512_one_32;
+   block_buf[14] = m512_zero;
+   block_buf[15] = m512_const1_32( 80*8 );
+
+   // Partially prehash second block without touching nonces in block_buf[3].
+   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-      lyra2z_16way_hash( hash, vdata );
-
-      for ( int lane = 0; lane < 16; lane++ )
-      {
-        const uint64_t *lane_hash = hash + (lane<<2);
-        if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
-        {
-           pdata[19] = n + lane;
-           submit_solution( work, lane_hash, mythr );
-        }
-      }
-      *noncev = _mm512_add_epi32( *noncev, sixteen );
-      n += 16;
-   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+     lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );

+     for ( int lane = 0; lane < 16; lane++ )
+     if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
+     {
+        pdata[19] = n + lane;
+        submit_solution( work, hash+(lane<<3), mythr );
+     }
+     block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
+     n += 16;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
@@ -162,43 +169,20 @@ bool lyra2z_8way_thread_init()
 return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
 }

-static __thread blake256_8way_context l2z_8way_blake_ctx;
-static __thread __m256i blake256_8way_midstate[16];
-
-void lyra2z_8way_midstate( const void* input )
-{
-   blake256_8way_init( &l2z_8way_blake_ctx );
-   blake256_8way_update_le( &l2z_8way_blake_ctx, input, 64 );
-
-   memcpy_256( l2z_8way_blake_ctx.buf, (__m256i*)input + 16, 4 );
-   l2z_8way_blake_ctx.buf[ 4] = m256_const1_32( 0x80000000 );
-   memset_zero_256( l2z_8way_blake_ctx.buf + 5, 8 );
-   l2z_8way_blake_ctx.buf[13] = m256_one_32;
-   l2z_8way_blake_ctx.buf[14] = m256_zero;
-   l2z_8way_blake_ctx.buf[15] = m256_const1_32( 80*8 );
-
-   blake256_8way_round0_prehash_le( blake256_8way_midstate,
-                      l2z_8way_blake_ctx.H, l2z_8way_blake_ctx.buf );
-}
-
-void lyra2z_8way_hash( void *state, const void *input )
+static void lyra2z_8way_hash( void *state, const void *midstate_vars,
+                       const void *midhash, const void *block )
 {
     uint32_t hash0[8] __attribute__ ((aligned (64)));
-     uint32_t hash1[8] __attribute__ ((aligned (64)));
-     uint32_t hash2[8] __attribute__ ((aligned (64)));
-     uint32_t hash3[8] __attribute__ ((aligned (64)));
-     uint32_t hash4[8] __attribute__ ((aligned (64)));
-     uint32_t hash5[8] __attribute__ ((aligned (64)));
-     uint32_t hash6[8] __attribute__ ((aligned (64)));
-     uint32_t hash7[8] __attribute__ ((aligned (64)));
+     uint32_t hash1[8] __attribute__ ((aligned (32)));
+     uint32_t hash2[8] __attribute__ ((aligned (32)));
+     uint32_t hash3[8] __attribute__ ((aligned (32)));
+     uint32_t hash4[8] __attribute__ ((aligned (32)));
+     uint32_t hash5[8] __attribute__ ((aligned (32)));
+     uint32_t hash6[8] __attribute__ ((aligned (32)));
+     uint32_t hash7[8] __attribute__ ((aligned (32)));
     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
-     blake256_8way_context ctx_blake __attribute__ ((aligned (64)));

-     memcpy( &ctx_blake, &l2z_8way_blake_ctx, sizeof l2z_8way_blake_ctx );
-
-     ctx_blake.buf[3] = casti_m256i( input, 19 ); // grab nonce from input
-     blake256_8way_final_rounds_le( vhash, blake256_8way_midstate, ctx_blake.H,
-                                    ctx_blake.buf );
+     blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );

     dintrlv_8x32( hash0, hash1, hash2, hash3,
                   hash4, hash5, hash6, hash7, vhash, 256 );
@@ -212,7 +196,6 @@ void lyra2z_8way_hash( void *state, const void *input )
     LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
     LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );

-
     memcpy( state,     hash0, 32 );
     memcpy( state+ 32, hash1, 32 );
     memcpy( state+ 64, hash2, 32 );
@@ -227,45 +210,78 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint64_t hash[4*8] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
+   __m256i block0_hash[8] __attribute__ ((aligned (64)));
+   __m256i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+   uint64_t *ptarget = (uint64_t*)work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
-   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
   const __m256i eight = m256_const1_32( 8 );

-   if ( bench )  ptarget[7] = 0x0000ff;
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0 );

-   for ( int i = 0; i < 19; i++ )
-      casti_m256i( vdata, i ) = _mm256_set1_epi32( pdata[i] );
-   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
-   lyra2z_8way_midstate( vdata );
+   block0_hash[0] = _mm256_set1_epi32( phash[0] );
+   block0_hash[1] = _mm256_set1_epi32( phash[1] );
+   block0_hash[2] = _mm256_set1_epi32( phash[2] );
+   block0_hash[3] = _mm256_set1_epi32( phash[3] );
+   block0_hash[4] = _mm256_set1_epi32( phash[4] );
+   block0_hash[5] = _mm256_set1_epi32( phash[5] );
+   block0_hash[6] = _mm256_set1_epi32( phash[6] );
+   block0_hash[7] = _mm256_set1_epi32( phash[7] );
+
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces and add padding.
+   block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
+   block_buf[ 3] =
+            _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
+   block_buf[ 4] = m256_const1_32( 0x80000000 );
+   block_buf[ 5] =
+   block_buf[ 6] =
+   block_buf[ 7] =
+   block_buf[ 8] =
+   block_buf[ 9] =
+   block_buf[10] =
+   block_buf[11] =
+   block_buf[12] = m256_zero;
+   block_buf[13] = m256_one_32;
+   block_buf[14] = m256_zero;
+   block_buf[15] = m256_const1_32( 80*8 );
+
+   // Partially prehash second block without touching nonces
+   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-      lyra2z_8way_hash( hash, vdata );
+     lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );

-      for ( int lane = 0; lane < 8; lane++ )
-      {
+     for ( int lane = 0; lane < 8; lane++ )
+     {
        const uint64_t *lane_hash = hash + (lane<<2);
        if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
        {
           pdata[19] = n + lane;
           submit_solution( work, lane_hash, mythr );
        }
-      }
-      *noncev = _mm256_add_epi32( *noncev, eight );
-      n += 8;
-   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
+     }
+     n += 8;
+     block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
+   } while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }

-
 #elif defined(LYRA2Z_4WAY)


--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -64,14 +64,14 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   uint32_t vhashA[16<<3] __attribute__ ((aligned (64)));
   uint32_t vhashB[16<<3] __attribute__ ((aligned (64)));
   uint32_t vhashC[16<<3] __attribute__ ((aligned (64)));
-   uint32_t hash0 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash1 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash2 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash3 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash4 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash5 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash6 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash7 [16]    __attribute__ ((aligned (64)));
+   uint32_t hash0 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash1 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash2 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash3 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash4 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash5 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash6 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash7 [16]    __attribute__ ((aligned (32)));
   hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
   __mmask8 vh_mask;
   const __m512i vmask = m512_const1_64( 24 );
@@ -639,13 +639,13 @@ typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;

 extern void hmq1725_4way_hash(void *state, const void *input)
 {
-   uint32_t hash0 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash1 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash2 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash3 [16]    __attribute__ ((aligned (64)));
   uint32_t vhash [16<<2] __attribute__ ((aligned (64)));
   uint32_t vhashA[16<<2] __attribute__ ((aligned (64)));
   uint32_t vhashB[16<<2] __attribute__ ((aligned (64)));
+   uint32_t hash0 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash1 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash2 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash3 [16]    __attribute__ ((aligned (32)));
   hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
   __m256i vh_mask;     
   int h_mask;
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -16,7 +16,8 @@

 #if defined (X16R_8WAY)

-// Perform midstate prehash of hash functions with block size <= 72 bytes.
+// Perform midstate prehash of hash functions with block size <= 72 bytes,
+// 76 bytes for hash functions that operate on 32 bit data.

 void x16r_8way_prehash( void *vdata, void *pdata )
 {
@@ -44,18 +45,36 @@ void x16r_8way_prehash( void *vdata, void *pdata )
         skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
      break;
      case LUFFA:
+      {
+         hashState_luffa ctx_luffa;
         mm128_bswap32_80( edata, pdata );
-         intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
-         luffa_4way_init( &x16r_ctx.luffa, 512 );
-         luffa_4way_update( &x16r_ctx.luffa, vdata2, 64 );
-         rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );            
+         init_luffa( &ctx_luffa, 512 );
+         update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
+         intrlv_4x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
+                  ctx_luffa.buffer, ctx_luffa.buffer, ctx_luffa.buffer, 512 );
+         intrlv_4x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
+                  ctx_luffa.chainv, ctx_luffa.chainv, ctx_luffa.chainv, 1280 );
+         x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
+         x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
+      }
      break;
      case CUBEHASH:
+      {
+         cubehashParam ctx_cube;
         mm128_bswap32_80( edata, pdata );
-         intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
-         cube_4way_init( &x16r_ctx.cube, 512, 16, 32 );
-         cube_4way_update( &x16r_ctx.cube, vdata2, 64 );
-         rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );            
+         cubehashInit( &ctx_cube, 512, 16, 32 );
+         cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
+         x16r_ctx.cube.hashlen = ctx_cube.hashlen;
+         x16r_ctx.cube.rounds = ctx_cube.rounds;
+         x16r_ctx.cube.blocksize = ctx_cube.blocksize;
+         x16r_ctx.cube.pos = ctx_cube.pos;
+         intrlv_4x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, ctx_cube.x,
+                                        ctx_cube.x, 1024 );
+      }
      break;
      case HAMSI:
         mm512_bswap32_intrlv80_8x64( vdata, pdata );
@@ -94,14 +113,14 @@ void x16r_8way_prehash( void *vdata, void *pdata )
 int x16r_8way_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*8] __attribute__ ((aligned (128)));
-   uint32_t hash0[20] __attribute__ ((aligned (64)));
-   uint32_t hash1[20] __attribute__ ((aligned (64)));
-   uint32_t hash2[20] __attribute__ ((aligned (64)));
-   uint32_t hash3[20] __attribute__ ((aligned (64)));
-   uint32_t hash4[20] __attribute__ ((aligned (64)));
-   uint32_t hash5[20] __attribute__ ((aligned (64)));
-   uint32_t hash6[20] __attribute__ ((aligned (64)));
-   uint32_t hash7[20] __attribute__ ((aligned (64)));
+   uint32_t hash0[20] __attribute__ ((aligned (16)));
+   uint32_t hash1[20] __attribute__ ((aligned (16)));
+   uint32_t hash2[20] __attribute__ ((aligned (16)));
+   uint32_t hash3[20] __attribute__ ((aligned (16)));
+   uint32_t hash4[20] __attribute__ ((aligned (16)));
+   uint32_t hash5[20] __attribute__ ((aligned (16)));
+   uint32_t hash6[20] __attribute__ ((aligned (16)));
+   uint32_t hash7[20] __attribute__ ((aligned (16)));
   x16r_8way_context_overlay ctx;
   memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
@@ -476,7 +495,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t bedata1[2] __attribute__((aligned(64)));
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -500,7 +519,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
      s_ntime = ntime;

      if ( opt_debug && !thr_id )
-          applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+          applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
   }

   x16r_8way_prehash( vdata, pdata );
@@ -552,18 +571,33 @@ void x16r_4way_prehash( void *vdata, void *pdata )
         skein512_4way_prehash64( &x16r_ctx.skein, vdata );
      break;
      case LUFFA:
+      {
+         hashState_luffa ctx_luffa;
         mm128_bswap32_80( edata, pdata );
-         intrlv_2x128( vdata2, edata, edata, 640 );
-         luffa_2way_init( &x16r_ctx.luffa, 512 );
-         luffa_2way_update( &x16r_ctx.luffa, vdata2, 64 );
-         rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
-         break;
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+         init_luffa( &ctx_luffa, 512 );
+         update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
+         intrlv_2x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
+                                              ctx_luffa.buffer, 512 );
+         intrlv_2x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
+                                              ctx_luffa.chainv, 1280 );
+         x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
+         x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
+      }
+      break;
      case CUBEHASH:
+      {
+         cubehashParam ctx_cube;
         mm128_bswap32_80( edata, pdata );
-         intrlv_2x128( vdata2, edata, edata, 640 );
-         cube_2way_init( &x16r_ctx.cube, 512, 16, 32 );
-         cube_2way_update( &x16r_ctx.cube, vdata2, 64 );
-         rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+         cubehashInit( &ctx_cube, 512, 16, 32 );
+         cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
+         x16r_ctx.cube.hashlen = ctx_cube.hashlen;
+         x16r_ctx.cube.rounds = ctx_cube.rounds;
+         x16r_ctx.cube.blocksize = ctx_cube.blocksize;
+         x16r_ctx.cube.pos = ctx_cube.pos;
+         intrlv_2x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, 1024 );
+      }
      break;
      case HAMSI:
         mm256_bswap32_intrlv80_4x64( vdata, pdata );
@@ -596,10 +630,10 @@ void x16r_4way_prehash( void *vdata, void *pdata )
 int x16r_4way_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*4] __attribute__ ((aligned (128)));
-   uint32_t hash0[20] __attribute__ ((aligned (64)));
-   uint32_t hash1[20] __attribute__ ((aligned (64)));
-   uint32_t hash2[20] __attribute__ ((aligned (64)));
-   uint32_t hash3[20] __attribute__ ((aligned (64)));
+   uint32_t hash0[20] __attribute__ ((aligned (32)));
+   uint32_t hash1[20] __attribute__ ((aligned (32)));
+   uint32_t hash2[20] __attribute__ ((aligned (32)));
+   uint32_t hash3[20] __attribute__ ((aligned (32)));
   x16r_4way_context_overlay ctx;
   memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
@@ -890,7 +924,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t bedata1[2] __attribute__((aligned(64)));
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -913,7 +947,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+         applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
   }

   x16r_4way_prehash( vdata, pdata );
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -30,8 +30,8 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
-      if ( opt_debug && !thr_id )
-          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
+      if ( !thr_id )
+          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
   }

@@ -84,8 +84,8 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
-      if ( opt_debug && !thr_id )
-          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
+      if ( !thr_id )
+          applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
   }

--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -45,14 +45,14 @@ static __thread x16rv2_8way_context_overlay x16rv2_ctx;
 int x16rv2_8way_hash( void* output, const void* input, int thrid )
 {
   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
-   uint32_t hash0[24] __attribute__ ((aligned (64)));
-   uint32_t hash1[24] __attribute__ ((aligned (64)));
-   uint32_t hash2[24] __attribute__ ((aligned (64)));
-   uint32_t hash3[24] __attribute__ ((aligned (64)));
-   uint32_t hash4[24] __attribute__ ((aligned (64)));
-   uint32_t hash5[24] __attribute__ ((aligned (64)));
-   uint32_t hash6[24] __attribute__ ((aligned (64)));
-   uint32_t hash7[24] __attribute__ ((aligned (64)));
+   uint32_t hash0[24] __attribute__ ((aligned (32)));
+   uint32_t hash1[24] __attribute__ ((aligned (32)));
+   uint32_t hash2[24] __attribute__ ((aligned (32)));
+   uint32_t hash3[24] __attribute__ ((aligned (32)));
+   uint32_t hash4[24] __attribute__ ((aligned (32)));
+   uint32_t hash5[24] __attribute__ ((aligned (32)));
+   uint32_t hash6[24] __attribute__ ((aligned (32)));
+   uint32_t hash7[24] __attribute__ ((aligned (32)));
   x16rv2_8way_context_overlay ctx;
   memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
@@ -706,11 +706,11 @@ inline void padtiger512( uint32_t* hash )

 int x16rv2_4way_hash( void* output, const void* input, int thrid )
 {
-   uint32_t hash0[20] __attribute__ ((aligned (64)));
-   uint32_t hash1[20] __attribute__ ((aligned (64)));
-   uint32_t hash2[20] __attribute__ ((aligned (64)));
-   uint32_t hash3[20] __attribute__ ((aligned (64)));
   uint32_t vhash[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash0[20] __attribute__ ((aligned (32)));
+   uint32_t hash1[20] __attribute__ ((aligned (32)));
+   uint32_t hash2[20] __attribute__ ((aligned (32)));
+   uint32_t hash3[20] __attribute__ ((aligned (32)));
   x16rv2_4way_context_overlay ctx;
   memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
@@ -1054,8 +1054,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
   uint32_t hash[4*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   uint32_t bedata1[2] __attribute__((aligned(64)));
+   uint32_t edata[20];
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -1068,7 +1068,6 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,

   if ( bench )  ptarget[7] = 0x0fff;
   
-
   bedata1[0] = bswap_32( pdata[1] );
   bedata1[1] = bswap_32( pdata[2] );

--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
@@ -63,14 +63,14 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
-     uint64_t hash0[8] __attribute__ ((aligned (64)));
-     uint64_t hash1[8] __attribute__ ((aligned (64)));
-     uint64_t hash2[8] __attribute__ ((aligned (64)));
-     uint64_t hash3[8] __attribute__ ((aligned (64)));
-     uint64_t hash4[8] __attribute__ ((aligned (64)));
-     uint64_t hash5[8] __attribute__ ((aligned (64)));
-     uint64_t hash6[8] __attribute__ ((aligned (64)));
-     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (32)));
+     uint64_t hash1[8] __attribute__ ((aligned (32)));
+     uint64_t hash2[8] __attribute__ ((aligned (32)));
+     uint64_t hash3[8] __attribute__ ((aligned (32)));
+     uint64_t hash4[8] __attribute__ ((aligned (32)));
+     uint64_t hash5[8] __attribute__ ((aligned (32)));
+     uint64_t hash6[8] __attribute__ ((aligned (32)));
+     uint64_t hash7[8] __attribute__ ((aligned (32)));
     sonoa_8way_context_overlay ctx;

 // 1
@@ -1150,13 +1150,13 @@ typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;

 int sonoa_4way_hash( void *state, const void *input, int thr_id )
 {
-     uint64_t hash0[8] __attribute__ ((aligned (64)));
-     uint64_t hash1[8] __attribute__ ((aligned (64)));
-     uint64_t hash2[8] __attribute__ ((aligned (64)));
-     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (32)));
+     uint64_t hash1[8] __attribute__ ((aligned (32)));
+     uint64_t hash2[8] __attribute__ ((aligned (32)));
+     uint64_t hash3[8] __attribute__ ((aligned (32)));
     sonoa_4way_context_overlay ctx;

 // 1
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -66,14 +66,14 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
-     uint64_t hash0[8] __attribute__ ((aligned (64)));
-     uint64_t hash1[8] __attribute__ ((aligned (64)));
-     uint64_t hash2[8] __attribute__ ((aligned (64)));
-     uint64_t hash3[8] __attribute__ ((aligned (64)));
-     uint64_t hash4[8] __attribute__ ((aligned (64)));
-     uint64_t hash5[8] __attribute__ ((aligned (64)));
-     uint64_t hash6[8] __attribute__ ((aligned (64)));
-     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (32)));
+     uint64_t hash1[8] __attribute__ ((aligned (32)));
+     uint64_t hash2[8] __attribute__ ((aligned (32)));
+     uint64_t hash3[8] __attribute__ ((aligned (32)));
+     uint64_t hash4[8] __attribute__ ((aligned (32)));
+     uint64_t hash5[8] __attribute__ ((aligned (32)));
+     uint64_t hash6[8] __attribute__ ((aligned (32)));
+     uint64_t hash7[8] __attribute__ ((aligned (32)));
     x17_8way_context_overlay ctx;

     blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
@@ -327,10 +327,10 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
-     uint64_t hash0[8] __attribute__ ((aligned (64)));
-     uint64_t hash1[8] __attribute__ ((aligned (64)));
-     uint64_t hash2[8] __attribute__ ((aligned (64)));
-     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (32)));
+     uint64_t hash1[8] __attribute__ ((aligned (32)));
+     uint64_t hash2[8] __attribute__ ((aligned (32)));
+     uint64_t hash3[8] __attribute__ ((aligned (32)));
     x17_4way_context_overlay ctx;

     blake512_4way_full( &ctx.blake, vhash, input, 80 );
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -62,14 +62,14 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
     uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
     uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
     uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
-     uint64_t hash0[16] __attribute__ ((aligned (64)));
-     uint64_t hash1[16] __attribute__ ((aligned (64)));
-     uint64_t hash2[16] __attribute__ ((aligned (64)));
-     uint64_t hash3[16] __attribute__ ((aligned (64)));
-     uint64_t hash4[16] __attribute__ ((aligned (64)));
-     uint64_t hash5[16] __attribute__ ((aligned (64)));
-     uint64_t hash6[16] __attribute__ ((aligned (64)));
-     uint64_t hash7[16] __attribute__ ((aligned (64)));
+     uint64_t hash0[16] __attribute__ ((aligned (32)));
+     uint64_t hash1[16] __attribute__ ((aligned (32)));
+     uint64_t hash2[16] __attribute__ ((aligned (32)));
+     uint64_t hash3[16] __attribute__ ((aligned (32)));
+     uint64_t hash4[16] __attribute__ ((aligned (32)));
+     uint64_t hash5[16] __attribute__ ((aligned (32)));
+     uint64_t hash6[16] __attribute__ ((aligned (32)));
+     uint64_t hash7[16] __attribute__ ((aligned (32)));
     const int dataLen = 128;
     xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));

@@ -430,13 +430,13 @@ typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;

 int xevan_4way_hash( void *output, const void *input, int thr_id )
 {
-     uint64_t hash0[16] __attribute__ ((aligned (64)));
-     uint64_t hash1[16] __attribute__ ((aligned (64)));
-     uint64_t hash2[16] __attribute__ ((aligned (64)));
-     uint64_t hash3[16] __attribute__ ((aligned (64)));
     uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
     uint64_t vhashA[16<<2] __attribute__ ((aligned (64)));
     uint64_t vhashB[16<<2] __attribute__ ((aligned (64)));
+     uint64_t hash0[16] __attribute__ ((aligned (32)));
+     uint64_t hash1[16] __attribute__ ((aligned (32)));
+     uint64_t hash2[16] __attribute__ ((aligned (32)));
+     uint64_t hash3[16] __attribute__ ((aligned (32)));
     const int dataLen = 128;
     xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));

--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.8.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.9.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.19.8'
-PACKAGE_STRING='cpuminer-opt 3.19.8'
+PACKAGE_VERSION='3.19.9'
+PACKAGE_STRING='cpuminer-opt 3.19.9'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.19.8 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.19.9 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.19.8:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.19.9:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.19.8
+cpuminer-opt configure 3.19.9
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.19.8, which was
+It was created by cpuminer-opt $as_me 3.19.9, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.19.8'
+ VERSION='3.19.9'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.19.8, which was
+This file was extended by cpuminer-opt $as_me 3.19.9, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.19.8
+cpuminer-opt config.status 3.19.9
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.19.8])
+AC_INIT([cpuminer-opt], [3.19.9])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/miner.h
+++ b/miner.h
@@ -812,7 +812,7 @@ Options:\n\
                          lyra2z330     Lyra2 330 rows\n\
                          m7m           Magi (XMG)\n\
                          myr-gr        Myriad-Groestl\n\
-                          minotaur      Ringcoin (RNG)\n\
+                          minotaur\n\
                          neoscrypt     NeoScrypt(128, 2, 1)\n\
                          nist5         Nist5\n\
                          pentablake    5 x blake512\n\
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -2654,6 +2654,10 @@ static inline void intrlv_2x128( void *dst, const void *src0,
   d[10] = s0[5];   d[11] = s1[5];
   d[12] = s0[6];   d[13] = s1[6];
   d[14] = s0[7];   d[15] = s1[7];
+   if ( bit_len <= 1024 ) return;
+   d[16] = s0[8];   d[17] = s1[8];
+   d[18] = s0[9];   d[19] = s1[9];
+   //   if ( bit_len <= 1280 ) return;
 }

 static inline void intrlv_2x128_512( void *dst, const void *src0,
@@ -2721,6 +2725,10 @@ static inline void intrlv_4x128( void *dst, const void *src0,
   d[20] = s0[5];    d[21] = s1[5];    d[22] = s2[5];    d[23] = s3[5];
   d[24] = s0[6];    d[25] = s1[6];    d[26] = s2[6];    d[27] = s3[6];
   d[28] = s0[7];    d[29] = s1[7];    d[30] = s2[7];    d[31] = s3[7];
+   if ( bit_len <= 1024 ) return;
+   d[32] = s0[8];    d[33] = s1[8];    d[34] = s2[8];    d[35] = s3[8];
+   d[36] = s0[9];    d[37] = s1[9];    d[38] = s2[9];    d[39] = s3[9];
+   // if ( bit_len <= 1280 ) return;
 }

 static inline void intrlv_4x128_512( void *dst, const void *src0,
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -411,7 +411,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_rol_16( v, c ) \
   _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )

-// Limited 2 input shuffle
+// Limited 2 input shuffle, combines shuffle with blend. The destination low
+// half is always taken from src a, and the high half from src b.
 #define mm128_shuffle2_64( a, b, c ) \
   _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
                                     _mm_castsi128_pd( b ), c ) ); 
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -442,8 +442,14 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 #define mm256_shuflr64_32 mm256_swap64_32
 #define mm256_shufll64_32 mm256_swap64_32

-//
-// Swap bytes in vector elements, endian bswap.
+// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
+// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
+// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
+// needed for a shuffle that crosses 128 bit lanes. BSWAP doesn't cross lanes,
+// therefore the AVX2 version will work here. The bswap control vector is
+// coded to work with both versions, bit 4 is ignored in AVX2.
+
+// Reverse byte order in elements, endian bswap.
 #define mm256_bswap_64( v ) \
   _mm256_shuffle_epi8( v, \
         m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -318,6 +318,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 // AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
 // elements and can be called directly. But they only accept immediate 8
 // for control arg. 
+// The workaround is a fraud, just a fluke of the compiler's optimizer.
+// It fails without -O3. The compiler seems to unroll shift loops, eliminating
+// the variable control, better than rotate loops. 
 //
 // _mm512_rol_epi64,  _mm512_ror_epi64,  _mm512_rol_epi32,  _mm512_ror_epi32
 // _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
@@ -430,21 +433,9 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
  casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
 } while(0)

-//
-// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
-//

-// rename plan change ror to vror for Vector ROtate Right,
-// and vrol for Vector ROtate Left, not to be confused with
-//variable rotate rorv, rolv,
-// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate
-// operation. 1xNN notaion ia also removed and replaced with simpler NN.
-// Swap will still have its own mnemonic and will be aliased as both
-// left and right shuffles.
-
-// Shift elements right or left in 512 bit vector, filling with zeros.
-// Multiple element shifts can be combined into a single larger
-// element shift.
+// Cross-lane shuffles implementing rotate & shift of elements within a vector.
+//

 #define mm512_shiftr_256( v ) \
  _mm512_alignr_epi64( _mm512_setzero, v, 4 )
@@ -530,7 +521,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 // 128 bit lane shift is handled by bslli bsrli.

 // Swap hi & lo 128 bits in each 256 bit lane
-#define mm512_swap256_128( v )   _mm512_permutex_epi64( v, 0x4e )
+#define mm512_swap256_128( v )      _mm512_permutex_epi64( v, 0x4e )
 #define mm512_shuflr256_128 mm512_swap256_128
 #define mm512_shufll256_128 mm512_swap256_128

@@ -584,7 +575,9 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 //
 // Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
 
-// Limited 2 input, 1 output shuffle within 128 bit lanes.
+// Limited 2 input, 1 output shuffle, combines shuffle with blend.
+// Like most shuffles it's limited to 128 bit lanes and like some shuffles
+// destination elements must come from a specific source. 
 #define mm512_shuffle2_64( a, b, c ) \
   _mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
                                           _mm512_castsi512_pd( b ), c ) ); 
@@ -621,11 +614,7 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 // Drop macros? They can easilly be rebuilt using shufl2 functions

 // 2 input, 1 output
-// Shuffle concatenated { v1, v2 ) right or left by 256 bits and return
-// rotated v1 
-// visually confusing for shif2r because of arg order. First arg is always
-// the target for modification, either update by reference or by function
-// return.
+// Rotate concatenated { v1, v2 } right or left and return v1.
 #define mm512_shufl2r_256( v1, v2 )    _mm512_alignr_epi64( v2, v1, 4 )
 #define mm512_shufl2l_256( v1, v2 )    _mm512_alignr_epi64( v1, v2, 4 )

--- a/sysinfos.c
+++ b/sysinfos.c
@@ -502,6 +502,28 @@ static inline bool has_vaes()
 #endif
 }

+static inline bool has_vbmi()
+{
+#ifdef __arm__
+    return false;
+#else
+    int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, cpu_info );
+    return cpu_info[ ECX_Reg ] & AVX512VBMI_Flag;
+#endif
+}
+
+static inline bool has_vbmi2()
+{
+#ifdef __arm__
+    return false;
+#else
+    int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, cpu_info );
+    return cpu_info[ ECX_Reg ] & AVX512VBMI2_Flag;
+#endif
+}
+
 // AMD only
 static inline bool has_xop()
 {