Compare commits

...

4 Commits

Author     SHA1        Message  Date
Jay D Dee  1321ac474c  v3.20.1  2022-07-26 18:36:40 -04:00
Jay D Dee  40d07c0097  v3.20.0  2022-07-17 13:30:50 -04:00
Jay D Dee  f552f2b1e8  v3.19.9  2022-07-10 11:04:00 -04:00
Jay D Dee  26b8927632  v3.19.8  2022-05-27 18:12:30 -04:00
50 changed files with 2928 additions and 2961 deletions

View File

@@ -289,7 +289,7 @@ cpuminer_SOURCES = \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/blake2b-yp.c \
algo/yespower/crypto/hmac-blake2b.c \
algo/yespower/yescrypt-r8g.c \
algo/yespower/yespower-opt.c

View File

@@ -22,7 +22,7 @@ required.
Compile Instructions
--------------------
See INSTALL_LINUX or INSTALL_WINDOWS for compile instruuctions
See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
@@ -65,6 +65,36 @@ If not what makes it happen or not happen?
Change Log
----------
v3.20.1
sph_blake2b optimized 1-way SSSE3 & AVX2.
Removed duplicate Blake2b used by Power2b algo, will now use optimized sph_blake2b.
Removed imprecise hash & target display from rejected share log.
Share and target difficulty are now displayed only for low difficulty shares.
Updated configure.ac to check for AVX512 asm support.
Small optimization to Lyra2 SSE2.
v3.20.0
#375 Fixed segfault in algos using Groestl VAES due to use of uninitialized data.
v3.19.9
More Blake256, Blake512, Luffa & Cubehash prehash optimizations.
Relaxed some excessively strict data alignment that was negatively affecting performance.
v3.19.8
#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
url protocol specifier for requesting a secure stratum connection.
The full url, including the protocol, is now displayed in the stratum connect
log and the periodic summary log.
Small optimizations to Cubehash, AVX2 & AVX512.
Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
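For example, either of the following requests a secure stratum connection
(pool host, port and wallet are placeholders):

    cpuminer -a x11 -o stratum+tcps://pool.example.com:443 -u WALLET -p x
    cpuminer -a x11 -o stratum+ssl://pool.example.com:443 -u WALLET -p x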
v3.19.7
#369 Fixed time limited mining, --time-limit.

View File

@@ -49,6 +49,20 @@ extern "C"{
#define SPH_SIZE_blake512 512
/////////////////////////
//
// Blake-256 1 way SSE2
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1 );
/////////////////////////
//
// Blake-512 1 way SSE2
void blake512_transform_le( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 );
//////////////////////////
//
// Blake-256 4 way SSE2
@@ -98,6 +112,12 @@ typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
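// Split hash of the second block: round0_prehash_le is nonce independent
// and runs once per job, final_rounds_le runs once per nonce using the
// precalculated midstate.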
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
@@ -128,6 +148,12 @@ void blake512_4way_update( void *cc, const void *data, size_t len );
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data );
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -148,6 +174,14 @@ typedef blake_16way_small_context blake256_16way_context;
void blake256_16way_init(void *cc);
void blake256_16way_update(void *cc, const void *data, size_t len);
void blake256_16way_close(void *cc, void *dst);
// Expects data in little endian order, no byte swap needed
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
// 14 rounds, blake, decred
typedef blake_16way_small_context blake256r14_16way_context;
@@ -180,7 +214,12 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_hash_le80( void *hash, const void *data );
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data );
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate );
#endif // AVX512
#endif // AVX2

View File

@@ -5,6 +5,7 @@
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
* 2016-2022 JayDDee246@gmail.com
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@@ -304,6 +305,98 @@ static const sph_u32 CS[16] = {
#endif
/////////////////////////////////////////
//
// Blake-256 1 way SIMD
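// One round of Blake-256 on a 4x4 state held in four vectors, one row each.
// The first half computes G0-G3 down the columns in parallel; the row
// rotations (shufll/swap/shuflr) then align the diagonals as columns so the
// second half can compute G4-G7, and the final rotations restore row order.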
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
CSx( r, 5 ) ^ Mx( r, 4 ), \
CSx( r, 3 ) ^ Mx( r, 2 ), \
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
CSx( r, 4 ) ^ Mx( r, 5 ), \
CSx( r, 2 ) ^ Mx( r, 3 ), \
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V3 = mm128_shufll_32( V3 ); \
V2 = mm128_swap_64( V2 ); \
V1 = mm128_shuflr_32( V1 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, F ) ^ Mx( r, E ), \
CSx( r, D ) ^ Mx( r, C ), \
CSx( r, B ) ^ Mx( r, A ), \
CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, E ) ^ Mx( r, F ), \
CSx( r, C ) ^ Mx( r, D ), \
CSx( r, A ) ^ Mx( r, B ), \
CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V3 = mm128_shuflr_32( V3 ); \
V2 = mm128_swap_64( V2 ); \
V1 = mm128_shufll_32( V1 ); \
}
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1 )
{
__m128i V0, V1, V2, V3;
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V0 = casti_m128i( H, 0 );
V1 = casti_m128i( H, 1 );
V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );
M0 = buf[ 0];
M1 = buf[ 1];
M2 = buf[ 2];
M3 = buf[ 3];
M4 = buf[ 4];
M5 = buf[ 5];
M6 = buf[ 6];
M7 = buf[ 7];
M8 = buf[ 8];
M9 = buf[ 9];
MA = buf[10];
MB = buf[11];
MC = buf[12];
MD = buf[13];
ME = buf[14];
MF = buf[15];
BLAKE256_ROUND( 0 );
BLAKE256_ROUND( 1 );
BLAKE256_ROUND( 2 );
BLAKE256_ROUND( 3 );
BLAKE256_ROUND( 4 );
BLAKE256_ROUND( 5 );
BLAKE256_ROUND( 6 );
BLAKE256_ROUND( 7 );
BLAKE256_ROUND( 8 );
BLAKE256_ROUND( 9 );
BLAKE256_ROUND( 0 );
BLAKE256_ROUND( 1 );
BLAKE256_ROUND( 2 );
BLAKE256_ROUND( 3 );
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
}
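// Usage sketch (hypothetical, not part of this diff): compress one 64 byte
// message block, already in little endian order, with T = 512 bits counted:
//
//   uint32_t H[8];     // preloaded with the Blake-256 initial values
//   uint32_t buf[16];  // the message block
//   blake256_transform_le( H, buf, 512, 0 );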
////////////////////////////////////////////
//
// Blake-256 4 way
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
@@ -508,14 +601,10 @@ do { \
V9 = m128_const1_64( 0x85A308D385A308D3 ); \
VA = m128_const1_64( 0x13198A2E13198A2E ); \
VB = m128_const1_64( 0x0370734403707344 ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
m128_const1_64( 0xA4093822A4093822 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
m128_const1_64( 0x299F31D0299F31D0 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \
m128_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \
m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
VC = _mm_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm_set1_epi32( T1 ^ 0xEC4E6C89 ); \
BLAKE256_4WAY_BLOCK_BSWAP32; \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
@@ -548,6 +637,8 @@ do { \
#if defined (__AVX2__)
/////////////////////////////////
//
// Blake-256 8 way
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
@@ -626,14 +717,10 @@ do { \
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
VA = m256_const1_64( 0x13198A2E13198A2E ); \
VB = m256_const1_64( 0x0370734403707344 ); \
VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
m256_const1_64( 0xA4093822A4093822 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
m256_const1_64( 0x299F31D0299F31D0 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
m256_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
@@ -679,13 +766,247 @@ do { \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
#define COMPRESS32_8WAY_LE( rounds ) \
do { \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
VA = m256_const1_64( 0x13198A2E13198A2E ); \
VB = m256_const1_64( 0x0370734403707344 ); \
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
M0 = buf[ 0]; \
M1 = buf[ 1]; \
M2 = buf[ 2]; \
M3 = buf[ 3]; \
M4 = buf[ 4]; \
M5 = buf[ 5]; \
M6 = buf[ 6]; \
M7 = buf[ 7]; \
M8 = buf[ 8]; \
M9 = buf[ 9]; \
MA = buf[10]; \
MB = buf[11]; \
MC = buf[12]; \
MD = buf[13]; \
ME = buf[14]; \
MF = buf[15]; \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
ROUND_S_8WAY(3); \
ROUND_S_8WAY(4); \
ROUND_S_8WAY(5); \
ROUND_S_8WAY(6); \
ROUND_S_8WAY(7); \
if (rounds == 14) \
{ \
ROUND_S_8WAY(8); \
ROUND_S_8WAY(9); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
ROUND_S_8WAY(3); \
} \
H0 = mm256_xor3( V8, V0, H0 ); \
H1 = mm256_xor3( V9, V1, H1 ); \
H2 = mm256_xor3( VA, V2, H2 ); \
H3 = mm256_xor3( VB, V3, H3 ); \
H4 = mm256_xor3( VC, V4, H4 ); \
H5 = mm256_xor3( VD, V5, H5 ); \
H6 = mm256_xor3( VE, V6, H6 ); \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
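// As with the 16-way version below, the Blake-256 prehash of the second
// block is split into 2 parts. The first part is constant for every nonce
// and only needs to be run once per job. The second part is run for each
// nonce using the precalculated midstate and the hash from the first block.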
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
{
const __m256i *M = (const __m256i*)data;
__m256i *V = (__m256i*)midstate;
const __m256i *H = (const __m256i*)midhash;
V[ 0] = H[0];
V[ 1] = H[1];
V[ 2] = H[2];
V[ 3] = H[3];
V[ 4] = H[4];
V[ 5] = H[5];
V[ 6] = H[6];
V[ 7] = H[7];
V[ 8] = m256_const1_32( CS0 );
V[ 9] = m256_const1_32( CS1 );
V[10] = m256_const1_32( CS2 );
V[11] = m256_const1_32( CS3 );
V[12] = m256_const1_32( CS4 ^ 0x280 );
V[13] = m256_const1_32( CS5 ^ 0x280 );
V[14] = m256_const1_32( CS6 );
V[15] = m256_const1_32( CS7 );
// G0
GS_8WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
// G1
V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ),
_mm256_xor_si256( _mm256_set1_epi32( CS3 ), M[ 2] ) );
V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 );
V[ 9] = _mm256_add_epi32( V[ 9], V[13] );
V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G4
V[ 0] = _mm256_add_epi32( V[ 0],
_mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) );
// G6
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 7] ),
_mm256_xor_si256( _mm256_set1_epi32( CSD ), M[12] ) );
// G7
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
_mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 );
V[ 3] = _mm256_add_epi32( V[ 3],
_mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
}
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data )
{
__m256i *H = (__m256i*)final_hash;
const __m256i *h = (const __m256i*)midhash;
const __m256i *v= (const __m256i*)midstate;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
M0 = casti_m256i( data, 0 );
M1 = casti_m256i( data, 1 );
M2 = casti_m256i( data, 2 );
M3 = casti_m256i( data, 3 );
M4 = casti_m256i( data, 4 );
M5 = casti_m256i( data, 5 );
M6 = casti_m256i( data, 6 );
M7 = casti_m256i( data, 7 );
M8 = casti_m256i( data, 8 );
M9 = casti_m256i( data, 9 );
MA = casti_m256i( data, 10 );
MB = casti_m256i( data, 11 );
MC = casti_m256i( data, 12 );
MD = casti_m256i( data, 13 );
ME = casti_m256i( data, 14 );
MF = casti_m256i( data, 15 );
// Finish round 0
// G1
V1 = _mm256_add_epi32( V1,
_mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 );
V9 = _mm256_add_epi32( V9, VD );
V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 );
// G4
V0 = _mm256_add_epi32( V0, V5 );
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 );
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
_mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 );
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
// G5
GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// G6
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
_mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
// G7
V9 = _mm256_add_epi32( V9, VE );
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 );
V3 = _mm256_add_epi32( V3, V4 );
VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 );
V9 = _mm256_add_epi32( V9, VE );
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
ROUND_S_8WAY( 4 );
ROUND_S_8WAY( 5 );
ROUND_S_8WAY( 6 );
ROUND_S_8WAY( 7 );
ROUND_S_8WAY( 8 );
ROUND_S_8WAY( 9 );
ROUND_S_8WAY( 0 );
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
const __m256i shuf_bswap32 =
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
///////////////////////////////////////
//
// Blake-256 16 way AVX512
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
@@ -763,14 +1084,10 @@ do { \
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
VA = m512_const1_64( 0x13198A2E13198A2E ); \
VB = m512_const1_64( 0x0370734403707344 ); \
VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
m512_const1_64( 0xA4093822A4093822 ) ); \
VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
m512_const1_64( 0x299F31D0299F31D0 ) ); \
VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
m512_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
@@ -818,6 +1135,264 @@ do { \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
#define COMPRESS32_16WAY_LE( rounds ) \
do { \
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
VA = m512_const1_64( 0x13198A2E13198A2E ); \
VB = m512_const1_64( 0x0370734403707344 ); \
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
M0 = buf[ 0]; \
M1 = buf[ 1]; \
M2 = buf[ 2]; \
M3 = buf[ 3]; \
M4 = buf[ 4]; \
M5 = buf[ 5]; \
M6 = buf[ 6]; \
M7 = buf[ 7]; \
M8 = buf[ 8]; \
M9 = buf[ 9]; \
MA = buf[10]; \
MB = buf[11]; \
MC = buf[12]; \
MD = buf[13]; \
ME = buf[14]; \
MF = buf[15]; \
ROUND_S_16WAY(0); \
ROUND_S_16WAY(1); \
ROUND_S_16WAY(2); \
ROUND_S_16WAY(3); \
ROUND_S_16WAY(4); \
ROUND_S_16WAY(5); \
ROUND_S_16WAY(6); \
ROUND_S_16WAY(7); \
if (rounds == 14) \
{ \
ROUND_S_16WAY(8); \
ROUND_S_16WAY(9); \
ROUND_S_16WAY(0); \
ROUND_S_16WAY(1); \
ROUND_S_16WAY(2); \
ROUND_S_16WAY(3); \
} \
H0 = mm512_xor3( V8, V0, H0 ); \
H1 = mm512_xor3( V9, V1, H1 ); \
H2 = mm512_xor3( VA, V2, H2 ); \
H3 = mm512_xor3( VB, V3, H3 ); \
H4 = mm512_xor3( VC, V4, H4 ); \
H5 = mm512_xor3( VD, V5, H5 ); \
H6 = mm512_xor3( VE, V6, H6 ); \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
// Blake-256 prehash of the second block is split onto 2 parts. The first part
// is constant for every nonce and only needs to be run once per job. The
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
{
const __m512i *M = (const __m512i*)data;
__m512i *V = (__m512i*)midstate;
const __m512i *H = (const __m512i*)midhash;
V[ 0] = H[0];
V[ 1] = H[1];
V[ 2] = H[2];
V[ 3] = H[3];
V[ 4] = H[4];
V[ 5] = H[5];
V[ 6] = H[6];
V[ 7] = H[7];
V[ 8] = m512_const1_32( CS0 );
V[ 9] = m512_const1_32( CS1 );
V[10] = m512_const1_32( CS2 );
V[11] = m512_const1_32( CS3 );
V[12] = m512_const1_32( CS4 ^ 0x280 );
V[13] = m512_const1_32( CS5 ^ 0x280 );
V[14] = m512_const1_32( CS6 );
V[15] = m512_const1_32( CS7 );
// G0
GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
// G1, nonce is in M[3]
// GS_16WAY( M[ 2], M[ 3], CS2, CS3, V1, V5, V9, VD );
V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ),
_mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) );
V[13] = mm512_ror_32( _mm512_xor_si512( V[13], V[ 1] ), 16 );
V[ 9] = _mm512_add_epi32( V[ 9], V[13] );
V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G4
// GS_16WAY( M[ 8], M[ 9], CS8, CS9, V0, V5, VA, VF );
V[ 0] = _mm512_add_epi32( V[ 0],
_mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) );
// G5
// GS_16WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
// G6
// GS_16WAY( M[12], M[13], CSC, CSD, V2, V7, V8, VD );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 7] ),
_mm512_xor_si512( _mm512_set1_epi32( CSD ), M[12] ) );
// G7
// GS_16WAY( M[14], M[15], CSE, CSF, V3, V4, V9, VE );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 4] ),
_mm512_xor_si512( _mm512_set1_epi32( CSF ), M[14] ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 3] ), 16 );
V[ 3] = _mm512_add_epi32( V[ 3],
_mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
}
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data )
{
__m512i *H = (__m512i*)final_hash;
const __m512i *h = (const __m512i*)midhash;
const __m512i *v= (const __m512i*)midstate;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
M0 = casti_m512i( data, 0 );
M1 = casti_m512i( data, 1 );
M2 = casti_m512i( data, 2 );
M3 = casti_m512i( data, 3 );
M4 = casti_m512i( data, 4 );
M5 = casti_m512i( data, 5 );
M6 = casti_m512i( data, 6 );
M7 = casti_m512i( data, 7 );
M8 = casti_m512i( data, 8 );
M9 = casti_m512i( data, 9 );
MA = casti_m512i( data, 10 );
MB = casti_m512i( data, 11 );
MC = casti_m512i( data, 12 );
MD = casti_m512i( data, 13 );
ME = casti_m512i( data, 14 );
MF = casti_m512i( data, 15 );
// Finish round 0 with the nonce (M3) now available
// G0
// GS_16WAY( M0, M1, CS0, CS1, V0, V4, V8, VC );
// G1
// GS_16WAY( M2, M3, CS2, CS3, V1, V5, V9, VD );
V1 = _mm512_add_epi32( V1,
_mm512_xor_si512( _mm512_set1_epi32( CS2 ), M3 ) );
VD = mm512_ror_32( _mm512_xor_si512( VD, V1 ), 8 );
V9 = _mm512_add_epi32( V9, VD );
V5 = mm512_ror_32( _mm512_xor_si512( V5, V9 ), 7 );
// G2,G3
// GS_16WAY( M4, M5, CS4, CS5, V2, V6, VA, VE );
// GS_16WAY( M6, M7, CS6, CS7, V3, V7, VB, VF );
// G4
// GS_16WAY( M8, M9, CS8, CS9, V0, V5, VA, VF );
V0 = _mm512_add_epi32( V0, V5 );
VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 16 );
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 12 );
V0 = _mm512_add_epi32( V0, _mm512_add_epi32( V5,
_mm512_xor_si512( _mm512_set1_epi32( CS8 ), M9 ) ) );
VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 8 );
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 );
// G5
GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// G6
// GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 16 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 12 );
V2 = _mm512_add_epi32( _mm512_add_epi32( V2, V7 ),
_mm512_xor_si512( _mm512_set1_epi32( CSC ), MD ) );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 8 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 7 );
// G7
// GS_16WAY( ME, MF, CSE, CSF, V3, V4, V9, VE );
V9 = _mm512_add_epi32( V9, VE );
V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 12 );
V3 = _mm512_add_epi32( V3, V4 );
VE = mm512_ror_32( _mm512_xor_si512( VE, V3 ), 8 );
V9 = _mm512_add_epi32( V9, VE );
V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
ROUND_S_16WAY( 4 );
ROUND_S_16WAY( 5 );
ROUND_S_16WAY( 6 );
ROUND_S_16WAY( 7 );
ROUND_S_16WAY( 8 );
ROUND_S_16WAY( 9 );
ROUND_S_16WAY( 0 );
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
// Byte swap final hash
const __m512i shuf_bswap32 =
m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233,
0x2c2d2e2f28292a2b, 0x2425262720212223,
0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
}
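// Usage sketch (hypothetical miner loop, not part of this diff): vdata holds
// the 16-lane interleaved second block of an 80 byte header, little endian,
// with the 16 nonces in element 3 (see "nonce is in M[3]" above):
//
//   __m512i vdata[16], midstate[16], midhash[8], hash[8];
//   // ... interleave the header, hash the first block into midhash ...
//   blake256_16way_round0_prehash_le( midstate, midhash, vdata ); // per job
//   do
//   {
//      blake256_16way_final_rounds_le( hash, midstate, midhash, vdata );
//      // ... test the 16 lanes of hash against the target ...
//      vdata[3] = _mm512_add_epi32( vdata[3], _mm512_set1_epi32( 16 ) );
//   } while ( more nonces );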
#endif
// Blake-256 4 way
@@ -913,8 +1488,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
memset_zero_128( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
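// byte swapping the scalar once before broadcasting avoids a vector shuffle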
blake32_4way( ctx, buf + vptr, 64 - ptr );
}
else
@@ -926,8 +1501,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
memset_zero_128( buf, 56>>2 );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
blake32_4way( ctx, buf, 64 );
}
@@ -1033,22 +1608,117 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_or_si256( buf[52>>2],
m256_const1_64( 0x0100000001000000ULL ) );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, buf, 64 );
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
blake32_8way( sc, buf, 64 );
}
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
}
static void
blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_8WAY
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
{
memcpy_256( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32_8WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_8WAY_LE( sc->rounds );
ptr = 0;
}
}
WRITE_STATE32_8WAY(sc);
sc->ptr = ptr;
}
static void
blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m256_const1_32( 0x80000000 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
}
else
sc->T0 -= 512 - bit_len;
if ( ptr <= 52 )
{
memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_or_si256( buf[52>>2], m256_one_32 );
*(buf+(56>>2)) = _mm256_set1_epi32( th );
*(buf+(60>>2)) = _mm256_set1_epi32( tl );
blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m256_one_32;
*(buf+(56>>2)) = _mm256_set1_epi32( th );
*(buf+(60>>2)) = _mm256_set1_epi32( tl );
blake32_8way_le( sc, buf, 64 );
}
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
}
@@ -1117,7 +1787,6 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
WRITE_STATE32_16WAY(sc);
sc->ptr = ptr;
}
static void
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
@@ -1152,22 +1821,116 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
if ( out_size_w32 == 8 )
buf[52>>2] = _mm512_or_si512( buf[52>>2],
m512_const1_64( 0x0100000001000000ULL ) );
buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
memset_zero_512( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
blake32_16way( sc, buf, 64 );
}
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
}
static void
blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_16WAY
buf = sc->buf;
ptr = sc->ptr;
// only used when calling update with 80 bytes
if ( len < buf_size - ptr )
{
memcpy_512( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32_16WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_512( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = T0 + 512 ) < 512 )
T1 = T1 + 1;
COMPRESS32_16WAY_LE( sc->rounds );
ptr = 0;
}
}
WRITE_STATE32_16WAY(sc);
sc->ptr = ptr;
}
static void
blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m512_const1_32( 0x80000000 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr == 0 )
{
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
memset_zero_512( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
blake32_16way( sc, buf, 64 );
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFE00UL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 512 - bit_len;
if ( ptr <= 52 )
{
memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
buf[52>>2] = _mm512_or_si512( buf[52>>2], m512_one_32 );
buf[56>>2] = _mm512_set1_epi32( th );
buf[60>>2] = _mm512_set1_epi32( tl );
blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
memset_zero_512( buf, 56>>2 );
buf[52>>2] = m512_one_32;
buf[56>>2] = _mm512_set1_epi32( th );
buf[60>>2] = _mm512_set1_epi32( tl );
blake32_16way_le( sc, buf, 64 );
}
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
}
@@ -1190,6 +1953,18 @@ blake256_16way_close(void *cc, void *dst)
blake32_16way_close(cc, 0, 0, dst, 8);
}
void
blake256_16way_update_le(void *cc, const void *data, size_t len)
{
blake32_16way_le(cc, data, len);
}
void
blake256_16way_close_le(void *cc, void *dst)
{
blake32_16way_close_le(cc, 0, 0, dst, 8);
}
void blake256r14_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
@@ -1271,6 +2046,18 @@ blake256_8way_close(void *cc, void *dst)
blake32_8way_close(cc, 0, 0, dst, 8);
}
void
blake256_8way_update_le(void *cc, const void *data, size_t len)
{
blake32_8way_le(cc, data, len);
}
void
blake256_8way_close_le(void *cc, void *dst)
{
blake32_8way_close_le(cc, 0, 0, dst, 8);
}
#endif
// 14 rounds Blake, Decred

View File

@@ -52,6 +52,180 @@ static const uint8_t sigma[12][16] =
};
#define Z00 0
#define Z01 1
#define Z02 2
#define Z03 3
#define Z04 4
#define Z05 5
#define Z06 6
#define Z07 7
#define Z08 8
#define Z09 9
#define Z0A A
#define Z0B B
#define Z0C C
#define Z0D D
#define Z0E E
#define Z0F F
#define Z10 E
#define Z11 A
#define Z12 4
#define Z13 8
#define Z14 9
#define Z15 F
#define Z16 D
#define Z17 6
#define Z18 1
#define Z19 C
#define Z1A 0
#define Z1B 2
#define Z1C B
#define Z1D 7
#define Z1E 5
#define Z1F 3
#define Z20 B
#define Z21 8
#define Z22 C
#define Z23 0
#define Z24 5
#define Z25 2
#define Z26 F
#define Z27 D
#define Z28 A
#define Z29 E
#define Z2A 3
#define Z2B 6
#define Z2C 7
#define Z2D 1
#define Z2E 9
#define Z2F 4
#define Z30 7
#define Z31 9
#define Z32 3
#define Z33 1
#define Z34 D
#define Z35 C
#define Z36 B
#define Z37 E
#define Z38 2
#define Z39 6
#define Z3A 5
#define Z3B A
#define Z3C 4
#define Z3D 0
#define Z3E F
#define Z3F 8
#define Z40 9
#define Z41 0
#define Z42 5
#define Z43 7
#define Z44 2
#define Z45 4
#define Z46 A
#define Z47 F
#define Z48 E
#define Z49 1
#define Z4A B
#define Z4B C
#define Z4C 6
#define Z4D 8
#define Z4E 3
#define Z4F D
#define Z50 2
#define Z51 C
#define Z52 6
#define Z53 A
#define Z54 0
#define Z55 B
#define Z56 8
#define Z57 3
#define Z58 4
#define Z59 D
#define Z5A 7
#define Z5B 5
#define Z5C F
#define Z5D E
#define Z5E 1
#define Z5F 9
#define Z60 C
#define Z61 5
#define Z62 1
#define Z63 F
#define Z64 E
#define Z65 D
#define Z66 4
#define Z67 A
#define Z68 0
#define Z69 7
#define Z6A 6
#define Z6B 3
#define Z6C 9
#define Z6D 2
#define Z6E 8
#define Z6F B
#define Z70 D
#define Z71 B
#define Z72 7
#define Z73 E
#define Z74 C
#define Z75 1
#define Z76 3
#define Z77 9
#define Z78 5
#define Z79 0
#define Z7A F
#define Z7B 4
#define Z7C 8
#define Z7D 6
#define Z7E 2
#define Z7F A
#define Z80 6
#define Z81 F
#define Z82 E
#define Z83 9
#define Z84 B
#define Z85 3
#define Z86 0
#define Z87 8
#define Z88 C
#define Z89 2
#define Z8A D
#define Z8B 7
#define Z8C 1
#define Z8D 4
#define Z8E A
#define Z8F 5
#define Z90 A
#define Z91 2
#define Z92 8
#define Z93 4
#define Z94 7
#define Z95 6
#define Z96 1
#define Z97 5
#define Z98 F
#define Z99 B
#define Z9A 9
#define Z9B E
#define Z9C 3
#define Z9D C
#define Z9E D
#define Z9F 0
#define Mx(r, i) Mx_(Z ## r ## i)
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
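// Mx(r,i) expands to the message word selected by sigma for round r, column
// i, e.g. Mx(1,0) -> Z10 -> E -> ME. The two-level expansion forces the Z
// token to be fully substituted before it is pasted onto the M prefix.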
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define B2B8W_G(a, b, c, d, x, y) \

View File

@@ -361,14 +361,10 @@ static const sph_u64 CB[16] = {
V9 = m512_const1_64( CB1 ); \
VA = m512_const1_64( CB2 ); \
VB = m512_const1_64( CB3 ); \
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
m512_const1_64( CB4 ) ); \
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
m512_const1_64( CB5 ) ); \
VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
m512_const1_64( CB6 ) ); \
VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
m512_const1_64( CB7 ) ); \
VC = _mm512_set1_epi64( T0 ^ CB4 ); \
VD = _mm512_set1_epi64( T0 ^ CB5 ); \
VE = _mm512_set1_epi64( T1 ^ CB6 ); \
VF = _mm512_set1_epi64( T1 ^ CB7 ); \
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
@@ -435,14 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB4 ) );
VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB5 ) );
VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB6 ) );
VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB7 ) );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
@@ -493,6 +485,307 @@ void blake512_8way_compress( blake_8way_big_context *sc )
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
}
// Won't be needed once prehash is implemented.
void blake512_8way_compress_le( blake_8way_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = sc->buf[ 9];
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
ROUND_B_8WAY(6);
ROUND_B_8WAY(7);
ROUND_B_8WAY(8);
ROUND_B_8WAY(9);
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
}
// With final_le this forms a full hash in 2 parts from little endian data.
// All variables are hard coded for 80 bytes/lane.
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data )
{
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
sc->buf[10] = m512_const1_64( 0x8000000000000000ULL );
sc->buf[11] =
sc->buf[12] = m512_zero;
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_zero;
sc->buf[15] = m512_const1_64( 80*8 );
// build working variables
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
VE = _mm512_set1_epi64( CB6 );
VF = _mm512_set1_epi64( CB7 );
// round 0
GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
GB_8WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
GB_8WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
GB_8WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
// Do half of G4, skip the nonce
// GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF );
V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
V0 = _mm512_add_epi64( V0, V5 );
GB_8WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
GB_8WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
GB_8WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
// round 1
// G1
// GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD);
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( CB8 ),
sc->buf[ 4] ) );
// G2
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
V2 = _mm512_add_epi64( V2, V6 );
// G3
// GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF);
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
midstate[ 1] = V1;
midstate[ 2] = V2;
midstate[ 3] = V3;
midstate[ 4] = V4;
midstate[ 5] = V5;
midstate[ 6] = V6;
midstate[ 7] = V7;
midstate[ 8] = V8;
midstate[ 9] = V9;
midstate[10] = VA;
midstate[11] = VB;
midstate[12] = VC;
midstate[13] = VD;
midstate[14] = VE;
midstate[15] = VF;
}
// pick up where we left off, need the nonce now.
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i h[8] __attribute__ ((aligned (64)));
// Load data with new nonce
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = nonce;
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
V0 = midstate[ 0];
V1 = midstate[ 1];
V2 = midstate[ 2];
V3 = midstate[ 3];
V4 = midstate[ 4];
V5 = midstate[ 5];
V6 = midstate[ 6];
V7 = midstate[ 7];
V8 = midstate[ 8];
V9 = midstate[ 9];
VA = midstate[10];
VB = midstate[11];
VC = midstate[12];
VD = midstate[13];
VE = midstate[14];
VF = midstate[15];
// finish round 0 with the nonce now available
V0 = _mm512_add_epi64( V0, _mm512_xor_si512(
_mm512_set1_epi64( CB8 ), M9 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 );
// Round 1
// G0
GB_8WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
// G1
// GB_8WAY(Mx(1, 2), Mx(1, 3), CBx(1, 2), CBx(1, 3), V1, V5, V9, VD);
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
V1 = _mm512_add_epi64( V1, V5 );
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 16 );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 11 );
// G2
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
// V2 = _mm512_add_epi64( V2, V6 );
V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
_mm512_set1_epi64( CBF ), M9 ) );
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), MF ), V6 ) );
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 16 );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 11 );
// G3
// GB_8WAY(Mx(1, 6), Mx(1, 7), CBx(1, 6), CBx(1, 7), V3, V7, VB, VF);
// V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
// _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 16 );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 11 );
// G4, G5, G6, G7
GB_8WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
GB_8WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
GB_8WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
GB_8WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
// remaining rounds
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
ROUND_B_8WAY(6);
ROUND_B_8WAY(7);
ROUND_B_8WAY(8);
ROUND_B_8WAY(9);
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
h[0] = mm512_xor3( V8, V0, sc->H[0] );
h[1] = mm512_xor3( V9, V1, sc->H[1] );
h[2] = mm512_xor3( VA, V2, sc->H[2] );
h[3] = mm512_xor3( VB, V3, sc->H[3] );
h[4] = mm512_xor3( VC, V4, sc->H[4] );
h[5] = mm512_xor3( VD, V5, sc->H[5] );
h[6] = mm512_xor3( VE, V6, sc->H[6] );
h[7] = mm512_xor3( VF, V7, sc->H[7] );
// bswap final hash
mm512_block_bswap_64( (__m512i*)hash, h );
}
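// Usage sketch (hypothetical, not part of this diff): hash 80 bytes per
// lane of little endian data, with the 8 nonces supplied as a vector.
// The same context must be passed to both calls: prehash_le fills sc->buf
// and sc->H, which final_le reads back.
//
//   __m512i midstate[16] __attribute__ ((aligned (64)));
//   blake512_8way_prehash_le( &ctx, midstate, vdata );      // once per job
//   // then, for each group of 8 nonces:
//   blake512_8way_final_le( &ctx, hash, vnonce, midstate );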
void blake512_8way_init( blake_8way_big_context *sc )
{
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
@@ -678,6 +971,73 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len )
{
// init
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
// update
memcpy_512( sc->buf, (__m512i*)data, len>>3 );
sc->ptr = len;
if ( len == 128 )
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
sc->ptr = 0;
}
// close
size_t ptr64 = sc->ptr >> 3;
unsigned bit_len;
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = m512_const1_64( 0x8000000000000000ULL );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr64 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 1024 - bit_len;
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_const1_64( th );
sc->buf[15] = m512_const1_64( tl );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
void
blake512_8way_update(void *cc, const void *data, size_t len)
{
@@ -741,14 +1101,10 @@ blake512_8way_close(void *cc, void *dst)
V9 = m256_const1_64( CB1 ); \
VA = m256_const1_64( CB2 ); \
VB = m256_const1_64( CB3 ); \
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
m256_const1_64( CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
m256_const1_64( CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
m256_const1_64( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
m256_const1_64( CB7 ) ); \
VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
@@ -869,6 +1225,221 @@ void blake512_4way_compress( blake_4way_big_context *sc )
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
}
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data )
{
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
sc->buf[10] = m256_const1_64( 0x8000000000000000ULL );
sc->buf[11] = m256_zero;
sc->buf[12] = m256_zero;
sc->buf[13] = m256_one_64;
sc->buf[14] = m256_zero;
sc->buf[15] = m256_const1_64( 80*8 );
// build working variables
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m256_const1_64( CB0 );
V9 = m256_const1_64( CB1 );
VA = m256_const1_64( CB2 );
VB = m256_const1_64( CB3 );
VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
VE = _mm256_set1_epi64x( CB6 );
VF = _mm256_set1_epi64x( CB7 );
// round 0
GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
GB_4WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
GB_4WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
GB_4WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
// G4 skip nonce
V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
V0 = _mm256_add_epi64( V0, V5 );
GB_4WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
GB_4WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
GB_4WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
// round 1
// G1
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ),
sc->buf[ 4] ) );
// G2
V2 = _mm256_add_epi64( V2, V6 );
// G3
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
midstate[ 1] = V1;
midstate[ 2] = V2;
midstate[ 3] = V3;
midstate[ 4] = V4;
midstate[ 5] = V5;
midstate[ 6] = V6;
midstate[ 7] = V7;
midstate[ 8] = V8;
midstate[ 9] = V9;
midstate[10] = VA;
midstate[11] = VB;
midstate[12] = VC;
midstate[13] = VD;
midstate[14] = VE;
midstate[15] = VF;
}
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i h[8] __attribute__ ((aligned (64)));
// Load data with new nonce
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = nonce;
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
V0 = midstate[ 0];
V1 = midstate[ 1];
V2 = midstate[ 2];
V3 = midstate[ 3];
V4 = midstate[ 4];
V5 = midstate[ 5];
V6 = midstate[ 6];
V7 = midstate[ 7];
V8 = midstate[ 8];
V9 = midstate[ 9];
VA = midstate[10];
VB = midstate[11];
VC = midstate[12];
VD = midstate[13];
VE = midstate[14];
VF = midstate[15];
// finish round 0, with the nonce now available
V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
_mm256_set1_epi64x( CB8 ), M9 ) );
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
// Round 1
// G0
GB_4WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
// G1
V1 = _mm256_add_epi64( V1, V5 );
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
// G2
V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
_mm256_set1_epi64x( CBF ), M9 ) );
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), MF ), V6 ) );
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
// G3
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );
// G4, G5, G6, G7
GB_4WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
GB_4WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
GB_4WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
GB_4WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
ROUND_B_4WAY(6);
ROUND_B_4WAY(7);
ROUND_B_4WAY(8);
ROUND_B_4WAY(9);
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
h[0] = mm256_xor3( V8, V0, sc->H[0] );
h[1] = mm256_xor3( V9, V1, sc->H[1] );
h[2] = mm256_xor3( VA, V2, sc->H[2] );
h[3] = mm256_xor3( VB, V3, sc->H[3] );
h[4] = mm256_xor3( VC, V4, sc->H[4] );
h[5] = mm256_xor3( VD, V5, sc->H[5] );
h[6] = mm256_xor3( VE, V6, sc->H[6] );
h[7] = mm256_xor3( VF, V7, sc->H[7] );
// bswap final hash
mm256_block_bswap_64( (__m256i*)hash, h );
}
void blake512_4way_init( blake_4way_big_context *sc )
{
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );

View File

@@ -630,6 +630,69 @@ static const sph_u64 CB[16] = {
H7 ^= S3 ^ V7 ^ VF; \
} while (0)
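// Same as COMPRESS32 but loads the message words directly, without the big
// endian byte swap; the input must already be in little endian order.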
#define COMPRESS32_LE do { \
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = S0 ^ CS0; \
V9 = S1 ^ CS1; \
VA = S2 ^ CS2; \
VB = S3 ^ CS3; \
VC = T0 ^ CS4; \
VD = T0 ^ CS5; \
VE = T1 ^ CS6; \
VF = T1 ^ CS7; \
M0 = *((uint32_t*)(buf + 0)); \
M1 = *((uint32_t*)(buf + 4)); \
M2 = *((uint32_t*)(buf + 8)); \
M3 = *((uint32_t*)(buf + 12)); \
M4 = *((uint32_t*)(buf + 16)); \
M5 = *((uint32_t*)(buf + 20)); \
M6 = *((uint32_t*)(buf + 24)); \
M7 = *((uint32_t*)(buf + 28)); \
M8 = *((uint32_t*)(buf + 32)); \
M9 = *((uint32_t*)(buf + 36)); \
MA = *((uint32_t*)(buf + 40)); \
MB = *((uint32_t*)(buf + 44)); \
MC = *((uint32_t*)(buf + 48)); \
MD = *((uint32_t*)(buf + 52)); \
ME = *((uint32_t*)(buf + 56)); \
MF = *((uint32_t*)(buf + 60)); \
ROUND_S(0); \
ROUND_S(1); \
ROUND_S(2); \
ROUND_S(3); \
ROUND_S(4); \
ROUND_S(5); \
ROUND_S(6); \
ROUND_S(7); \
if (BLAKE32_ROUNDS == 14) { \
ROUND_S(8); \
ROUND_S(9); \
ROUND_S(0); \
ROUND_S(1); \
ROUND_S(2); \
ROUND_S(3); \
} \
H0 ^= S0 ^ V0 ^ V8; \
H1 ^= S1 ^ V1 ^ V9; \
H2 ^= S2 ^ V2 ^ VA; \
H3 ^= S3 ^ V3 ^ VB; \
H4 ^= S0 ^ V4 ^ VC; \
H5 ^= S1 ^ V5 ^ VD; \
H6 ^= S2 ^ V6 ^ VE; \
H7 ^= S3 ^ V7 ^ VF; \
} while (0)
#endif
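COMPRESS32_LE above differs from COMPRESS32 only in the message load: the M words are read directly instead of being decoded big-endian, saving a byte swap per word when the caller already holds the block in little-endian order. A hedged one-word comparison, assuming sph_dec32be_aligned is the decoder used by the big-endian path:

M0 = sph_dec32be_aligned(buf + 0);  /* COMPRESS32: byte-swapping decode */
M0 = *((uint32_t*)(buf + 0));       /* COMPRESS32_LE: direct load */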
#if SPH_64
@@ -843,6 +906,45 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)
sc->ptr = ptr;
}
static void
blake32_le(sph_blake_small_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
DECL_STATE32
buf = sc->buf;
ptr = sc->ptr;
if (len < (sizeof sc->buf) - ptr) {
memcpy(buf + ptr, data, len);
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32(sc);
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
ptr += clen;
data = (const unsigned char *)data + clen;
len -= clen;
if (ptr == sizeof sc->buf) {
if ((T0 = SPH_T32(T0 + 512)) < 512)
T1 = SPH_T32(T1 + 1);
COMPRESS32_LE;
ptr = 0;
}
}
WRITE_STATE32(sc);
sc->ptr = ptr;
}
static void
blake32_close(sph_blake_small_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
@@ -1050,6 +1152,12 @@ sph_blake256(void *cc, const void *data, size_t len)
blake32(cc, data, len);
}
void
sph_blake256_update_le(void *cc, const void *data, size_t len)
{
blake32_le(cc, data, len);
}
/* see sph_blake.h */
void
sph_blake256_close(void *cc, void *dst)

View File

@@ -198,6 +198,7 @@ void sph_blake256_init(void *cc);
* @param len the input data length (in bytes)
*/
void sph_blake256(void *cc, const void *data, size_t len);
void sph_blake256_update_le(void *cc, const void *data, size_t len);
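A hedged usage sketch of the new little-endian update, where data80 is a hypothetical 80 byte block already in little-endian word order and hash32 a 32 byte output buffer:

sph_blake256_context ctx;
sph_blake256_init( &ctx );
sph_blake256_update_le( &ctx, data80, 80 );
sph_blake256_close( &ctx, hash32 );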
/**
* Terminate the current BLAKE-256 computation and output the result into

View File

@@ -30,16 +30,10 @@
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "algo/sha/sph_types.h"
#include "sph_blake2b.h"
// Cyclic right rotation.
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
// Little-endian byte access.
#define B2B_GET64(p) \
@@ -54,45 +48,131 @@
// G Mixing function.
#define B2B_G(a, b, c, d, x, y) { \
v[a] = v[a] + v[b] + x; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 24); \
v[a] = v[a] + v[b] + y; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 63); }
#if defined(__AVX2__)
#define BLAKE2B_G( R, Sa, Sb, Sc, Sd, Na, Nb ) \
{ \
V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
_mm256_set_epi64x( m[ sigma[R][Sd] ], m[ sigma[R][Sc] ], \
m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), Na ); \
V[2] = _mm256_add_epi64( V[2], V[3] ); \
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), Nb ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m256i *V = (__m256i*)v; \
BLAKE2B_G( R, 0, 2, 4, 6, 32, 24 ); \
BLAKE2B_G( R, 1, 3, 5, 7, 16, 63 ); \
V[3] = mm256_shufll_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shuflr_64( V[1] ); \
BLAKE2B_G( R, 8, 10, 12, 14, 32, 24 ); \
BLAKE2B_G( R, 9, 11, 13, 15, 16, 63 ); \
V[3] = mm256_shuflr_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shufll_64( V[1] ); \
}
#elif defined(__SSSE3__)
#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb, Na, Nb ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
Vd = mm128_ror_64( _mm_xor_si128( Vd, Va ), Na ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), Nb ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m128i *V = (__m128i*)v; \
__m128i V2, V3, V6, V7; \
BLAKE2B_G( R, V[0], V[2], V[4], V[6], 0, 2, 32, 24 ); \
BLAKE2B_G( R, V[0], V[2], V[4], V[6], 1, 3, 16, 63 ); \
BLAKE2B_G( R, V[1], V[3], V[5], V[7], 4, 6, 32, 24 ); \
BLAKE2B_G( R, V[1], V[3], V[5], V[7], 5, 7, 16, 63 ); \
V2 = mm128_shufl2r_64( V[2], V[3] ); \
V3 = mm128_shufl2r_64( V[3], V[2] ); \
V6 = mm128_shufl2l_64( V[6], V[7] ); \
V7 = mm128_shufl2l_64( V[7], V[6] ); \
BLAKE2B_G( R, V[0], V2, V[5], V6, 8, 10, 32, 24 ); \
BLAKE2B_G( R, V[0], V2, V[5], V6, 9, 11, 16, 63 ); \
BLAKE2B_G( R, V[1], V3, V[4], V7, 12, 14, 32, 24 ); \
BLAKE2B_G( R, V[1], V3, V[4], V7, 13, 15, 16, 63 ); \
V[2] = mm128_shufl2l_64( V2, V3 ); \
V[3] = mm128_shufl2l_64( V3, V2 ); \
V[6] = mm128_shufl2r_64( V6, V7 ); \
V[7] = mm128_shufl2r_64( V7, V6 ); \
}
#else
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb ) \
{ \
Va = Va + Vb + m[ sigma[R][Sa] ]; \
Vd = ROTR64( Vd ^ Va, 32 ); \
Vc = Vc + Vd; \
Vb = ROTR64( Vb ^ Vc, 24 ); \
Va = Va + Vb + m[ sigma[R][Sb] ]; \
Vd = ROTR64( Vd ^ Va, 16 ); \
Vc = Vc + Vd; \
Vb = ROTR64( Vb ^ Vc, 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
BLAKE2B_G( R, v[ 0], v[ 4], v[ 8], v[12], 0, 1 ); \
BLAKE2B_G( R, v[ 1], v[ 5], v[ 9], v[13], 2, 3 ); \
BLAKE2B_G( R, v[ 2], v[ 6], v[10], v[14], 4, 5 ); \
BLAKE2B_G( R, v[ 3], v[ 7], v[11], v[15], 6, 7 ); \
BLAKE2B_G( R, v[ 0], v[ 5], v[10], v[15], 8, 9 ); \
BLAKE2B_G( R, v[ 1], v[ 6], v[11], v[12], 10, 11 ); \
BLAKE2B_G( R, v[ 2], v[ 7], v[ 8], v[13], 12, 13 ); \
BLAKE2B_G( R, v[ 3], v[ 4], v[ 9], v[14], 14, 15 ); \
}
#endif
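The Blake2b rotation counts favour shuffles over shift-and-or: rotating each 64-bit lane by 32 is a 32-bit shuffle, 24 and 16 are byte permutes with pshufb on SSSE3, and only 63 needs genuine shifts. A minimal sketch of the cheapest case:

#include <immintrin.h>
// Hedged sketch: rotate both 64-bit lanes right by 32 with one shuffle.
static inline __m128i ror64x32_sketch( __m128i x )
{
   return _mm_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}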
// Initialization Vector.
static const uint64_t blake2b_iv[8] = {
static const uint64_t blake2b_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
static const uint8_t sigma[12][16] __attribute__ ((aligned (32))) =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
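// Hoisted to file scope, aligned, so the BLAKE2B_ROUND macros can
// reference it; previously it was a local array rebuilt on every call
// to blake2b_compress, as the removed copy below shows.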
// Compression function. "last" flag indicates last block.
static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
{
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
uint64_t v[16], m[16];
uint64_t v[16] __attribute__ ((aligned (32)));
uint64_t m[16] __attribute__ ((aligned (32)));
int i;
for (i = 0; i < 8; i++) { // init work variables
v[i] = ctx->h[i];
@@ -106,16 +186,8 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
for (i = 0; i < 16; i++) // get little-endian words
m[i] = B2B_GET64(&ctx->b[8 * i]);
for (i = 0; i < 12; i++) { // twelve rounds
B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
}
for (i = 0; i < 12; i++)
BLAKE2B_ROUND( i );
for( i = 0; i < 8; ++i )
ctx->h[i] ^= v[i] ^ v[i + 8];

View File

@@ -54,14 +54,12 @@ static void transform_4way( cube_4way_context *sp )
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
x7 = _mm512_add_epi32( x3, x7 );
y0 = x0;
y1 = x1;
x0 = mm512_rol_32( x2, 7 );
x1 = mm512_rol_32( x3, 7 );
x2 = mm512_rol_32( y0, 7 );
x3 = mm512_rol_32( y1, 7 );
x0 = _mm512_xor_si512( x0, x4 );
x1 = _mm512_xor_si512( x1, x5 );
y0 = mm512_rol_32( x2, 7 );
y1 = mm512_rol_32( x3, 7 );
x2 = mm512_rol_32( x0, 7 );
x3 = mm512_rol_32( x1, 7 );
x0 = _mm512_xor_si512( y0, x4 );
x1 = _mm512_xor_si512( y1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap128_64( x4 );
@@ -72,15 +70,13 @@ static void transform_4way( cube_4way_context *sp )
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
x7 = _mm512_add_epi32( x3, x7 );
y0 = x0;
y1 = x2;
x0 = mm512_rol_32( x1, 11 );
x1 = mm512_rol_32( y0, 11 );
x2 = mm512_rol_32( x3, 11 );
x3 = mm512_rol_32( y1, 11 );
x0 = _mm512_xor_si512( x0, x4 );
y0 = mm512_rol_32( x1, 11 );
x1 = mm512_rol_32( x0, 11 );
y1 = mm512_rol_32( x3, 11 );
x3 = mm512_rol_32( x2, 11 );
x0 = _mm512_xor_si512( y0, x4 );
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x2 = _mm512_xor_si512( y1, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap64_32( x4 );
x5 = mm512_swap64_32( x5 );
@@ -131,83 +127,67 @@ static void transform_4way_2buf( cube_4way_2buf_context *sp )
{
x4 = _mm512_add_epi32( x0, x4 );
y4 = _mm512_add_epi32( y0, y4 );
tx0 = x0;
ty0 = y0;
x5 = _mm512_add_epi32( x1, x5 );
y5 = _mm512_add_epi32( y1, y5 );
tx1 = x1;
ty1 = y1;
x0 = mm512_rol_32( x2, 7 );
y0 = mm512_rol_32( y2, 7 );
tx0 = mm512_rol_32( x2, 7 );
ty0 = mm512_rol_32( y2, 7 );
tx1 = mm512_rol_32( x3, 7 );
ty1 = mm512_rol_32( y3, 7 );
x6 = _mm512_add_epi32( x2, x6 );
y6 = _mm512_add_epi32( y2, y6 );
x1 = mm512_rol_32( x3, 7 );
y1 = mm512_rol_32( y3, 7 );
y6 = _mm512_add_epi32( y2, y6 );
x7 = _mm512_add_epi32( x3, x7 );
y7 = _mm512_add_epi32( y3, y7 );
x2 = mm512_rol_32( tx0, 7 );
y2 = mm512_rol_32( ty0, 7 );
x0 = _mm512_xor_si512( x0, x4 );
y0 = _mm512_xor_si512( y0, y4 );
x2 = mm512_rol_32( x0, 7 );
y2 = mm512_rol_32( y0, 7 );
x3 = mm512_rol_32( x1, 7 );
y3 = mm512_rol_32( y1, 7 );
x0 = _mm512_xor_si512( tx0, x4 );
y0 = _mm512_xor_si512( ty0, y4 );
x1 = _mm512_xor_si512( tx1, x5 );
y1 = _mm512_xor_si512( ty1, y5 );
x4 = mm512_swap128_64( x4 );
x3 = mm512_rol_32( tx1, 7 );
y3 = mm512_rol_32( ty1, 7 );
y4 = mm512_swap128_64( y4 );
x1 = _mm512_xor_si512( x1, x5 );
y1 = _mm512_xor_si512( y1, y5 );
x5 = mm512_swap128_64( x5 );
y5 = mm512_swap128_64( y5 );
x2 = _mm512_xor_si512( x2, x6 );
y2 = _mm512_xor_si512( y2, y6 );
y5 = mm512_swap128_64( y5 );
x3 = _mm512_xor_si512( x3, x7 );
y3 = _mm512_xor_si512( y3, y7 );
x6 = mm512_swap128_64( x6 );
y6 = mm512_swap128_64( y6 );
x7 = mm512_swap128_64( x7 );
y7 = mm512_swap128_64( y7 );
x4 = _mm512_add_epi32( x0, x4 );
y4 = _mm512_add_epi32( y0, y4 );
y6 = mm512_swap128_64( y6 );
x5 = _mm512_add_epi32( x1, x5 );
y5 = _mm512_add_epi32( y1, y5 );
x7 = mm512_swap128_64( x7 );
tx0 = mm512_rol_32( x1, 11 );
ty0 = mm512_rol_32( y1, 11 );
tx1 = mm512_rol_32( x3, 11 );
ty1 = mm512_rol_32( y3, 11 );
x6 = _mm512_add_epi32( x2, x6 );
y6 = _mm512_add_epi32( y2, y6 );
tx0 = x0;
ty0 = y0;
y7 = mm512_swap128_64( y7 );
tx1 = x2;
ty1 = y2;
x0 = mm512_rol_32( x1, 11 );
y0 = mm512_rol_32( y1, 11 );
x7 = _mm512_add_epi32( x3, x7 );
y7 = _mm512_add_epi32( y3, y7 );
x1 = mm512_rol_32( tx0, 11 );
y1 = mm512_rol_32( ty0, 11 );
x0 = _mm512_xor_si512( x0, x4 );
x4 = mm512_swap64_32( x4 );
y0 = _mm512_xor_si512( y0, y4 );
x2 = mm512_rol_32( x3, 11 );
y4 = mm512_swap64_32( y4 );
y2 = mm512_rol_32( y3, 11 );
x1 = mm512_rol_32( x0, 11 );
y1 = mm512_rol_32( y0, 11 );
x3 = mm512_rol_32( x2, 11 );
y3 = mm512_rol_32( y2, 11 );
x0 = _mm512_xor_si512( tx0, x4 );
y0 = _mm512_xor_si512( ty0, y4 );
x1 = _mm512_xor_si512( x1, x5 );
x5 = mm512_swap64_32( x5 );
y1 = _mm512_xor_si512( y1, y5 );
x3 = mm512_rol_32( tx1, 11 );
x4 = mm512_swap64_32( x4 );
y4 = mm512_swap64_32( y4 );
x5 = mm512_swap64_32( x5 );
y5 = mm512_swap64_32( y5 );
y3 = mm512_rol_32( ty1, 11 );
x2 = _mm512_xor_si512( x2, x6 );
x6 = mm512_swap64_32( x6 );
y2 = _mm512_xor_si512( y2, y6 );
y6 = mm512_swap64_32( y6 );
x2 = _mm512_xor_si512( tx1, x6 );
y2 = _mm512_xor_si512( ty1, y6 );
x3 = _mm512_xor_si512( x3, x7 );
x7 = mm512_swap64_32( x7 );
y3 = _mm512_xor_si512( y3, y7 );
x6 = mm512_swap64_32( x6 );
y6 = mm512_swap64_32( y6 );
x7 = mm512_swap64_32( x7 );
y7 = mm512_swap64_32( y7 );
}
@@ -241,14 +221,6 @@ int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
sp->rounds = rounds;
sp->pos = 0;
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
h[ 3] = m512_const1_128( iv[3] );
h[ 4] = m512_const1_128( iv[4] );
h[ 5] = m512_const1_128( iv[5] );
h[ 6] = m512_const1_128( iv[6] );
h[ 7] = m512_const1_128( iv[7] );
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
@@ -489,33 +461,29 @@ static void transform_2way( cube_2way_context *sp )
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x0;
y1 = x1;
ROL2( x0, x1, x2, x3, 7 );
ROL2( x2, x3, y0, y1, 7 );
x0 = _mm256_xor_si256( x0, x4 );
ROL2( y0, y1, x2, x3, 7 );
ROL2( x2, x3, x0, x1, 7 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( y1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap128_64( x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x5 = mm256_swap128_64( x5 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = _mm256_add_epi32( x0, x4 );
x6 = mm256_swap128_64( x6 );
y0 = x0;
x5 = _mm256_add_epi32( x1, x5 );
x7 = mm256_swap128_64( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
y1 = x2;
ROL2( x0, x1, x1, y0, 11 );
x7 = _mm256_add_epi32( x3, x7 );
ROL2( x2, x3, x3, y1, 11 );
x0 = _mm256_xor_si256( x0, x4 );
x4 = mm256_swap64_32( x4 );
ROL2( y0, x1, x1, x0, 11 );
ROL2( y1, x3, x3, x2, 11 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x5 = mm256_swap64_32( x5 );
x2 = _mm256_xor_si256( x2, x6 );
x6 = mm256_swap64_32( x6 );
x2 = _mm256_xor_si256( y1, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_32( x4 );
x5 = mm256_swap64_32( x5 );
x6 = mm256_swap64_32( x6 );
x7 = mm256_swap64_32( x7 );
}
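As in the 4-way transform, the rotations now write the temporaries directly instead of saving the inputs first, eliminating one register copy per pair. In scalar terms the renaming is:

/* before:  y0 = x0;  x0 = ROL( x2, 7 );  x2 = ROL( y0, 7 );  x0 ^= x4;
   after:   y0 = ROL( x2, 7 );  x2 = ROL( x0, 7 );  x0 = y0 ^ x4;     */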
@@ -540,14 +508,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
sp->rounds = rounds;
sp->pos = 0;
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
@@ -560,7 +520,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
return 0;
}
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
{
const int len = size >> 4;

View File

@@ -15,11 +15,11 @@
struct _cubehashParam
{
__m128i _ALIGN(64) x[8]; // aligned for __m512i
int hashlen; // __m128i
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
__m128i _ALIGN(64) x[8]; // aligned for __m256i
};
typedef struct _cubehashParam cubehashParam;

View File

@@ -156,14 +156,12 @@ int groestl512_full( hashState_groestl* ctx, void* output,
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
@@ -175,8 +173,8 @@ int groestl512_full( hashState_groestl* ctx, void* output,
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---

View File

@@ -227,12 +227,10 @@ int groestl256_full( hashState_groestl256* ctx,
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256( ctx->chaining );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m128i* in = (__m128i*)input;
@@ -245,7 +243,7 @@ int groestl256_full( hashState_groestl256* ctx,
// Cryptonight has a 200 byte input, which is not a whole number of
// __m128i; the remainder is only 8 bytes, i.e. a u64.
if ( databitlen % 128 !=0 )
if ( databitlen % 128 != 0 )
{
// must be cryptonight, copy 64 bits of data
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
@@ -255,8 +253,8 @@ int groestl256_full( hashState_groestl256* ctx,
{
// Copy any remaining data to buffer for final transform
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
}
//--- final ---

View File

@@ -50,7 +50,6 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
const int len = (int)datalen >> 4;
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE256;
__m512i* in = (__m512i*)input;
int i;
@@ -67,7 +66,6 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -76,11 +74,10 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
// copy any remaining data to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---
@@ -206,7 +203,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
const int len = (int)datalen >> 4;
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE256;
__m256i* in = (__m256i*)input;
int i;
@@ -223,7 +219,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -232,11 +227,10 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
TF512_2way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
// copy any remaining data to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---

View File

@@ -99,7 +99,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
memset_zero_512( ctx->buffer, SIZE512 );
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -108,8 +107,7 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
i += ctx->rem_ptr;
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// --- close ---
@@ -222,7 +220,6 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
memset_zero_256( ctx->buffer, SIZE512 );
ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -231,8 +228,7 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
i += ctx->rem_ptr;
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// --- close ---

View File

@@ -13,8 +13,7 @@
#if defined (ALLIUM_16WAY)
typedef struct {
blake256_16way_context blake;
typedef union {
keccak256_8way_context keccak;
cube_4way_2buf_context cube;
skein256_8way_context skein;
@@ -25,41 +24,31 @@ typedef struct {
#endif
} allium_16way_ctx_holder;
static __thread allium_16way_ctx_holder allium_16way_ctx;
bool init_allium_16way_ctx()
{
keccak256_8way_init( &allium_16way_ctx.keccak );
skein256_8way_init( &allium_16way_ctx.skein );
return true;
}
void allium_16way_hash( void *state, const void *input )
static void allium_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t hash8[8] __attribute__ ((aligned (32)));
uint32_t hash9[8] __attribute__ ((aligned (32)));
uint32_t hash10[8] __attribute__ ((aligned (32)));
uint32_t hash11[8] __attribute__ ((aligned (32)));
uint32_t hash12[8] __attribute__ ((aligned (32)));
uint32_t hash13[8] __attribute__ ((aligned (32)));
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -69,6 +58,7 @@ void allium_16way_hash( void *state, const void *input )
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhashA);
keccak256_8way_init( &ctx.keccak );
@@ -151,6 +141,7 @@ void allium_16way_hash( void *state, const void *input )
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashA, 32 );
skein256_8way_close( &ctx.skein, vhashA );
skein256_8way_init( &ctx.skein );
@@ -198,6 +189,7 @@ void allium_16way_hash( void *state, const void *input )
groestl256_full( &ctx.groestl, state+416, hash13, 256 );
groestl256_full( &ctx.groestl, state+448, hash14, 256 );
groestl256_full( &ctx.groestl, state+480, hash15, 256 );
#endif
}
@@ -205,35 +197,72 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Prehash first block.
blake256_transform_le( phash, pdata, 512, 0 );
blake256_16way_init( &allium_16way_ctx.blake );
blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
// Interleave hash for second block prehash.
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces, and add padding.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
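// This is standard Blake-256 padding for an 80 byte message: 0x80000000
// terminates the data, word 13 carries the 1 bit that precedes the
// length, and word 15 is the message length in bits, 80*8 = 640.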
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_16way_hash( hash, vdata );
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, hash+(lane<<3), mythr );
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
@@ -243,8 +272,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
#elif defined (ALLIUM_8WAY)
typedef struct {
blake256_8way_context blake;
typedef union {
keccak256_4way_context keccak;
cube_2way_context cube;
skein256_4way_context skein;
@@ -255,19 +283,11 @@ typedef struct {
#endif
} allium_8way_ctx_holder;
static __thread allium_8way_ctx_holder allium_8way_ctx;
bool init_allium_8way_ctx()
{
keccak256_4way_init( &allium_8way_ctx.keccak );
skein256_4way_init( &allium_8way_ctx.skein );
return true;
}
void allium_8way_hash( void *hash, const void *input )
static void allium_8way_hash( void *hash, const void *midstate_vars,
const void *midhash, const void *block )
{
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (32)));
uint64_t *hash0 = (uint64_t*)hash;
uint64_t *hash1 = (uint64_t*)hash+ 4;
uint64_t *hash2 = (uint64_t*)hash+ 8;
@@ -278,15 +298,14 @@ void allium_8way_hash( void *hash, const void *input )
uint64_t *hash7 = (uint64_t*)hash+28;
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhashA );
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
keccak256_4way_close( &ctx.keccak, vhashA );
keccak256_4way_init( &ctx.keccak );
@@ -305,7 +324,6 @@ void allium_8way_hash( void *hash, const void *input )
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
intrlv_2x128( vhashA, hash0, hash1, 256 );
intrlv_2x128( vhashB, hash2, hash3, 256 );
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
@@ -332,6 +350,7 @@ void allium_8way_hash( void *hash, const void *input )
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashA, 32 );
skein256_4way_close( &ctx.skein, vhashA );
skein256_4way_init( &ctx.skein );
@@ -340,8 +359,8 @@ void allium_8way_hash( void *hash, const void *input )
#if defined(__VAES__)
uint64_t vhashC[4*2] __attribute__ ((aligned (64)));
uint64_t vhashD[4*2] __attribute__ ((aligned (64)));
uint64_t vhashC[4*2] __attribute__ ((aligned (32)));
uint64_t vhashD[4*2] __attribute__ ((aligned (32)));
rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 );
groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
@@ -376,36 +395,72 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_8way_hash( hash, vdata );
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 8; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;

View File

@@ -132,11 +132,11 @@ bool register_lyra2z_algo( algo_gate_t* gate )
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
gate->hash = (void*)&lyra2z_16way_hash;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
@@ -175,13 +175,9 @@ bool register_lyra2h_algo( algo_gate_t* gate )
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_16WAY)
gate->miner_thread_init = (void*)&init_allium_16way_ctx;
gate->scanhash = (void*)&scanhash_allium_16way;
gate->hash = (void*)&allium_16way_hash;
#elif defined (ALLIUM_8WAY)
gate->miner_thread_init = (void*)&init_allium_8way_ctx;
gate->scanhash = (void*)&scanhash_allium_8way;
gate->hash = (void*)&allium_8way_hash;
#else
gate->miner_thread_init = (void*)&init_allium_ctx;
gate->scanhash = (void*)&scanhash_allium;

View File

@@ -99,14 +99,14 @@ bool init_lyra2rev2_ctx();
#if defined(LYRA2Z_16WAY)
void lyra2z_16way_hash( void *state, const void *input );
//void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_16way_thread_init();
#elif defined(LYRA2Z_8WAY)
void lyra2z_8way_hash( void *state, const void *input );
//void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_8way_thread_init();
@@ -163,17 +163,13 @@ bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_16WAY)
void allium_16way_hash( void *state, const void *input );
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_16way_ctx();
#elif defined(ALLIUM_8WAY)
void allium_8way_hash( void *state, const void *input );
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_8way_ctx();
#else

View File

@@ -14,38 +14,28 @@ bool lyra2z_16way_thread_init()
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_16way_context l2z_16way_blake_mid;
void lyra2z_16way_midstate( const void* input )
{
blake256_16way_init( &l2z_16way_blake_mid );
blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
}
void lyra2z_16way_hash( void *state, const void *input )
static void lyra2z_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t hash8[8] __attribute__ ((aligned (32)));
uint32_t hash9[8] __attribute__ ((aligned (32)));
uint32_t hash10[8] __attribute__ ((aligned (32)));
uint32_t hash11[8] __attribute__ ((aligned (32)));
uint32_t hash12[8] __attribute__ ((aligned (32)));
uint32_t hash13[8] __attribute__ ((aligned (32)));
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
blake256_16way_update( &ctx_blake, input + (64*16), 16 );
blake256_16way_close( &ctx_blake, vhash );
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -97,40 +87,74 @@ void lyra2z_16way_hash( void *state, const void *input )
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (64))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
if ( bench ) ptarget[7] = 0x0000ff;
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
lyra2z_16way_midstate( vdata );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_16way_hash( hash, vdata );
for ( int lane = 0; lane < 16; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr );
}
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
@@ -145,30 +169,20 @@ bool lyra2z_8way_thread_init()
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
static void lyra2z_8way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way_update( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -182,7 +196,6 @@ void lyra2z_8way_hash( void *state, const void *input )
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
@@ -197,43 +210,78 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
lyra2z_8way_midstate( vdata );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_8way_hash( hash, vdata );
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 8; lane++ )
{
for ( int lane = 0; lane < 8; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
}
n += 8;
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(LYRA2Z_4WAY)

View File

@@ -150,12 +150,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G_2X64( s1, s3, s5, s7 ); \
mm128_vrol256_64( s6, s7 ); \
mm128_vror256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
mm128_vror256_64( s6, s7 ); \
mm128_vrol256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 );
mm128_vrol256_64( s2, s3 );
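/* Passing s5 and s4 to G_2X64 in exchanged positions swaps the lanes
   logically, removing the two physical mm128_swap256_128 calls the
   round previously needed. */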
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -64,14 +64,14 @@ extern void hmq1725_8way_hash(void *state, const void *input)
uint32_t vhashA[16<<3] __attribute__ ((aligned (64)));
uint32_t vhashB[16<<3] __attribute__ ((aligned (64)));
uint32_t vhashC[16<<3] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (64)));
uint32_t hash1 [16] __attribute__ ((aligned (64)));
uint32_t hash2 [16] __attribute__ ((aligned (64)));
uint32_t hash3 [16] __attribute__ ((aligned (64)));
uint32_t hash4 [16] __attribute__ ((aligned (64)));
uint32_t hash5 [16] __attribute__ ((aligned (64)));
uint32_t hash6 [16] __attribute__ ((aligned (64)));
uint32_t hash7 [16] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (32)));
uint32_t hash1 [16] __attribute__ ((aligned (32)));
uint32_t hash2 [16] __attribute__ ((aligned (32)));
uint32_t hash3 [16] __attribute__ ((aligned (32)));
uint32_t hash4 [16] __attribute__ ((aligned (32)));
uint32_t hash5 [16] __attribute__ ((aligned (32)));
uint32_t hash6 [16] __attribute__ ((aligned (32)));
uint32_t hash7 [16] __attribute__ ((aligned (32)));
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
__mmask8 vh_mask;
const __m512i vmask = m512_const1_64( 24 );
@@ -639,13 +639,13 @@ typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
extern void hmq1725_4way_hash(void *state, const void *input)
{
uint32_t hash0 [16] __attribute__ ((aligned (64)));
uint32_t hash1 [16] __attribute__ ((aligned (64)));
uint32_t hash2 [16] __attribute__ ((aligned (64)));
uint32_t hash3 [16] __attribute__ ((aligned (64)));
uint32_t vhash [16<<2] __attribute__ ((aligned (64)));
uint32_t vhashA[16<<2] __attribute__ ((aligned (64)));
uint32_t vhashB[16<<2] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (32)));
uint32_t hash1 [16] __attribute__ ((aligned (32)));
uint32_t hash2 [16] __attribute__ ((aligned (32)));
uint32_t hash3 [16] __attribute__ ((aligned (32)));
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
__m256i vh_mask;
int h_mask;

View File

@@ -16,7 +16,8 @@
#if defined (X16R_8WAY)
// Perform midstate prehash of hash functions with block size <= 72 bytes.
// Perform midstate prehash of hash functions with block size <= 72 bytes,
// or 76 bytes for hash functions that operate on 32 bit data.
void x16r_8way_prehash( void *vdata, void *pdata )
{
@@ -44,18 +45,36 @@ void x16r_8way_prehash( void *vdata, void *pdata )
skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
break;
case LUFFA:
{
hashState_luffa ctx_luffa;
mm128_bswap32_80( edata, pdata );
intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
luffa_4way_init( &x16r_ctx.luffa, 512 );
luffa_4way_update( &x16r_ctx.luffa, vdata2, 64 );
rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
init_luffa( &ctx_luffa, 512 );
update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
intrlv_4x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
ctx_luffa.buffer, ctx_luffa.buffer, ctx_luffa.buffer, 512 );
intrlv_4x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
ctx_luffa.chainv, ctx_luffa.chainv, ctx_luffa.chainv, 1280 );
x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
}
break;
case CUBEHASH:
{
cubehashParam ctx_cube;
mm128_bswap32_80( edata, pdata );
intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
cube_4way_init( &x16r_ctx.cube, 512, 16, 32 );
cube_4way_update( &x16r_ctx.cube, vdata2, 64 );
rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
cubehashInit( &ctx_cube, 512, 16, 32 );
cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
x16r_ctx.cube.hashlen = ctx_cube.hashlen;
x16r_ctx.cube.rounds = ctx_cube.rounds;
x16r_ctx.cube.blocksize = ctx_cube.blocksize;
x16r_ctx.cube.pos = ctx_cube.pos;
intrlv_4x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, ctx_cube.x,
ctx_cube.x, 1024 );
}
break;
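// Both cases follow the same pattern: run the scalar update once to
// compute the 64 byte midstate, then interleave the resulting state
// into every lane of the vectored context, instead of running the
// wide update on duplicated input lanes.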
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
@@ -94,14 +113,14 @@ void x16r_8way_prehash( void *vdata, void *pdata )
int x16r_8way_hash_generic( void* output, const void* input, int thrid )
{
uint32_t vhash[20*8] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t hash4[20] __attribute__ ((aligned (64)));
uint32_t hash5[20] __attribute__ ((aligned (64)));
uint32_t hash6[20] __attribute__ ((aligned (64)));
uint32_t hash7[20] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (16)));
uint32_t hash1[20] __attribute__ ((aligned (16)));
uint32_t hash2[20] __attribute__ ((aligned (16)));
uint32_t hash3[20] __attribute__ ((aligned (16)));
uint32_t hash4[20] __attribute__ ((aligned (16)));
uint32_t hash5[20] __attribute__ ((aligned (16)));
uint32_t hash6[20] __attribute__ ((aligned (16)));
uint32_t hash7[20] __attribute__ ((aligned (16)));
x16r_8way_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -476,7 +495,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -500,7 +519,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}
x16r_8way_prehash( vdata, pdata );
@@ -552,18 +571,33 @@ void x16r_4way_prehash( void *vdata, void *pdata )
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
break;
case LUFFA:
{
hashState_luffa ctx_luffa;
mm128_bswap32_80( edata, pdata );
intrlv_2x128( vdata2, edata, edata, 640 );
luffa_2way_init( &x16r_ctx.luffa, 512 );
luffa_2way_update( &x16r_ctx.luffa, vdata2, 64 );
rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
break;
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
init_luffa( &ctx_luffa, 512 );
update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
intrlv_2x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
ctx_luffa.buffer, 512 );
intrlv_2x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
ctx_luffa.chainv, 1280 );
x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
}
break;
case CUBEHASH:
{
cubehashParam ctx_cube;
mm128_bswap32_80( edata, pdata );
intrlv_2x128( vdata2, edata, edata, 640 );
cube_2way_init( &x16r_ctx.cube, 512, 16, 32 );
cube_2way_update( &x16r_ctx.cube, vdata2, 64 );
rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
cubehashInit( &ctx_cube, 512, 16, 32 );
cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
x16r_ctx.cube.hashlen = ctx_cube.hashlen;
x16r_ctx.cube.rounds = ctx_cube.rounds;
x16r_ctx.cube.blocksize = ctx_cube.blocksize;
x16r_ctx.cube.pos = ctx_cube.pos;
intrlv_2x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, 1024 );
}
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
@@ -596,10 +630,10 @@ void x16r_4way_prehash( void *vdata, void *pdata )
int x16r_4way_hash_generic( void* output, const void* input, int thrid )
{
uint32_t vhash[20*4] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
uint32_t hash2[20] __attribute__ ((aligned (32)));
uint32_t hash3[20] __attribute__ ((aligned (32)));
x16r_4way_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -890,7 +924,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -913,7 +947,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}
x16r_4way_prehash( vdata, pdata );

View File

@@ -30,8 +30,8 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}
@@ -84,8 +84,8 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}

View File

@@ -45,14 +45,14 @@ static __thread x16rv2_8way_context_overlay x16rv2_ctx;
int x16rv2_8way_hash( void* output, const void* input, int thrid )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t hash4[24] __attribute__ ((aligned (64)));
uint32_t hash5[24] __attribute__ ((aligned (64)));
uint32_t hash6[24] __attribute__ ((aligned (64)));
uint32_t hash7[24] __attribute__ ((aligned (64)));
uint32_t hash0[24] __attribute__ ((aligned (32)));
uint32_t hash1[24] __attribute__ ((aligned (32)));
uint32_t hash2[24] __attribute__ ((aligned (32)));
uint32_t hash3[24] __attribute__ ((aligned (32)));
uint32_t hash4[24] __attribute__ ((aligned (32)));
uint32_t hash5[24] __attribute__ ((aligned (32)));
uint32_t hash6[24] __attribute__ ((aligned (32)));
uint32_t hash7[24] __attribute__ ((aligned (32)));
x16rv2_8way_context_overlay ctx;
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -706,11 +706,11 @@ inline void padtiger512( uint32_t* hash )
int x16rv2_4way_hash( void* output, const void* input, int thrid )
{
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t vhash[20*4] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
uint32_t hash2[20] __attribute__ ((aligned (32)));
uint32_t hash3[20] __attribute__ ((aligned (32)));
x16rv2_4way_context_overlay ctx;
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -1054,8 +1054,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t edata[20];
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -1068,7 +1068,6 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0fff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );

View File

@@ -63,14 +63,14 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
uint64_t hash4[8] __attribute__ ((aligned (32)));
uint64_t hash5[8] __attribute__ ((aligned (32)));
uint64_t hash6[8] __attribute__ ((aligned (32)));
uint64_t hash7[8] __attribute__ ((aligned (32)));
sonoa_8way_context_overlay ctx;
// 1
@@ -1150,13 +1150,13 @@ typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
int sonoa_4way_hash( void *state, const void *input, int thr_id )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
sonoa_4way_context_overlay ctx;
// 1

View File

@@ -58,23 +58,27 @@ union _x17_8way_context_overlay
} __attribute__ ((aligned (64)));
typedef union _x17_8way_context_overlay x17_8way_context_overlay;
static __thread __m512i x17_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
int x17_8way_hash( void *state, const void *input, int thr_id )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
uint64_t hash4[8] __attribute__ ((aligned (32)));
uint64_t hash5[8] __attribute__ ((aligned (32)));
uint64_t hash6[8] __attribute__ ((aligned (32)));
uint64_t hash7[8] __attribute__ ((aligned (32)));
x17_8way_context_overlay ctx;
blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
x17_8way_midstate );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -122,9 +126,6 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
#if defined(__VAES__)
shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
@@ -237,6 +238,61 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
return 1;
}
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash32[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = &(hash32[7*8]);
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m512i eight = m512_const1_64( 8 );
const bool bench = opt_benchmark;
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = mm512_intrlv_blend_32( *noncev,
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
0, n+3, 0, n+2, 0, n+1, 0, n ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
do
{
if ( likely( x17_8way_hash( hash32, vdata, thr_id ) ) )
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
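The new scanhash pre-swaps the header to little-endian once, broadcasts it to all eight lanes, and absorbs the nonce-invariant words with blake512_8way_prehash_le; x17_8way_hash then only finishes the first Blake-512 block per nonce via blake512_8way_final_le. A scalar sketch of this split, with compress_prefix and compress_last as hypothetical stand-ins for the prehash/final pair:

#include <stdint.h>
#include <string.h>

typedef struct { uint64_t h[8]; } mid_t;

/* hypothetical stand-ins for blake512_8way_prehash_le / _final_le */
extern void compress_prefix( mid_t *s, const uint64_t hdr[10] );
extern void compress_last( mid_t *s, uint64_t last_word );

static mid_t mid;                     /* midstate, computed once per job */

void job_prehash( const uint64_t hdr[10] )      /* 80-byte block header */
{
    compress_prefix( &mid, hdr );       /* absorb nonce-invariant words */
}

void nonce_hash( uint64_t out[8], uint64_t word9_with_nonce )
{
    mid_t s = mid;                              /* cheap copy per nonce */
    compress_last( &s, word9_with_nonce );
    memcpy( out, s.h, sizeof s.h );
}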
#elif defined(X17_4WAY)
union _x17_4way_context_overlay
@@ -271,10 +327,10 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
x17_4way_context_overlay ctx;
blake512_4way_full( &ctx.blake, vhash, input, 80 );

View File

@@ -3,7 +3,7 @@
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_8WAY)
gate->scanhash = (void*)&scanhash_8way_64in_32out;
gate->scanhash = (void*)&scanhash_x17_8way;
gate->hash = (void*)&x17_8way_hash;
#elif defined (X17_4WAY)
gate->scanhash = (void*)&scanhash_4way_64in_32out;

View File

@@ -14,10 +14,15 @@ bool register_x17_algo( algo_gate_t* gate );
#if defined(X17_8WAY)
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int x17_8way_hash( void *state, const void *input, int thr_id );
#elif defined(X17_4WAY)
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int x17_4way_hash( void *state, const void *input, int thr_id );
#endif

View File

@@ -62,14 +62,14 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t hash4[16] __attribute__ ((aligned (64)));
uint64_t hash5[16] __attribute__ ((aligned (64)));
uint64_t hash6[16] __attribute__ ((aligned (64)));
uint64_t hash7[16] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (32)));
uint64_t hash1[16] __attribute__ ((aligned (32)));
uint64_t hash2[16] __attribute__ ((aligned (32)));
uint64_t hash3[16] __attribute__ ((aligned (32)));
uint64_t hash4[16] __attribute__ ((aligned (32)));
uint64_t hash5[16] __attribute__ ((aligned (32)));
uint64_t hash6[16] __attribute__ ((aligned (32)));
uint64_t hash7[16] __attribute__ ((aligned (32)));
const int dataLen = 128;
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
@@ -430,13 +430,13 @@ typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;
int xevan_4way_hash( void *output, const void *input, int thr_id )
{
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashA[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<2] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (32)));
uint64_t hash1[16] __attribute__ ((aligned (32)));
uint64_t hash2[16] __attribute__ ((aligned (32)));
uint64_t hash3[16] __attribute__ ((aligned (32)));
const int dataLen = 128;
xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));

View File

@@ -21,7 +21,6 @@
#include "algo/tiger/sph_tiger.h"
#include "algo/lyra2/lyra2.h"
#include "algo/gost/sph_gost.h"
#include "algo/swifftx/swifftx.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"

View File

@@ -50,6 +50,7 @@ bool register_x25x_algo( algo_gate_t* gate )
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
AVX512_OPT | VAES_OPT;
InitializeSWIFFTX();
return true;
};

View File

@@ -5,6 +5,7 @@
#include "simd-utils.h"
#include <stdint.h>
#include <unistd.h>
#include "algo/swifftx/swifftx.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X22I_8WAY 1

View File

@@ -24,7 +24,6 @@
#include "algo/tiger/sph_tiger.h"
#include "algo/lyra2/lyra2.h"
#include "algo/gost/sph_gost.h"
#include "algo/swifftx/swifftx.h"
#include "algo/panama/panama-hash-4way.h"
#include "algo/lanehash/lane.h"
#if defined(__VAES__)
@@ -102,6 +101,9 @@ union _x25x_8way_ctx_overlay
};
typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay;
static __thread __m512i x25x_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
int x25x_8way_hash( void *output, const void *input, int thrid )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
@@ -118,9 +120,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64)));
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
x25x_8way_midstate );
dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0],
hash4[0], hash5[0], hash6[0], hash7[0], vhash );
@@ -271,7 +273,6 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10],
hash4[10], hash5[10], hash6[10], hash7[10] );
#else
init_echo( &ctx.echo, 512 );
@@ -558,6 +559,7 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &(hash[7*8]);
uint32_t *pdata = work->data;
@@ -569,15 +571,22 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const uint32_t targ32 = ptarget[7];
const bool bench = opt_benchmark;
const __m512i eight = m512_const1_64( 8 );
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = mm512_intrlv_blend_32( *noncev,
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
0, n+3, 0, n+2, 0, n+1, 0, n ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x25x_8way_hash( hash, vdata, thr_id ) )
@@ -588,12 +597,11 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -637,8 +645,12 @@ union _x25x_4way_ctx_overlay
panama_4way_context panama;
blake2s_4way_state blake2s;
};
typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay;
static __thread __m256i x25x_4way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
int x25x_4way_hash( void *output, const void *input, int thrid )
{
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
@@ -651,7 +663,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));
blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
x25x_4way_midstate );
dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );
bmw512_4way_init( &ctx.bmw );
@@ -905,6 +919,7 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &(hash[ 7*4 ]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -914,15 +929,23 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32 = ptarget[7];
const __m256i four = m256_const1_64( 4 );
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
mm256_intrlv80_4x64( vdata, edata );
*noncev = mm256_intrlv_blend_32( *noncev,
_mm256_set_epi32( 0, n+3, 0, n+2, 0, n+1, 0, n ) );
blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata );
do
{
if ( x25x_4way_hash( hash, vdata, thr_id ) )
@@ -932,12 +955,11 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
*noncev = _mm256_add_epi32( *noncev, four );
n += 4;
} while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -1,323 +0,0 @@
/*
* Copyright 2009 Colin Percival, 2014 savale
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include <algo/yespower/crypto/sph_types.h>
#include "blake2b-yp.h"
// Cyclic right rotation.
//#ifndef ROTR64
//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
//#endif
#define ROTR64(x, y) ror64( x, y )
// Little-endian byte access.
#define B2B_GET64(p) \
(((uint64_t) ((uint8_t *) (p))[0]) ^ \
(((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
(((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
(((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
(((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
(((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
(((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
(((uint64_t) ((uint8_t *) (p))[7]) << 56))
// G Mixing function.
#define B2B_G(a, b, c, d, x, y) { \
v[a] = v[a] + v[b] + x; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 24); \
v[a] = v[a] + v[b] + y; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 63); }
// Initialization Vector.
static const uint64_t blake2b_iv[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
// Compression function. "last" flag indicates last block.
static void blake2b_compress(blake2b_yp_ctx *ctx, int last)
{
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
uint64_t v[16], m[16];
// init work variables
for (i = 0; i < 8; i++) {
v[i] = ctx->h[i];
v[i + 8] = blake2b_iv[i];
}
v[12] ^= ctx->t[0]; // low 64 bits of offset
v[13] ^= ctx->t[1]; // high 64 bits
// last block flag set ?
if (last) {
v[14] = ~v[14];
}
// get little-endian words
for (i = 0; i < 16; i++) {
m[i] = B2B_GET64(&ctx->b[8 * i]);
}
// twelve rounds
for (i = 0; i < 12; i++) {
B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
}
for(i = 0; i < 8; ++i) {
ctx->h[i] ^= v[i] ^ v[i + 8];
}
}
// Initialize the hashing context "ctx" with optional key "key".
// 1 <= outlen <= 64 gives the digest size in bytes.
// Secret key (also <= 64 bytes) is optional (keylen = 0).
int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen,
const void *key, size_t keylen) // (keylen=0: no key)
{
size_t i;
// illegal parameters
if (outlen == 0 || outlen > 64 || keylen > 64) {
return -1;
}
// state, "param block"
for (i = 0; i < 8; i++) {
ctx->h[i] = blake2b_iv[i];
}
ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
ctx->t[0] = 0; // input count low word
ctx->t[1] = 0; // input count high word
ctx->c = 0; // pointer within buffer
ctx->outlen = outlen;
// zero input block
for (i = keylen; i < 128; i++) {
ctx->b[i] = 0;
}
if (keylen > 0) {
blake2b_yp_update(ctx, key, keylen);
ctx->c = 128; // at the end
}
return 0;
}
// Add "inlen" bytes from "in" into the hash.
void blake2b_yp_update(blake2b_yp_ctx *ctx,
const void *in, size_t inlen) // data bytes
{
size_t i;
for (i = 0; i < inlen; i++) {
if (ctx->c == 128) { // buffer full ?
ctx->t[0] += ctx->c; // add counters
if (ctx->t[0] < ctx->c) // carry overflow ?
ctx->t[1]++; // high word
blake2b_compress(ctx, 0); // compress (not last)
ctx->c = 0; // counter to zero
}
ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
}
}
// Generate the message digest (size given in init).
// Result placed in "out".
void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out)
{
size_t i;
ctx->t[0] += ctx->c; // mark last block offset
// carry overflow
if (ctx->t[0] < ctx->c) {
ctx->t[1]++; // high word
}
// fill up with zeros
while (ctx->c < 128) {
ctx->b[ctx->c++] = 0;
}
blake2b_compress(ctx, 1); // final block flag = 1
// little endian convert and store
for (i = 0; i < ctx->outlen; i++) {
((uint8_t *) out)[i] =
(ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
}
}
// inlen = number of bytes
void blake2b_yp_hash(void *out, const void *in, size_t inlen) {
blake2b_yp_ctx ctx;
blake2b_yp_init(&ctx, 32, NULL, 0);
blake2b_yp_update(&ctx, in, inlen);
blake2b_yp_final(&ctx, out);
}
// keylen = number of bytes
void hmac_blake2b_yp_init(hmac_yp_ctx *hctx, const void *_key, size_t keylen) {
const uint8_t *key = _key;
uint8_t keyhash[32];
uint8_t pad[64];
uint64_t i;
if (keylen > 64) {
blake2b_yp_hash(keyhash, key, keylen);
key = keyhash;
keylen = 32;
}
blake2b_yp_init(&hctx->inner, 32, NULL, 0);
memset(pad, 0x36, 64);
for (i = 0; i < keylen; ++i) {
pad[i] ^= key[i];
}
blake2b_yp_update(&hctx->inner, pad, 64);
blake2b_yp_init(&hctx->outer, 32, NULL, 0);
memset(pad, 0x5c, 64);
for (i = 0; i < keylen; ++i) {
pad[i] ^= key[i];
}
blake2b_yp_update(&hctx->outer, pad, 64);
memset(keyhash, 0, 32);
}
// datalen = number of bytes
void hmac_blake2b_yp_update(hmac_yp_ctx *hctx, const void *data, size_t datalen) {
// update the inner state
blake2b_yp_update(&hctx->inner, data, datalen);
}
void hmac_blake2b_yp_final(hmac_yp_ctx *hctx, uint8_t *digest) {
uint8_t ihash[32];
blake2b_yp_final(&hctx->inner, ihash);
blake2b_yp_update(&hctx->outer, ihash, 32);
blake2b_yp_final(&hctx->outer, digest);
memset(ihash, 0, 32);
}
// keylen = number of bytes; inlen = number of bytes
void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen) {
hmac_yp_ctx hctx;
hmac_blake2b_yp_init(&hctx, key, keylen);
hmac_blake2b_yp_update(&hctx, in, inlen);
hmac_blake2b_yp_final(&hctx, out);
}
void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
hmac_yp_ctx PShctx, hctx;
size_t i;
uint32_t ivec;
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Compute HMAC state after processing P and S. */
hmac_blake2b_yp_init(&PShctx, passwd, passwdlen);
hmac_blake2b_yp_update(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
ivec = bswap_32( i+1 );
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(hmac_yp_ctx));
hmac_blake2b_yp_update(&hctx, &ivec, 4);
hmac_blake2b_yp_final(&hctx, U);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
hmac_blake2b_yp_init(&hctx, passwd, passwdlen);
hmac_blake2b_yp_update(&hctx, U, 32);
hmac_blake2b_yp_final(&hctx, U);
/* ... xor U_j ... */
for (k = 0; k < 32; k++) {
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32) {
clen = 32;
}
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
memset(&PShctx, 0, sizeof(hmac_yp_ctx));
}

View File

@@ -1,42 +0,0 @@
#pragma once
#ifndef __BLAKE2B_H__
#define __BLAKE2B_H__
#include <stddef.h>
#include <stdint.h>
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
#define NATIVE_LITTLE_ENDIAN
#endif
// state context
typedef struct {
uint8_t b[128]; // input buffer
uint64_t h[8]; // chained state
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_yp_ctx;
typedef struct {
blake2b_yp_ctx inner;
blake2b_yp_ctx outer;
} hmac_yp_ctx;
#if defined(__cplusplus)
extern "C" {
#endif
int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen, const void *key, size_t keylen);
void blake2b_yp_update(blake2b_yp_ctx *ctx, const void *in, size_t inlen);
void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out);
void blake2b_yp_hash(void *out, const void *in, size_t inlen);
void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen);
void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen);
#if defined(__cplusplus)
}
#endif
#endif

View File

@@ -0,0 +1,150 @@
/*
* Copyright 2009 Colin Percival, 2014 savale
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "hmac-blake2b.h"
// keylen = number of bytes
void hmac_blake2b_init( hmac_blake2b_ctx *hctx, const void *_key,
size_t keylen )
{
const uint8_t *key = _key;
uint8_t keyhash[32];
uint8_t pad[64];
uint64_t i;
if (keylen > 64)
{
sph_blake2b_ctx ctx;
sph_blake2b_init( &ctx, 32, NULL, 0 );
sph_blake2b_update( &ctx, key, keylen );
sph_blake2b_final( &ctx, keyhash );
key = keyhash;
keylen = 32;
}
sph_blake2b_init( &hctx->inner, 32, NULL, 0 );
memset( pad, 0x36, 64 );
for ( i = 0; i < keylen; ++i )
pad[i] ^= key[i];
sph_blake2b_update( &hctx->inner, pad, 64 );
sph_blake2b_init( &hctx->outer, 32, NULL, 0 );
memset( pad, 0x5c, 64 );
for ( i = 0; i < keylen; ++i )
pad[i] ^= key[i];
sph_blake2b_update( &hctx->outer, pad, 64 );
memset( keyhash, 0, 32 );
}
// datalen = number of bytes
void hmac_blake2b_update( hmac_blake2b_ctx *hctx, const void *data,
size_t datalen )
{
// update the inner state
sph_blake2b_update( &hctx->inner, data, datalen );
}
void hmac_blake2b_final( hmac_blake2b_ctx *hctx, uint8_t *digest )
{
uint8_t ihash[32];
sph_blake2b_final( &hctx->inner, ihash );
sph_blake2b_update( &hctx->outer, ihash, 32 );
sph_blake2b_final( &hctx->outer, digest );
memset( ihash, 0, 32 );
}
// keylen = number of bytes; inlen = number of bytes
void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
const void *in, size_t inlen )
{
hmac_blake2b_ctx hctx;
hmac_blake2b_init( &hctx, key, keylen );
hmac_blake2b_update( &hctx, in, inlen );
hmac_blake2b_final( &hctx, out );
}
void pbkdf2_blake2b( const uint8_t *passwd, size_t passwdlen,
const uint8_t *salt, size_t saltlen, uint64_t c,
uint8_t *buf, size_t dkLen )
{
hmac_blake2b_ctx PShctx, hctx;
size_t i;
uint32_t ivec;
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Compute HMAC state after processing P and S. */
hmac_blake2b_init( &PShctx, passwd, passwdlen );
hmac_blake2b_update( &PShctx, salt, saltlen );
/* Iterate through the blocks. */
for ( i = 0; i * 32 < dkLen; i++ )
{
/* Generate INT(i + 1). */
ivec = bswap_32( i+1 );
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy( &hctx, &PShctx, sizeof(hmac_blake2b_ctx) );
hmac_blake2b_update( &hctx, &ivec, 4 );
hmac_blake2b_final( &hctx, U );
/* T_i = U_1 ... */
memcpy( T, U, 32 );
for ( j = 2; j <= c; j++ )
{
/* Compute U_j. */
hmac_blake2b_init( &hctx, passwd, passwdlen );
hmac_blake2b_update( &hctx, U, 32 );
hmac_blake2b_final( &hctx, U );
/* ... xor U_j ... */
for ( k = 0; k < 32; k++ )
T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy( &buf[i * 32], T, clen );
}
/* Clean PShctx, since we never called _Final on it. */
memset( &PShctx, 0, sizeof(hmac_blake2b_ctx) );
}
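A minimal usage sketch of the new API, assuming only the header above; the printed digest is whatever PBKDF2-BLAKE2b produces for these inputs:

#include <stdio.h>
#include <stdint.h>
#include "hmac-blake2b.h"

int main(void)
{
    const uint8_t passwd[] = "password";
    const uint8_t salt[]   = "salt";
    uint8_t dk[32];

    /* 1 iteration, 32-byte derived key, the same shape yespower_b2b uses */
    pbkdf2_blake2b( passwd, sizeof passwd - 1,
                    salt,   sizeof salt  - 1, 1, dk, sizeof dk );

    for ( int i = 0; i < 32; i++ ) printf( "%02x", dk[i] );
    printf( "\n" );
    return 0;
}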

View File

@@ -0,0 +1,34 @@
#pragma once
#ifndef __HMAC_BLAKE2B_H__
#define __HMAC_BLAKE2B_H__
#include <stddef.h>
#include <stdint.h>
#include "algo/blake/sph_blake2b.h"
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
#define NATIVE_LITTLE_ENDIAN
#endif
typedef struct
{
sph_blake2b_ctx inner;
sph_blake2b_ctx outer;
} hmac_blake2b_ctx;
#if defined(__cplusplus)
extern "C" {
#endif
void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
const void *in, size_t inlen );
void pbkdf2_blake2b( const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen, uint64_t c,
uint8_t * buf, size_t dkLen );
#if defined(__cplusplus)
}
#endif
#endif

File diff suppressed because it is too large

View File

@@ -95,7 +95,7 @@
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "crypto/blake2b-yp.h"
#include "crypto/hmac-blake2b.h"
#include "yespower.h"
#ifdef __unix__
@@ -1136,6 +1136,7 @@ int yespower_b2b(yespower_local_t *local,
salsa20_blk_t *V, *XY;
pwxform_ctx_t ctx;
uint8_t init_hash[32];
sph_blake2b_ctx blake2b_ctx;
/* Sanity-check parameters */
if ((N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
@@ -1167,7 +1168,9 @@ int yespower_b2b(yespower_local_t *local,
ctx.S0 = S;
ctx.S1 = S + Swidth_to_Sbytes1(Swidth);
blake2b_yp_hash(init_hash, src, srclen);
sph_blake2b_init( &blake2b_ctx, 32, NULL, 0 );
sph_blake2b_update( &blake2b_ctx, src, srclen );
sph_blake2b_final( &blake2b_ctx, init_hash );
ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth);
ctx.w = 0;
@@ -1181,7 +1184,7 @@ int yespower_b2b(yespower_local_t *local,
if ( work_restart[thrid].restart ) return false;
pbkdf2_blake2b_yp(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);
pbkdf2_blake2b(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);
if ( work_restart[thrid].restart ) return false;
@@ -1190,7 +1193,7 @@ int yespower_b2b(yespower_local_t *local,
if ( work_restart[thrid].restart ) return false;
hmac_blake2b_yp_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));
hmac_blake2b_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));
/* Success! */
return 1;

View File

@@ -249,7 +249,7 @@ bool register_power2b_algo( algo_gate_t* gate )
applog( LOG_NOTICE,"Key= \"%s\"", yespower_params.pers );
applog( LOG_NOTICE,"Key length= %d\n", yespower_params.perslen );
gate->optimizations = SSE2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_yespower_b2b;
gate->hash = (void*)&yespower_b2b_hash;
opt_target_factor = 65536.0;

configure
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.7.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.19.7'
PACKAGE_STRING='cpuminer-opt 3.19.7'
PACKAGE_VERSION='3.20.1'
PACKAGE_STRING='cpuminer-opt 3.20.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.19.7 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.20.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.19.7:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.20.1:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.19.7
cpuminer-opt configure 3.20.1
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.19.7, which was
It was created by cpuminer-opt $as_me 3.20.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.19.7'
VERSION='3.20.1'
cat >>confdefs.h <<_ACEOF
@@ -5820,6 +5820,34 @@ $as_echo "#define USE_AVX2 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
$as_echo_n "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define USE_AVX512 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
$as_echo "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
@@ -6690,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.19.7, which was
This file was extended by cpuminer-opt $as_me 3.20.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.19.7
cpuminer-opt config.status 3.20.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.19.7])
AC_INIT([cpuminer-opt], [3.20.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -93,6 +93,14 @@ then
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile AVX512 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
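When the probe succeeds, USE_AVX512 is defined; code can then gate 512-bit paths on it. A hedged sketch (the function is illustrative, not from the tree):

#include <immintrin.h>

#ifdef USE_AVX512
/* compiled only when configure verified the assembler accepts EVEX */
static inline __m512i add_8x64( const __m512i a, const __m512i b )
{   return _mm512_add_epi64( a, b );   }
#endif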

View File

@@ -1099,7 +1099,7 @@ void report_summary_log( bool force )
sprintf_et( et_str, et.tv_sec );
sprintf_et( upt_str, uptime.tv_sec );
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], rpc_url );
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
submit_rate, safe_div( (double)submitted_share_count*60.,
@@ -1300,6 +1300,7 @@ static int share_result( int result, struct work *work,
my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
bres, CL_N, share_time, latency );
/*
if ( unlikely( opt_debug || !result || solved ) )
{
if ( have_stratum )
@@ -1309,14 +1310,27 @@ static int share_result( int result, struct work *work,
applog2( LOG_INFO, "Diff %.5g, Block %d",
my_stats.share_diff, work ? work->height : last_block_height );
}
*/
if ( unlikely( !( opt_quiet || result || stale ) ) )
{
uint32_t str[8];
uint32_t *targ;
// uint32_t str[8];
// uint32_t *targ;
if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason );
if ( reason ) applog2( LOG_MINR, "Reject reason: %s", reason );
{
// The exact hash is not available here, only an imprecise
// approximation calculated from the share difficulty. It's useless
// for anything other than low diff rejects. Until and unless a
// solution is implemented to make the hash and target available,
// don't bother displaying them. In the meantime display the diff for
// low diff rejects.
if ( strstr( reason, "difficulty" ) )
applog2( LOG_MINR, "Share diff: %.5g, Target: %.5g",
my_stats.share_diff, my_stats.target_diff );
/*
diff_to_hash( str, my_stats.share_diff );
applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6],
str[5], str[4], str[3],str[2], str[1], str[0] );
@@ -1330,6 +1344,8 @@ static int share_result( int result, struct work *work,
}
applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6],
targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] );
*/
}
}
return 1;
}
@@ -2754,7 +2770,7 @@ static void *stratum_thread(void *userdata )
stratum.url = (char*) tq_pop(mythr->q, NULL);
if (!stratum.url)
goto out;
applog( LOG_BLUE, "Stratum connect %s", short_url );
applog( LOG_BLUE, "Stratum connect %s", stratum.url );
while (1)
{
@@ -3335,6 +3351,7 @@ void parse_arg(int key, char *arg )
if ( strncasecmp( arg, "http://", 7 )
&& strncasecmp( arg, "https://", 8 )
&& strncasecmp( arg, "stratum+tcp://", 14 )
&& strncasecmp( arg, "stratum+ssl://", 14 )
&& strncasecmp( arg, "stratum+tcps://", 15 ) )
{
fprintf(stderr, "unknown protocol -- '%s'\n", arg);
@@ -3768,6 +3785,7 @@ int main(int argc, char *argv[])
flags = CURL_GLOBAL_ALL;
if ( !opt_benchmark )
if ( strncasecmp( rpc_url, "https:", 6 )
&& strncasecmp( rpc_url, "stratum+ssl://", 14 )
&& strncasecmp( rpc_url, "stratum+tcps://", 15 ) )
flags &= ~CURL_GLOBAL_SSL;

View File

@@ -812,7 +812,7 @@ Options:\n\
lyra2z330 Lyra2 330 rows\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
minotaur Ringcoin (RNG)\n\
minotaur\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\

View File

@@ -508,6 +508,32 @@ static inline void mm128_bswap32_80( void *d, void *s )
#endif
static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
{
uint32_t *s = (uint32_t*)src;
casti_m128i( d, 0 ) = _mm_set1_epi32( bswap_32( s[ 0] ) );
casti_m128i( d, 1 ) = _mm_set1_epi32( bswap_32( s[ 1] ) );
casti_m128i( d, 2 ) = _mm_set1_epi32( bswap_32( s[ 2] ) );
casti_m128i( d, 3 ) = _mm_set1_epi32( bswap_32( s[ 3] ) );
casti_m128i( d, 4 ) = _mm_set1_epi32( bswap_32( s[ 4] ) );
casti_m128i( d, 5 ) = _mm_set1_epi32( bswap_32( s[ 5] ) );
casti_m128i( d, 6 ) = _mm_set1_epi32( bswap_32( s[ 6] ) );
casti_m128i( d, 7 ) = _mm_set1_epi32( bswap_32( s[ 7] ) );
casti_m128i( d, 8 ) = _mm_set1_epi32( bswap_32( s[ 8] ) );
casti_m128i( d, 9 ) = _mm_set1_epi32( bswap_32( s[ 9] ) );
casti_m128i( d,10 ) = _mm_set1_epi32( bswap_32( s[10] ) );
casti_m128i( d,11 ) = _mm_set1_epi32( bswap_32( s[11] ) );
casti_m128i( d,12 ) = _mm_set1_epi32( bswap_32( s[12] ) );
casti_m128i( d,13 ) = _mm_set1_epi32( bswap_32( s[13] ) );
casti_m128i( d,14 ) = _mm_set1_epi32( bswap_32( s[14] ) );
casti_m128i( d,15 ) = _mm_set1_epi32( bswap_32( s[15] ) );
casti_m128i( d,16 ) = _mm_set1_epi32( bswap_32( s[16] ) );
casti_m128i( d,17 ) = _mm_set1_epi32( bswap_32( s[17] ) );
casti_m128i( d,18 ) = _mm_set1_epi32( bswap_32( s[18] ) );
casti_m128i( d,19 ) = _mm_set1_epi32( bswap_32( s[19] ) );
}
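The function broadcasts each byte-swapped header word into all four 32-bit lanes, which can be stated as a simple invariant. A hedged property check, assuming the codebase's bswap_32:

#include <assert.h>
#include <stdint.h>

static void check_bswap_intrlv80_4x32( const uint32_t *d, const uint32_t *s )
{
    for ( int k = 0; k < 20; k++ )             /* 20 header words        */
        for ( int lane = 0; lane < 4; lane++ ) /* broadcast into 4 lanes */
            assert( d[ 4*k + lane ] == bswap_32( s[k] ) );
}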
/*
static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
{
__m128i s0 = casti_m128i( src,0 );
@@ -561,6 +587,7 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
casti_m128i( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
}
*/
// 8x32
/*
@@ -1110,6 +1137,31 @@ static inline void extr_lane_8x32( void *d, const void *s,
#if defined(__AVX2__)
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
uint32_t *s = (uint32_t*)src;
casti_m256i( d, 0 ) = _mm256_set1_epi32( bswap_32( s[ 0] ) );
casti_m256i( d, 1 ) = _mm256_set1_epi32( bswap_32( s[ 1] ) );
casti_m256i( d, 2 ) = _mm256_set1_epi32( bswap_32( s[ 2] ) );
casti_m256i( d, 3 ) = _mm256_set1_epi32( bswap_32( s[ 3] ) );
casti_m256i( d, 4 ) = _mm256_set1_epi32( bswap_32( s[ 4] ) );
casti_m256i( d, 5 ) = _mm256_set1_epi32( bswap_32( s[ 5] ) );
casti_m256i( d, 6 ) = _mm256_set1_epi32( bswap_32( s[ 6] ) );
casti_m256i( d, 7 ) = _mm256_set1_epi32( bswap_32( s[ 7] ) );
casti_m256i( d, 8 ) = _mm256_set1_epi32( bswap_32( s[ 8] ) );
casti_m256i( d, 9 ) = _mm256_set1_epi32( bswap_32( s[ 9] ) );
casti_m256i( d,10 ) = _mm256_set1_epi32( bswap_32( s[10] ) );
casti_m256i( d,11 ) = _mm256_set1_epi32( bswap_32( s[11] ) );
casti_m256i( d,12 ) = _mm256_set1_epi32( bswap_32( s[12] ) );
casti_m256i( d,13 ) = _mm256_set1_epi32( bswap_32( s[13] ) );
casti_m256i( d,14 ) = _mm256_set1_epi32( bswap_32( s[14] ) );
casti_m256i( d,15 ) = _mm256_set1_epi32( bswap_32( s[15] ) );
casti_m256i( d,16 ) = _mm256_set1_epi32( bswap_32( s[16] ) );
casti_m256i( d,17 ) = _mm256_set1_epi32( bswap_32( s[17] ) );
casti_m256i( d,18 ) = _mm256_set1_epi32( bswap_32( s[18] ) );
casti_m256i( d,19 ) = _mm256_set1_epi32( bswap_32( s[19] ) );
}
/*
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
@@ -1170,6 +1222,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
casti_m128i( d,38 ) =
casti_m128i( d,39 ) = _mm_shuffle_epi32( s4 , 0xff );
}
*/
#endif // AVX2
@@ -1718,6 +1771,31 @@ static inline void extr_lane_16x32( void *d, const void *s,
#if defined(__AVX512F__) && defined(__AVX512VL__)
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
uint32_t *s = (uint32_t*)src;
casti_m512i( d, 0 ) = _mm512_set1_epi32( bswap_32( s[ 0] ) );
casti_m512i( d, 1 ) = _mm512_set1_epi32( bswap_32( s[ 1] ) );
casti_m512i( d, 2 ) = _mm512_set1_epi32( bswap_32( s[ 2] ) );
casti_m512i( d, 3 ) = _mm512_set1_epi32( bswap_32( s[ 3] ) );
casti_m512i( d, 4 ) = _mm512_set1_epi32( bswap_32( s[ 4] ) );
casti_m512i( d, 5 ) = _mm512_set1_epi32( bswap_32( s[ 5] ) );
casti_m512i( d, 6 ) = _mm512_set1_epi32( bswap_32( s[ 6] ) );
casti_m512i( d, 7 ) = _mm512_set1_epi32( bswap_32( s[ 7] ) );
casti_m512i( d, 8 ) = _mm512_set1_epi32( bswap_32( s[ 8] ) );
casti_m512i( d, 9 ) = _mm512_set1_epi32( bswap_32( s[ 9] ) );
casti_m512i( d,10 ) = _mm512_set1_epi32( bswap_32( s[10] ) );
casti_m512i( d,11 ) = _mm512_set1_epi32( bswap_32( s[11] ) );
casti_m512i( d,12 ) = _mm512_set1_epi32( bswap_32( s[12] ) );
casti_m512i( d,13 ) = _mm512_set1_epi32( bswap_32( s[13] ) );
casti_m512i( d,14 ) = _mm512_set1_epi32( bswap_32( s[14] ) );
casti_m512i( d,15 ) = _mm512_set1_epi32( bswap_32( s[15] ) );
casti_m512i( d,16 ) = _mm512_set1_epi32( bswap_32( s[16] ) );
casti_m512i( d,17 ) = _mm512_set1_epi32( bswap_32( s[17] ) );
casti_m512i( d,18 ) = _mm512_set1_epi32( bswap_32( s[18] ) );
casti_m512i( d,19 ) = _mm512_set1_epi32( bswap_32( s[19] ) );
}
/*
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
@@ -1818,6 +1896,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
casti_m128i( d,78 ) =
casti_m128i( d,79 ) = _mm_shuffle_epi32( s4 , 0xff );
}
*/
#endif // AVX512
@@ -2470,6 +2549,25 @@ static inline void extr_lane_8x64( void *d, const void *s,
#if defined(__AVX512F__) && defined(__AVX512VL__)
// broadcast to all lanes
static inline void mm512_intrlv80_8x64( void *dst, const void *src )
{
__m128i *d = (__m128i*)dst;
const __m128i *s = (const __m128i*)src;
d[ 0] = d[ 1] = d[ 2] = d[ 3] = _mm_shuffle_epi32( s[0], 0x44 );
d[ 4] = d[ 5] = d[ 6] = d[ 7] = _mm_shuffle_epi32( s[0], 0xee );
d[ 8] = d[ 9] = d[10] = d[11] = _mm_shuffle_epi32( s[1], 0x44 );
d[12] = d[13] = d[14] = d[15] = _mm_shuffle_epi32( s[1], 0xee );
d[16] = d[17] = d[18] = d[19] = _mm_shuffle_epi32( s[2], 0x44 );
d[20] = d[21] = d[22] = d[23] = _mm_shuffle_epi32( s[2], 0xee );
d[24] = d[25] = d[26] = d[27] = _mm_shuffle_epi32( s[3], 0x44 );
d[28] = d[29] = d[30] = d[31] = _mm_shuffle_epi32( s[3], 0xee );
d[32] = d[33] = d[34] = d[35] = _mm_shuffle_epi32( s[4], 0x44 );
d[36] = d[37] = d[38] = d[39] = _mm_shuffle_epi32( s[4], 0xee );
}
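The 0x44 / 0xee immediates duplicate one 64-bit half of the source into both halves of a __m128i, so four identical stores fill a full 512-bit row:

/* _mm_shuffle_epi32( x, 0x44 ) -> { x.q0, x.q0 }  (low  qword twice)
   _mm_shuffle_epi32( x, 0xee ) -> { x.q1, x.q1 }  (high qword twice)
   Writing one such value to four consecutive __m128i slots broadcasts
   the header qword across all eight 64-bit lanes of the 8x64 layout. */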
// byte swap and broadcast to all lanes
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
@@ -2556,6 +2654,10 @@ static inline void intrlv_2x128( void *dst, const void *src0,
d[10] = s0[5]; d[11] = s1[5];
d[12] = s0[6]; d[13] = s1[6];
d[14] = s0[7]; d[15] = s1[7];
if ( bit_len <= 1024 ) return;
d[16] = s0[8]; d[17] = s1[8];
d[18] = s0[9]; d[19] = s1[9];
// if ( bit_len <= 1280 ) return;
}
static inline void intrlv_2x128_512( void *dst, const void *src0,
@@ -2623,6 +2725,10 @@ static inline void intrlv_4x128( void *dst, const void *src0,
d[20] = s0[5]; d[21] = s1[5]; d[22] = s2[5]; d[23] = s3[5];
d[24] = s0[6]; d[25] = s1[6]; d[26] = s2[6]; d[27] = s3[6];
d[28] = s0[7]; d[29] = s1[7]; d[30] = s2[7]; d[31] = s3[7];
if ( bit_len <= 1024 ) return;
d[32] = s0[8]; d[33] = s1[8]; d[34] = s2[8]; d[35] = s3[8];
d[36] = s0[9]; d[37] = s1[9]; d[38] = s2[9]; d[39] = s3[9];
// if ( bit_len <= 1280 ) return;
}
static inline void intrlv_4x128_512( void *dst, const void *src0,

View File

@@ -411,7 +411,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
// Limited 2 input shuffle
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
#define mm128_shuffle2_64( a, b, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
_mm_castsi128_pd( b ), c ) );
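A hedged illustration of the selection rule (q0/q1 are the low/high 64-bit elements):

/* mm128_shuffle2_64( a, b, 0 ) -> { a.q0, b.q0 }
   mm128_shuffle2_64( a, b, 3 ) -> { a.q1, b.q1 }
   bit 0 of c selects the qword taken from a, bit 1 the qword from b. */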
@@ -545,14 +546,13 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
// Two input shuffle-rotate.
// Concatenate v1 & v2 and rotate as one 256 bit vector.
// Continue to use vror/vrol for now to avoid confusion with
// shufl2r/shufl2l function macros available with AVX512.
// Concatenate v1 & v2 and bit rotate as one 256 bit vector.
#if defined(__SSSE3__)
// Function macro with two inputs and one output, inputs are preserved.
// Two input functions are not available without SSSE3. Use procedure
// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, ie updated v1.
// These two-input functions are not available without SSSE3. Use procedure
// macros below instead.
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
@@ -567,12 +567,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
// Procedure macros with 2 inputs and 2 outputs, inputs args are overwritten.
// These macros retain the vrol/vror name for now to avoid
// confusion with the shufl2r/shuffle2l function macros above.
// These may be renamed to something like shufl2r2 for 2 inputs and
// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// Deprecated for SSSE3 and above; they exist only for compatibility with
// existing code. The function macros above can be used more efficiently.
#define mm128_vror256_64( v1, v2 ) \
do { \

View File

@@ -442,8 +442,14 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
//
// Swap bytes in vector elements, endian bswap.
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask can be used if
// a shuffle that crosses 128 bit lanes is needed. BSWAP doesn't cross lanes,
// so the AVX2 version works here. The bswap control vector is coded to work
// with both versions; bit 4 is ignored in AVX2.
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \

View File

@@ -15,13 +15,14 @@
// AVX512 intrinsics have a few changes from previous conventions.
//
// cmp instruction now returns a bitmask isnstead of a vector mask.
// cmp instruction now returns a bitmask instead of a vector mask.
// This eliminates the need for the blendv instruction.
//
// The new rotate instructions require the count to be an 8 bit
// immediate value only. Compilation fails if a variable is used.
// The documentation is the same as for shift and it works with
// variables.
// variables. The inconsistency is likely due to compiler optimizations
// that can eliminate the variable in some instances.
//
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
// usually shuffles accross all lanes.
@@ -317,6 +318,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
// elements and can be called directly. But they only accept immediate 8
// for control arg.
// The workaround is a fraud, just a fluke of the compiler's optimizer.
// It fails without -O3. The compiler seems to unroll shift loops, eliminating
// the variable control, better than rotate loops.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
@@ -429,21 +433,9 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
} while(0)
//
// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
//
// rename plan change ror to vror for Vector ROtate Right,
// and vrol for Vector ROtate Left, not to be confused with
//variable rotate rorv, rolv,
// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate
// operation. 1xNN notation is also removed and replaced with simpler NN.
// Swap will still have its own mnemonic and will be aliased as both
// left and right shuffles.
// Shift elements right or left in 512 bit vector, filling with zeros.
// Multiple element shifts can be combined into a single larger
// element shift.
// Cross-lane shuffles implementing rotate & shift of elements within a vector.
//
#define mm512_shiftr_256( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 4 )
@@ -529,7 +521,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_shuflr256_128 mm512_swap256_128
#define mm512_shufll256_128 mm512_swap256_128
@@ -583,7 +575,9 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
// Limited 2 input, 1 output shuffle within 128 bit lanes.
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source.
#define mm512_shuffle2_64( a, b, c ) \
_mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
_mm512_castsi512_pd( b ), c ) );
@@ -620,11 +614,7 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
// Drop macros? They can easily be rebuilt using shufl2 functions
// 2 input, 1 output
// Shuffle concatenated { v1, v2 ) right or left by 256 bits and return
// rotated v1
// visually confusing for shufl2r because of arg order. First arg is always
// the target for modification, either update by reference or by function
// return.
// Rotate concatenated ( v1, v2 ) right or left and return v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )

View File

@@ -502,6 +502,28 @@ static inline bool has_vaes()
#endif
}
static inline bool has_vbmi()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512VBMI_Flag;
#endif
}
static inline bool has_vbmi2()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512VBMI2_Flag;
#endif
}
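These follow the pattern of the surrounding has_* helpers; a hedged usage sketch, assuming this header is included:

#include <stdio.h>

static void report_vbmi_support( void )
{
    if ( has_vbmi()  ) printf( "AVX512-VBMI  available\n" );
    if ( has_vbmi2() ) printf( "AVX512-VBMI2 available\n" );
}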
// AMD only
static inline bool has_xop()
{

util.c
View File

@@ -1542,11 +1542,20 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
free(sctx->url);
sctx->url = strdup(url);
}
free(sctx->curl_url);
free(sctx->curl_url);
sctx->curl_url = (char*) malloc( strlen(url) + 1 );  // +1 for the terminating NUL
sprintf( sctx->curl_url, "http%s", strstr( url, "s://" )
? strstr( url, "s://" )
: strstr (url, "://" ) );
// replace the stratum protocol prefix with http, https for ssl
sprintf( sctx->curl_url, "%s%s",
( strstr( url, "s://" ) || strstr( url, "ssl://" ) )
? "https" : "http", strstr( url, "://" ) );
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
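The rewrite maps any secure stratum scheme to https and the rest to http, which is the form libcurl expects. A standalone sketch of the same rule (the function name is illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *stratum_to_curl_url( const char *url )
{
    const char *rest = strstr( url, "://" );          /* scheme tail    */
    if ( !rest ) return NULL;
    const int ssl = strstr( url, "s://" ) != NULL
                 || strstr( url, "ssl://" ) != NULL;
    char *out = malloc( strlen( rest ) + 5 + 1 );     /* "https" + NUL  */
    if ( out ) sprintf( out, "%s%s", ssl ? "https" : "http", rest );
    return out;   /* e.g. stratum+tcps://pool:4444 -> https://pool:4444 */
}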