v3.19.9

v3.19.8
v3.19.7
2025-09-17 23:44:27 +00:00 · 2022-07-10 11:04:00 -04:00 · 2022-05-27 18:12:30 -04:00 · 2022-04-02 12:44:57 -04:00 · 2022-02-21 23:14:24 -05:00 · 2022-01-30 20:59:54 -05:00
67 changed files with 4446 additions and 1679 deletions
--- a/.RELEASE_NOTES.swp
+++ b/.RELEASE_NOTES.swp
--- a/Makefile.am
+++ b/Makefile.am
@@ -21,6 +21,7 @@ cpuminer_SOURCES = \
  api.c \
  sysinfos.c \
  algo-gate-api.c\
+  malloc-huge.c \
  algo/argon2/argon2a/argon2a.c \
  algo/argon2/argon2a/ar2/argon2.c \
  algo/argon2/argon2a/ar2/opt.c \
--- a/58
+++ b/58
@@ -22,7 +22,7 @@ required.
 Compile Instructions
 --------------------

-See INSTALL_LINUX or INSTALL_WINDOWS for compile instruuctions
+See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions

 Requirements
 ------------
@@ -65,6 +65,62 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v3.19.9
+
+More Blake256, Blake512, Luffa & Cubehash prehash optimizations.
+Relaxed some excessively strict data alignment that was negatively affecting performance.
+
+v3.19.8
+
+#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
+url protocol specifier for requesting a secure stratum connection.
+
+The full url, including the protocol, is now displayed in the stratum connect
+log and the periodic summary log.
+
+Small optimizations to Cubehash, AVX2 & AVX512.
+
+Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
+
+v3.19.7
+
+#369 Fixed time limited mining, --time-limit.
+Fixed a potential compile error when using optimization below -O3.
+
+v3.19.6
+
+#363 Fixed a stratum bug where the first job may be ignored delaying start of hashing
+Fixed handling of nonce exhaust when hashing a fast algo with extranonce disabled
+Small optimization to Shavite.
+
+v3.19.5
+
+Enhanced stratum-keepalive preemptively resets the stratum connection
+before the server to avoid lost shares.
+
+Added build-msys2.sh shell script for easier compiling on Windows, see Wiki for details.
+
+X16RT: eliminate unnecessary recalculations of the hash order.
+
+Fix a few compiler warnings.
+
+Fixed log colour error when a block is solved.
+
+v3.19.4
+
+#359: Fix verthash memory allocation for non-hugepages, broken in v3.19.3.
+
+New option stratum-keepalive prevents stratum timeouts when no shares are
+submitted for several minutes due to high difficulty.
+
+Fixed a bug displaying optimizations for some algos.
+
+v3.19.3
+
+Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available.
+
+Small speed up for Hamsi AVX2 & AVX512, Keccak AVX512.
+
 v3.19.2

 Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1.
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h
@@ -344,7 +344,7 @@ static size_t
 detect_cpu(void) {
 	//union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
 	//cpu_vendors_x86 vendor = cpu_nobody;
-	x86_regs regs;
+	x86_regs regs; regs.eax = regs.ebx = regs.ecx = 0;
 	uint32_t max_level, max_ext_level;
 	size_t cpu_flags = 0;
 #if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
@@ -460,4 +460,4 @@ get_top_cpuflag_desc(size_t flag) {
 	#endif
 #endif

-#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
+#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h
@@ -4,11 +4,12 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc
 #endif

 /* romix pre/post nop function */
+/*
 static void asm_calling_convention
 scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
 	(void)blocks; (void)nblocks;
 }
-
+*/
 /* romix pre/post endian conversion function */
 static void asm_calling_convention
 scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
--- a/algo/argon2/argon2d/argon2d/opt.c
+++ b/algo/argon2/argon2d/argon2d/opt.c
@@ -37,6 +37,13 @@

 #if defined(__AVX512F__)

+static inline __m512i blamka( __m512i x, __m512i y )
+{
+    __m512i xy = _mm512_mul_epu32( x, y );
+    return _mm512_add_epi64( _mm512_add_epi64( x, y ),
+                             _mm512_add_epi64( xy, xy ) );
+}
+
 static void fill_block( __m512i *state, const block *ref_block,
                       block *next_block, int with_xor )
 {
--- a/algo/argon2/argon2d/blake2/blamka-round-opt.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h
@@ -328,9 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {

 #include <immintrin.h>

-#define ROR64(x, n) _mm512_ror_epi64((x), (n))
-
-static __m512i muladd(__m512i x, __m512i y)
+static inline __m512i muladd(__m512i x, __m512i y)
 {
    __m512i z = _mm512_mul_epu32(x, y);
    return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
@@ -344,8 +342,8 @@ static __m512i muladd(__m512i x, __m512i y)
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
 \
-        D0 = ROR64(D0, 32); \
-        D1 = ROR64(D1, 32); \
+        D0 = _mm512_ror_epi64(D0, 32); \
+        D1 = _mm512_ror_epi64(D1, 32); \
 \
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
@@ -353,8 +351,8 @@ static __m512i muladd(__m512i x, __m512i y)
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
 \
-        B0 = ROR64(B0, 24); \
-        B1 = ROR64(B1, 24); \
+        B0 = _mm512_ror_epi64(B0, 24); \
+        B1 = _mm512_ror_epi64(B1, 24); \
    } while ((void)0, 0)

 #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -365,8 +363,8 @@ static __m512i muladd(__m512i x, __m512i y)
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
 \
-        D0 = ROR64(D0, 16); \
-        D1 = ROR64(D1, 16); \
+        D0 = _mm512_ror_epi64(D0, 16); \
+        D1 = _mm512_ror_epi64(D1, 16); \
 \
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
@@ -374,8 +372,8 @@ static __m512i muladd(__m512i x, __m512i y)
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
 \
-        B0 = ROR64(B0, 63); \
-        B1 = ROR64(B1, 63); \
+        B0 = _mm512_ror_epi64(B0, 63); \
+        B1 = _mm512_ror_epi64(B1, 63); \
    } while ((void)0, 0)

 #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -417,11 +415,10 @@ static __m512i muladd(__m512i x, __m512i y)

 #define SWAP_HALVES(A0, A1) \
    do { \
-        __m512i t0, t1; \
-        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
-        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
-        A0 = t0; \
-        A1 = t1; \
+        __m512i t; \
+        t = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
+        A1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
+        A0 = t; \
    } while((void)0, 0)

 #define SWAP_QUARTERS(A0, A1) \
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -49,6 +49,20 @@ extern "C"{

 #define SPH_SIZE_blake512   512

+/////////////////////////
+//
+//  Blake-256 1 way SSE2
+
+void  blake256_transform_le( uint32_t *H, const uint32_t *buf,
+                             const uint32_t T0, const uint32_t T1 );
+
+/////////////////////////
+//
+//  Blake-512 1 way SSE2
+
+void  blake512_transform_le( uint64_t *H, const uint64_t *buf,
+                             const uint64_t T0, const uint64_t T1 );
+
 //////////////////////////
 //
 //   Blake-256 4 way SSE2
@@ -98,6 +112,12 @@ typedef blake_8way_small_context blake256_8way_context;
 void blake256_8way_init(void *cc);
 void blake256_8way_update(void *cc, const void *data, size_t len);
 void blake256_8way_close(void *cc, void *dst);
+void blake256_8way_update_le(void *cc, const void *data, size_t len);
+void blake256_8way_close_le(void *cc, void *dst);
+void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
+                                       const void *data );
+void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
+                                     const void *midhash, const void *data );

 // 14 rounds, blake, decred
 typedef blake_8way_small_context blake256r14_8way_context;
@@ -128,6 +148,12 @@ void blake512_4way_update( void *cc, const void *data, size_t len );
 void blake512_4way_close( void *cc, void *dst );
 void blake512_4way_full( blake_4way_big_context *sc, void * dst,
                         const void *data, size_t len );
+void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
+                            const void *data, size_t len );
+void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
+                               const void *data );
+void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
+                             const __m256i nonce, const __m256i *midstate );

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

@@ -148,6 +174,14 @@ typedef blake_16way_small_context blake256_16way_context;
 void blake256_16way_init(void *cc);
 void blake256_16way_update(void *cc, const void *data, size_t len);
 void blake256_16way_close(void *cc, void *dst);
+// Expects data in little endian order, no byte swap needed
+void blake256_16way_update_le(void *cc, const void *data, size_t len);
+void blake256_16way_close_le(void *cc, void *dst);
+void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
+                                       const void *data );
+void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
+                                     const void *midhash, const void *data );
+

 // 14 rounds, blake, decred
 typedef blake_16way_small_context blake256r14_16way_context;
@@ -180,7 +214,12 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
 void blake512_8way_close( void *cc, void *dst );
 void blake512_8way_full( blake_8way_big_context *sc, void * dst,
                        const void *data, size_t len );
-void blake512_8way_hash_le80( void *hash, const void *data );
+void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
+                            const void *data, size_t len );
+void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
+                               const void *data );
+void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
+                             const __m512i nonce, const __m512i *midstate );

 #endif  // AVX512
 #endif  // AVX2
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -5,6 +5,7 @@
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *               2016-2022  JayDDee246@gmail.com
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
@@ -304,6 +305,98 @@ static const sph_u32 CS[16] = {

 #endif

+/////////////////////////////////////////
+//
+// Blake-256 1 way SIMD
+
+#define BLAKE256_ROUND( r ) \
+{ \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
+                                          CSx( r, 5 ) ^ Mx( r, 4 ), \
+                                          CSx( r, 3 ) ^ Mx( r, 2 ), \
+                                          CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
+   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
+                                          CSx( r, 4 ) ^ Mx( r, 5 ), \
+                                          CSx( r, 2 ) ^ Mx( r, 3 ), \
+                                          CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
+   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
+   V3 = mm128_shufll_32( V3 ); \
+   V2 = mm128_swap_64( V2 ); \
+   V1 = mm128_shuflr_32( V1 ); \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, F ) ^ Mx( r, E ), \
+                                          CSx( r, D ) ^ Mx( r, C ), \
+                                          CSx( r, B ) ^ Mx( r, A ), \
+                                          CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); \
+   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, E ) ^ Mx( r, F ), \
+                                          CSx( r, C ) ^ Mx( r, D ), \
+                                          CSx( r, A ) ^ Mx( r, B ), \
+                                          CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
+   V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
+   V3 = mm128_shuflr_32( V3 ); \
+   V2 = mm128_swap_64( V2 ); \
+   V1 = mm128_shufll_32( V1 ); \
+}
+
+void blake256_transform_le( uint32_t *H, const uint32_t *buf,
+                            const uint32_t T0, const uint32_t T1 )
+{
+   __m128i V0, V1, V2, V3;
+   uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
+   V0 = casti_m128i( H, 0 );
+   V1 = casti_m128i( H, 1 );
+   V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
+   V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
+                       T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );
+   M0 = buf[ 0];
+   M1 = buf[ 1];
+   M2 = buf[ 2];
+   M3 = buf[ 3];
+   M4 = buf[ 4];
+   M5 = buf[ 5];
+   M6 = buf[ 6];
+   M7 = buf[ 7];
+   M8 = buf[ 8];
+   M9 = buf[ 9];
+   MA = buf[10];
+   MB = buf[11];
+   MC = buf[12];
+   MD = buf[13];
+   ME = buf[14];
+   MF = buf[15];
+   BLAKE256_ROUND( 0 );
+   BLAKE256_ROUND( 1 );
+   BLAKE256_ROUND( 2 );
+   BLAKE256_ROUND( 3 );
+   BLAKE256_ROUND( 4 );
+   BLAKE256_ROUND( 5 );
+   BLAKE256_ROUND( 6 );
+   BLAKE256_ROUND( 7 );
+   BLAKE256_ROUND( 8 );
+   BLAKE256_ROUND( 9 );
+   BLAKE256_ROUND( 0 );
+   BLAKE256_ROUND( 1 );
+   BLAKE256_ROUND( 2 );
+   BLAKE256_ROUND( 3 );
+   casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
+   casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
+}
+
+////////////////////////////////////////////
+//
 // Blake-256 4 way

 #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
@@ -508,14 +601,10 @@ do { \
   V9 = m128_const1_64( 0x85A308D385A308D3 ); \
   VA = m128_const1_64( 0x13198A2E13198A2E ); \
   VB = m128_const1_64( 0x0370734403707344 ); \
-   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
-                           m128_const1_64( 0xA4093822A4093822 ) ); \
-   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
-                           m128_const1_64( 0x299F31D0299F31D0 ) ); \
-   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \
-                           m128_const1_64( 0x082EFA98082EFA98 ) ); \
-   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \
-                           m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
+   VC = _mm_set1_epi32( T0 ^ 0xA4093822 ); \
+   VD = _mm_set1_epi32( T0 ^ 0x299F31D0 ); \
+   VE = _mm_set1_epi32( T1 ^ 0x082EFA98 ); \
+   VF = _mm_set1_epi32( T1 ^ 0xEC4E6C89 ); \
   BLAKE256_4WAY_BLOCK_BSWAP32; \
   ROUND_S_4WAY(0); \
   ROUND_S_4WAY(1); \
@@ -548,6 +637,8 @@ do { \

 #if defined (__AVX2__)

+/////////////////////////////////
+//
 // Blake-256 8 way

 #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
@@ -626,14 +717,10 @@ do { \
   V9 = m256_const1_64( 0x85A308D385A308D3 ); \
   VA = m256_const1_64( 0x13198A2E13198A2E ); \
   VB = m256_const1_64( 0x0370734403707344 ); \
-   VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
-                              m256_const1_64( 0xA4093822A4093822 ) ); \
-   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
-                              m256_const1_64( 0x299F31D0299F31D0 ) ); \
-   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
-                              m256_const1_64( 0x082EFA98082EFA98 ) ); \
-   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
-                              m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
+   VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
+   VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
+   VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
+   VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
   shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
@@ -679,13 +766,247 @@ do { \
   H7 = mm256_xor3( VF, V7, H7 ); \
 } while (0)

+#define COMPRESS32_8WAY_LE( rounds ) \
+do { \
+   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
+   __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
+   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
+   __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
+   V0 = H0; \
+   V1 = H1; \
+   V2 = H2; \
+   V3 = H3; \
+   V4 = H4; \
+   V5 = H5; \
+   V6 = H6; \
+   V7 = H7; \
+   V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
+   V9 = m256_const1_64( 0x85A308D385A308D3 ); \
+   VA = m256_const1_64( 0x13198A2E13198A2E ); \
+   VB = m256_const1_64( 0x0370734403707344 ); \
+   VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
+   VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
+   VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
+   VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
+   M0 = buf[ 0]; \
+   M1 = buf[ 1]; \
+   M2 = buf[ 2]; \
+   M3 = buf[ 3]; \
+   M4 = buf[ 4]; \
+   M5 = buf[ 5]; \
+   M6 = buf[ 6]; \
+   M7 = buf[ 7]; \
+   M8 = buf[ 8]; \
+   M9 = buf[ 9]; \
+   MA = buf[10]; \
+   MB = buf[11]; \
+   MC = buf[12]; \
+   MD = buf[13]; \
+   ME = buf[14]; \
+   MF = buf[15]; \
+   ROUND_S_8WAY(0); \
+   ROUND_S_8WAY(1); \
+   ROUND_S_8WAY(2); \
+   ROUND_S_8WAY(3); \
+   ROUND_S_8WAY(4); \
+   ROUND_S_8WAY(5); \
+   ROUND_S_8WAY(6); \
+   ROUND_S_8WAY(7); \
+   if (rounds == 14) \
+   { \
+      ROUND_S_8WAY(8); \
+      ROUND_S_8WAY(9); \
+      ROUND_S_8WAY(0); \
+      ROUND_S_8WAY(1); \
+      ROUND_S_8WAY(2); \
+      ROUND_S_8WAY(3); \
+   } \
+   H0 = mm256_xor3( V8, V0, H0 ); \
+   H1 = mm256_xor3( V9, V1, H1 ); \
+   H2 = mm256_xor3( VA, V2, H2 ); \
+   H3 = mm256_xor3( VB, V3, H3 ); \
+   H4 = mm256_xor3( VC, V4, H4 ); \
+   H5 = mm256_xor3( VD, V5, H5 ); \
+   H6 = mm256_xor3( VE, V6, H6 ); \
+   H7 = mm256_xor3( VF, V7, H7 ); \
+} while (0)
+
+void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
+                                       const void *data )
+{
+   const __m256i *M = (const __m256i*)data;
+   __m256i *V = (__m256i*)midstate;
+   const __m256i *H = (const __m256i*)midhash;
+
+   V[ 0] = H[0];
+   V[ 1] = H[1];
+   V[ 2] = H[2];
+   V[ 3] = H[3];
+   V[ 4] = H[4];
+   V[ 5] = H[5];
+   V[ 6] = H[6];
+   V[ 7] = H[7];
+   V[ 8] = m256_const1_32( CS0 );
+   V[ 9] = m256_const1_32( CS1 );
+   V[10] = m256_const1_32( CS2 );
+   V[11] = m256_const1_32( CS3 );
+   V[12] = m256_const1_32( CS4 ^ 0x280 );
+   V[13] = m256_const1_32( CS5 ^ 0x280 );
+   V[14] = m256_const1_32( CS6 );
+   V[15] = m256_const1_32( CS7 );
+
+   // G0   
+   GS_8WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
+
+   // G1   
+   V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ),
+                         _mm256_xor_si256( _mm256_set1_epi32( CS3 ), M[ 2] ) );
+   V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 );
+   V[ 9] = _mm256_add_epi32( V[ 9], V[13] );
+   V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
+   V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
+
+   // G2,G3
+   GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
+   GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
+
+   // G4   
+   V[ 0] = _mm256_add_epi32( V[ 0],
+                         _mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) );
+
+   // G6   
+   V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 7] ),
+                         _mm256_xor_si256( _mm256_set1_epi32( CSD ), M[12] ) );
+
+   // G7   
+   V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
+                         _mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
+   V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 );
+   V[ 3] = _mm256_add_epi32( V[ 3],
+                         _mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
+}
+
+void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
+                                     const void *midhash, const void *data )
+{
+   __m256i *H = (__m256i*)final_hash;
+   const __m256i *h = (const __m256i*)midhash;
+   const __m256i *v= (const __m256i*)midstate;
+   __m256i V0, V1, V2, V3, V4, V5, V6, V7;
+   __m256i V8, V9, VA, VB, VC, VD, VE, VF;
+   __m256i M0, M1, M2, M3, M4, M5, M6, M7;
+   __m256i M8, M9, MA, MB, MC, MD, ME, MF;
+
+   V0 = v[ 0];
+   V1 = v[ 1];
+   V2 = v[ 2];
+   V3 = v[ 3];
+   V4 = v[ 4];
+   V5 = v[ 5];
+   V6 = v[ 6];
+   V7 = v[ 7];
+   V8 = v[ 8];
+   V9 = v[ 9];
+   VA = v[10];
+   VB = v[11];
+   VC = v[12];
+   VD = v[13];
+   VE = v[14];
+   VF = v[15];
+
+   M0 = casti_m256i( data,  0 );
+   M1 = casti_m256i( data,  1 );
+   M2 = casti_m256i( data,  2 );
+   M3 = casti_m256i( data,  3 );
+   M4 = casti_m256i( data,  4 );
+   M5 = casti_m256i( data,  5 );
+   M6 = casti_m256i( data,  6 );
+   M7 = casti_m256i( data,  7 );
+   M8 = casti_m256i( data,  8 );
+   M9 = casti_m256i( data,  9 );
+   MA = casti_m256i( data, 10 );
+   MB = casti_m256i( data, 11 );
+   MC = casti_m256i( data, 12 );
+   MD = casti_m256i( data, 13 );
+   ME = casti_m256i( data, 14 );
+   MF = casti_m256i( data, 15 );
+
+   // Finish round 0   
+   // G1   
+   V1 = _mm256_add_epi32( V1,
+                         _mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
+   VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 );
+   V9 = _mm256_add_epi32( V9, VD );
+   V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 );
+
+   // G4
+   V0 = _mm256_add_epi32( V0, V5 );
+   VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 );
+   VA = _mm256_add_epi32( VA, VF );
+   V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
+   V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
+                         _mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
+   VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 );
+   VA = _mm256_add_epi32( VA, VF );
+   V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
+
+   // G5
+   GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
+
+   // G6
+   VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 );
+   V8 = _mm256_add_epi32( V8, VD );
+   V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
+   V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
+                         _mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
+   VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 );
+   V8 = _mm256_add_epi32( V8, VD );
+   V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
+
+   // G7
+   V9 = _mm256_add_epi32( V9, VE );
+   V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 );
+   V3 = _mm256_add_epi32( V3, V4 );
+   VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 );
+   V9 = _mm256_add_epi32( V9, VE );
+   V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
+
+   // Remaining rounds   
+   ROUND_S_8WAY( 1 );
+   ROUND_S_8WAY( 2 );
+   ROUND_S_8WAY( 3 );
+   ROUND_S_8WAY( 4 );
+   ROUND_S_8WAY( 5 );
+   ROUND_S_8WAY( 6 );
+   ROUND_S_8WAY( 7 );
+   ROUND_S_8WAY( 8 );
+   ROUND_S_8WAY( 9 );
+   ROUND_S_8WAY( 0 );
+   ROUND_S_8WAY( 1 );
+   ROUND_S_8WAY( 2 );
+   ROUND_S_8WAY( 3 );
+
+   const __m256i shuf_bswap32 =
+                  m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
+                                 0x0c0d0e0f08090a0b, 0x0405060700010203 );
+
+   H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
+   H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
+   H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
+   H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
+   H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
+   H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
+   H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
+   H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
+}

 #endif

-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-// Blaske-256 16 way AVX512
+///////////////////////////////////////
+//
+// Blake-256 16 way AVX512

 #define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
@@ -763,14 +1084,10 @@ do { \
   V9 = m512_const1_64( 0x85A308D385A308D3 ); \
   VA = m512_const1_64( 0x13198A2E13198A2E ); \
   VB = m512_const1_64( 0x0370734403707344 ); \
-   VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
-                              m512_const1_64( 0xA4093822A4093822 ) ); \
-   VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
-                              m512_const1_64( 0x299F31D0299F31D0 ) ); \
-   VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
-                              m512_const1_64( 0x082EFA98082EFA98 ) ); \
-   VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
-                              m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
+   VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
+   VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
+   VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
+   VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
   shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
                                 0x2c2d2e2f28292a2b, 0x2425262720212223, \
                                 0x1c1d1e1f18191a1b, 0x1415161710111213, \
@@ -818,6 +1135,264 @@ do { \
   H7 = mm512_xor3( VF, V7, H7 ); \
 } while (0)

+#define COMPRESS32_16WAY_LE( rounds ) \
+do { \
+   __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
+   __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
+   __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
+   __m512i V8, V9, VA, VB, VC, VD, VE, VF; \
+   V0 = H0; \
+   V1 = H1; \
+   V2 = H2; \
+   V3 = H3; \
+   V4 = H4; \
+   V5 = H5; \
+   V6 = H6; \
+   V7 = H7; \
+   V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
+   V9 = m512_const1_64( 0x85A308D385A308D3 ); \
+   VA = m512_const1_64( 0x13198A2E13198A2E ); \
+   VB = m512_const1_64( 0x0370734403707344 ); \
+   VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
+   VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
+   VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
+   VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
+   M0 = buf[ 0]; \
+   M1 = buf[ 1]; \
+   M2 = buf[ 2]; \
+   M3 = buf[ 3]; \
+   M4 = buf[ 4]; \
+   M5 = buf[ 5]; \
+   M6 = buf[ 6]; \
+   M7 = buf[ 7]; \
+   M8 = buf[ 8]; \
+   M9 = buf[ 9]; \
+   MA = buf[10]; \
+   MB = buf[11]; \
+   MC = buf[12]; \
+   MD = buf[13]; \
+   ME = buf[14]; \
+   MF = buf[15]; \
+   ROUND_S_16WAY(0); \
+   ROUND_S_16WAY(1); \
+   ROUND_S_16WAY(2); \
+   ROUND_S_16WAY(3); \
+   ROUND_S_16WAY(4); \
+   ROUND_S_16WAY(5); \
+   ROUND_S_16WAY(6); \
+   ROUND_S_16WAY(7); \
+   if (rounds == 14) \
+   { \
+      ROUND_S_16WAY(8); \
+      ROUND_S_16WAY(9); \
+      ROUND_S_16WAY(0); \
+      ROUND_S_16WAY(1); \
+      ROUND_S_16WAY(2); \
+      ROUND_S_16WAY(3); \
+   } \
+   H0 = mm512_xor3( V8, V0, H0 ); \
+   H1 = mm512_xor3( V9, V1, H1 ); \
+   H2 = mm512_xor3( VA, V2, H2 ); \
+   H3 = mm512_xor3( VB, V3, H3 ); \
+   H4 = mm512_xor3( VC, V4, H4 ); \
+   H5 = mm512_xor3( VD, V5, H5 ); \
+   H6 = mm512_xor3( VE, V6, H6 ); \
+   H7 = mm512_xor3( VF, V7, H7 ); \
+} while (0)
+
+// Blake-256 prehash of the second block is split onto 2 parts. The first part
+// is constant for every nonce and only needs to be run once per job. The
+// second part is run for each nonce using the precalculated midstate and the
+// hash from the first block.
+void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
+                                       const void *data )
+{
+   const __m512i *M = (const __m512i*)data;
+   __m512i *V = (__m512i*)midstate;
+   const __m512i *H = (const __m512i*)midhash;
+
+   V[ 0] = H[0];
+   V[ 1] = H[1];
+   V[ 2] = H[2];
+   V[ 3] = H[3];
+   V[ 4] = H[4];
+   V[ 5] = H[5];
+   V[ 6] = H[6];
+   V[ 7] = H[7];
+   V[ 8] = m512_const1_32( CS0 );
+   V[ 9] = m512_const1_32( CS1 );
+   V[10] = m512_const1_32( CS2 );
+   V[11] = m512_const1_32( CS3 );
+   V[12] = m512_const1_32( CS4 ^ 0x280 );
+   V[13] = m512_const1_32( CS5 ^ 0x280 );
+   V[14] = m512_const1_32( CS6 );
+   V[15] = m512_const1_32( CS7 );
+
+   // G0   
+   GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
+
+   // G1, nonce is in M[3]   
+   // GS_16WAY( M[ 2], M[ 3], CS2, CS3, V1, V5, V9, VD );
+   V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ),
+                         _mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) );
+   V[13] = mm512_ror_32( _mm512_xor_si512( V[13], V[ 1] ), 16 );
+   V[ 9] = _mm512_add_epi32( V[ 9], V[13] );
+   V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 );
+   V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] );
+
+   // G2,G3
+   GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
+   GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
+
+   // G4   
+   // GS_16WAY( M[ 8], M[ 9], CS8, CS9, V0, V5, VA, VF );
+   V[ 0] = _mm512_add_epi32( V[ 0],
+                         _mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) ); 
+   
+   // G5
+   // GS_16WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
+
+   // G6   
+   // GS_16WAY( M[12], M[13], CSC, CSD, V2, V7, V8, VD );
+   V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 7] ),
+                         _mm512_xor_si512( _mm512_set1_epi32( CSD ), M[12] ) );
+   // G7   
+   // GS_16WAY( M[14], M[15], CSE, CSF, V3, V4, V9, VE );
+   V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 4] ),
+                         _mm512_xor_si512( _mm512_set1_epi32( CSF ), M[14] ) );
+   V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 3] ), 16 );
+   V[ 3] = _mm512_add_epi32( V[ 3],
+                         _mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
+}
+
+void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
+                                     const void *midhash, const void *data )
+{
+   __m512i *H = (__m512i*)final_hash;
+   const __m512i *h = (const __m512i*)midhash;
+   const __m512i *v= (const __m512i*)midstate;
+   __m512i V0, V1, V2, V3, V4, V5, V6, V7;
+   __m512i V8, V9, VA, VB, VC, VD, VE, VF;
+   __m512i M0, M1, M2, M3, M4, M5, M6, M7;
+   __m512i M8, M9, MA, MB, MC, MD, ME, MF;
+
+   V0 = v[ 0];
+   V1 = v[ 1];
+   V2 = v[ 2];
+   V3 = v[ 3];
+   V4 = v[ 4];
+   V5 = v[ 5];
+   V6 = v[ 6];
+   V7 = v[ 7];
+   V8 = v[ 8];
+   V9 = v[ 9];
+   VA = v[10];
+   VB = v[11];
+   VC = v[12];
+   VD = v[13];
+   VE = v[14];
+   VF = v[15];
+
+   M0 = casti_m512i( data,  0 ); 
+   M1 = casti_m512i( data,  1 ); 
+   M2 = casti_m512i( data,  2 ); 
+   M3 = casti_m512i( data,  3 ); 
+   M4 = casti_m512i( data,  4 ); 
+   M5 = casti_m512i( data,  5 ); 
+   M6 = casti_m512i( data,  6 ); 
+   M7 = casti_m512i( data,  7 ); 
+   M8 = casti_m512i( data,  8 ); 
+   M9 = casti_m512i( data,  9 ); 
+   MA = casti_m512i( data, 10 ); 
+   MB = casti_m512i( data, 11 ); 
+   MC = casti_m512i( data, 12 ); 
+   MD = casti_m512i( data, 13 ); 
+   ME = casti_m512i( data, 14 ); 
+   MF = casti_m512i( data, 15 ); 
+
+   // Finish round 0 with the nonce (M3) now available
+   // G0   
+   // GS_16WAY( M0, M1, CS0, CS1, V0, V4, V8, VC );
+
+   // G1   
+   // GS_16WAY( M2, M3, CS2, CS3, V1, V5, V9, VD );
+   V1 = _mm512_add_epi32( V1, 
+                         _mm512_xor_si512( _mm512_set1_epi32( CS2 ), M3 ) );
+   VD = mm512_ror_32( _mm512_xor_si512( VD, V1 ), 8 );
+   V9 = _mm512_add_epi32( V9, VD );
+   V5 = mm512_ror_32( _mm512_xor_si512( V5, V9 ), 7 );
+   
+   // G2,G3   
+   // GS_16WAY( M4, M5, CS4, CS5, V2, V6, VA, VE );
+   // GS_16WAY( M6, M7, CS6, CS7, V3, V7, VB, VF );
+
+   // G4
+   // GS_16WAY( M8, M9, CS8, CS9, V0, V5, VA, VF );
+   V0 = _mm512_add_epi32( V0, V5 );
+   VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 16 );
+   VA = _mm512_add_epi32( VA, VF );
+   V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 12 );
+   V0 = _mm512_add_epi32( V0, _mm512_add_epi32( V5,
+                         _mm512_xor_si512( _mm512_set1_epi32( CS8 ), M9 ) ) );
+   VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 8 );
+   VA = _mm512_add_epi32( VA, VF );
+   V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 );
+
+   // G5
+   GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
+
+   // G6
+   // GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
+   VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 16 );
+   V8 = _mm512_add_epi32( V8, VD );
+   V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 12 );
+   V2 = _mm512_add_epi32( _mm512_add_epi32( V2, V7 ),
+                         _mm512_xor_si512( _mm512_set1_epi32( CSC ), MD ) );
+   VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 8 );
+   V8 = _mm512_add_epi32( V8, VD );
+   V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 7 );
+
+   // G7
+   // GS_16WAY( ME, MF, CSE, CSF, V3, V4, V9, VE );
+   V9 = _mm512_add_epi32( V9, VE );
+   V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 12 );
+   V3 = _mm512_add_epi32( V3, V4 );
+   VE = mm512_ror_32( _mm512_xor_si512( VE, V3 ), 8 );
+   V9 = _mm512_add_epi32( V9, VE );
+   V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 7 );
+
+   // Remaining rounds   
+   ROUND_S_16WAY( 1 );
+   ROUND_S_16WAY( 2 );
+   ROUND_S_16WAY( 3 );
+   ROUND_S_16WAY( 4 );
+   ROUND_S_16WAY( 5 );
+   ROUND_S_16WAY( 6 );
+   ROUND_S_16WAY( 7 );
+   ROUND_S_16WAY( 8 );
+   ROUND_S_16WAY( 9 );
+   ROUND_S_16WAY( 0 );
+   ROUND_S_16WAY( 1 );
+   ROUND_S_16WAY( 2 );
+   ROUND_S_16WAY( 3 );
+
+   // Byte swap final hash
+   const __m512i shuf_bswap32 =
+                  m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233,
+                                 0x2c2d2e2f28292a2b, 0x2425262720212223,
+                                 0x1c1d1e1f18191a1b, 0x1415161710111213,
+                                 0x0c0d0e0f08090a0b, 0x0405060700010203 );
+
+   H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
+   H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
+   H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
+   H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
+   H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
+   H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
+   H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
+   H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
+}
+
 #endif

 // Blake-256 4 way
@@ -913,8 +1488,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
      memset_zero_128( buf + vptr + 1, 13 - vptr  );
      buf[ 13 ] = _mm_or_si128( buf[ 13 ],
                                m128_const1_64( 0x0100000001000000ULL ) );
-      buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
-      buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
+      buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
+      buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
      blake32_4way( ctx, buf + vptr, 64 - ptr );
   }
   else
@@ -926,8 +1501,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
      memset_zero_128( buf, 56>>2 );
      buf[ 13 ] = _mm_or_si128( buf[ 13 ],
                                m128_const1_64( 0x0100000001000000ULL ) );
-      buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
-      buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
+      buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
+      buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
      blake32_4way( ctx, buf, 64 );
   }

@@ -1033,22 +1608,117 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
       if ( out_size_w32 == 8 )
           buf[52>>2] = _mm256_or_si256( buf[52>>2],
                                m256_const1_64( 0x0100000001000000ULL ) );
-       *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
-       *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
+       *(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
+       *(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
       blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
   }
   else
   {
-        memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
-        blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
-        sc->T0 = SPH_C32(0xFFFFFE00UL);
-        sc->T1 = SPH_C32(0xFFFFFFFFUL);
-        memset_zero_256( buf, 56>>2 );
+       memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+       blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
+       sc->T0 = SPH_C32(0xFFFFFE00UL);
+       sc->T1 = SPH_C32(0xFFFFFFFFUL);
+       memset_zero_256( buf, 56>>2 );
       if ( out_size_w32 == 8 )
           buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
-        *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
-        *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
-        blake32_8way( sc, buf, 64 );
+       *(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
+       *(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
+       blake32_8way( sc, buf, 64 );
+   }
+   mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
+}
+
+static void
+blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
+{
+   __m256i *vdata = (__m256i*)data;
+   __m256i *buf;
+   size_t ptr;
+   const int buf_size = 64;   // number of elements, sizeof/4
+   DECL_STATE32_8WAY
+   buf = sc->buf;
+   ptr = sc->ptr;
+   if ( len < buf_size - ptr )
+   {
+        memcpy_256( buf + (ptr>>2), vdata, len>>2 );
+        ptr += len;
+        sc->ptr = ptr;
+        return;
+   }
+
+   READ_STATE32_8WAY(sc);
+   while ( len > 0 )
+   {
+      size_t clen;
+
+      clen = buf_size - ptr;
+      if (clen > len)
+           clen = len;
+      memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
+      ptr += clen;
+      vdata += (clen>>2);
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+          if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
+                T1 = SPH_T32(T1 + 1);
+          COMPRESS32_8WAY_LE( sc->rounds );
+          ptr = 0;
+      }
+   }
+   WRITE_STATE32_8WAY(sc);
+   sc->ptr = ptr;
+}
+
+static void
+blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
+                       void *dst, size_t out_size_w32 )
+{
+   __m256i buf[16];
+   size_t ptr;
+   unsigned bit_len;
+   sph_u32 th, tl;
+
+   ptr = sc->ptr;
+   bit_len = ((unsigned)ptr << 3);
+   buf[ptr>>2] = m256_const1_32( 0x80000000 );
+   tl = sc->T0 + bit_len;
+   th = sc->T1;
+
+   if ( ptr == 0 )
+   {
+        sc->T0 = SPH_C32(0xFFFFFE00UL);
+        sc->T1 = SPH_C32(0xFFFFFFFFUL);
+   }
+   else if ( sc->T0 == 0 )
+   {
+        sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
+        sc->T1 = SPH_T32(sc->T1 - 1);
+   }
+   else
+        sc->T0 -= 512 - bit_len;
+
+   if ( ptr <= 52 )
+   {
+       memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
+       if ( out_size_w32 == 8 )
+           buf[52>>2] = _mm256_or_si256( buf[52>>2], m256_one_32 );
+       *(buf+(56>>2)) = _mm256_set1_epi32( th );
+       *(buf+(60>>2)) = _mm256_set1_epi32( tl );
+       blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
+   }
+   else
+   {
+       memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+       blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
+       sc->T0 = SPH_C32(0xFFFFFE00UL);
+       sc->T1 = SPH_C32(0xFFFFFFFFUL);
+       memset_zero_256( buf, 56>>2 );
+       if ( out_size_w32 == 8 )
+           buf[52>>2] = m256_one_32;
+       *(buf+(56>>2)) = _mm256_set1_epi32( th );
+       *(buf+(60>>2)) = _mm256_set1_epi32( tl );
+       blake32_8way_le( sc, buf, 64 );
   }
   mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
 }
@@ -1117,7 +1787,6 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
   WRITE_STATE32_16WAY(sc);
   sc->ptr = ptr;
 }
-
 static void
 blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
                    void *dst, size_t out_size_w32 )
@@ -1152,22 +1821,116 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
       if ( out_size_w32 == 8 )
           buf[52>>2] = _mm512_or_si512( buf[52>>2],
                                m512_const1_64( 0x0100000001000000ULL ) );
-       buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
-       buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
+       buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
+       buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
       blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
   }
   else
   {
-        memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
-        blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
+       memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+       blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
+       sc->T0 = 0xFFFFFE00UL;
+       sc->T1 = 0xFFFFFFFFUL;
+       memset_zero_512( buf, 56>>2 );
+       if ( out_size_w32 == 8 )
+          buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
+       buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
+       buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
+       blake32_16way( sc, buf, 64 );
+   }
+   mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
+}
+
+static void
+blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
+{
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf;
+   size_t ptr;
+   const int buf_size = 64;   // number of elements, sizeof/4
+   DECL_STATE32_16WAY
+   buf = sc->buf;
+   ptr = sc->ptr;
+
+   // only if calling update with 80
+   if ( len < buf_size - ptr )
+   {
+        memcpy_512( buf + (ptr>>2), vdata, len>>2 );
+        ptr += len;
+        sc->ptr = ptr;
+        return;
+   }
+   READ_STATE32_16WAY(sc);
+   while ( len > 0 )
+   {
+      size_t clen;
+
+      clen = buf_size - ptr;
+      if (clen > len)
+           clen = len;
+      memcpy_512( buf + (ptr>>2), vdata, clen>>2 );
+      ptr += clen;
+      vdata += (clen>>2);
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+          if ( ( T0 = T0 + 512 ) < 512 )
+                T1 = T1 + 1;
+          COMPRESS32_16WAY_LE( sc->rounds );
+          ptr = 0;
+      }
+   }
+   WRITE_STATE32_16WAY(sc);
+   sc->ptr = ptr;
+}
+
+static void
+blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
+                    void *dst, size_t out_size_w32 )
+{
+   __m512i buf[16];
+   size_t ptr;
+   unsigned bit_len;
+   sph_u32 th, tl;
+
+   ptr = sc->ptr;
+   bit_len = ((unsigned)ptr << 3);
+   buf[ptr>>2] = m512_const1_32( 0x80000000 );
+   tl = sc->T0 + bit_len;
+   th = sc->T1;
+
+   if ( ptr == 0 )
+   {
        sc->T0 = 0xFFFFFE00UL;
        sc->T1 = 0xFFFFFFFFUL;
-        memset_zero_512( buf, 56>>2 );
-       if ( out_size_w32 == 8 )
-           buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
-        buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
-        buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
-        blake32_16way( sc, buf, 64 );
+   }
+   else if ( sc->T0 == 0 )
+   {
+        sc->T0 = 0xFFFFFE00UL + bit_len;
+        sc->T1 = sc->T1 - 1;
+   }
+   else
+        sc->T0 -= 512 - bit_len;
+
+   if ( ptr <= 52 )
+   {
+       memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
+       buf[52>>2] = _mm512_or_si512( buf[52>>2], m512_one_32 );
+       buf[56>>2] = _mm512_set1_epi32( th );
+       buf[60>>2] = _mm512_set1_epi32( tl );
+       blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr );
+   }
+   else
+   {
+       memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+       blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr );
+       sc->T0 = 0xFFFFFE00UL;
+       sc->T1 = 0xFFFFFFFFUL;
+       memset_zero_512( buf, 56>>2 );
+       buf[52>>2] = m512_one_32;
+       buf[56>>2] = _mm512_set1_epi32( th );
+       buf[60>>2] = _mm512_set1_epi32( tl );
+       blake32_16way_le( sc, buf, 64 );
   }
   mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
 }
@@ -1190,6 +1953,18 @@ blake256_16way_close(void *cc, void *dst)
        blake32_16way_close(cc, 0, 0, dst, 8);
 }

+void
+blake256_16way_update_le(void *cc, const void *data, size_t len)
+{
+   blake32_16way_le(cc, data, len);
+}
+
+void
+blake256_16way_close_le(void *cc, void *dst)
+{
+    blake32_16way_close_le(cc, 0, 0, dst, 8);
+}
+
 void blake256r14_16way_init(void *cc)
 {
   blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
@@ -1271,6 +2046,18 @@ blake256_8way_close(void *cc, void *dst)
        blake32_8way_close(cc, 0, 0, dst, 8);
 }

+void
+blake256_8way_update_le(void *cc, const void *data, size_t len)
+{
+        blake32_8way_le(cc, data, len);
+}
+
+void
+blake256_8way_close_le(void *cc, void *dst)
+{
+        blake32_8way_close_le(cc, 0, 0, dst, 8);
+}
+
 #endif

 // 14 rounds Blake, Decred
--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -361,14 +361,10 @@ static const sph_u64 CB[16] = {
  V9 = m512_const1_64( CB1 );  \
  VA = m512_const1_64( CB2 );  \
  VB = m512_const1_64( CB3 );  \
-  VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
-                         m512_const1_64( CB4 ) );  \
-  VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
-                         m512_const1_64( CB5 ) );  \
-  VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
-                         m512_const1_64( CB6 ) );  \
-  VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
-                         m512_const1_64( CB7 ) );  \
+  VC = _mm512_set1_epi64( T0 ^ CB4 ); \
+  VD = _mm512_set1_epi64( T0 ^ CB5 ); \
+  VE = _mm512_set1_epi64( T1 ^ CB6 ); \
+  VF = _mm512_set1_epi64( T1 ^ CB7 ); \
  shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
                                0x28292a2b2c2d2e2f, 0x2021222324252627, \
                                0x18191a1b1c1d1e1f, 0x1011121314151617, \
@@ -435,14 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
  V9 = m512_const1_64( CB1 );
  VA = m512_const1_64( CB2 );
  VB = m512_const1_64( CB3 );
-  VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
-                            m512_const1_64( CB4 ) );
-  VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
-                            m512_const1_64( CB5 ) );
-  VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
-                            m512_const1_64( CB6 ) );
-  VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
-                            m512_const1_64( CB7 ) );
+  VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
+  VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
+  VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
+  VF = _mm512_set1_epi64( sc->T1 ^ CB7 );

  shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
                                0x28292a2b2c2d2e2f, 0x2021222324252627,
@@ -493,6 +485,308 @@ void blake512_8way_compress( blake_8way_big_context *sc )
  sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
 }

+// won't be used after prehash implemented
+void blake512_8way_compress_le( blake_8way_big_context *sc )
+{
+  __m512i M0, M1, M2, M3, M4, M5, M6, M7;
+  __m512i M8, M9, MA, MB, MC, MD, ME, MF;
+  __m512i V0, V1, V2, V3, V4, V5, V6, V7;
+  __m512i V8, V9, VA, VB, VC, VD, VE, VF;
+
+  V0 = sc->H[0];
+  V1 = sc->H[1];
+  V2 = sc->H[2];
+  V3 = sc->H[3];
+  V4 = sc->H[4];
+  V5 = sc->H[5];
+  V6 = sc->H[6];
+  V7 = sc->H[7];
+  V8 = m512_const1_64( CB0 );
+  V9 = m512_const1_64( CB1 );
+  VA = m512_const1_64( CB2 );
+  VB = m512_const1_64( CB3 );
+  VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
+  VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
+  VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
+  VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
+
+  M0 = sc->buf[ 0];
+  M1 = sc->buf[ 1];
+  M2 = sc->buf[ 2];
+  M3 = sc->buf[ 3];
+  M4 = sc->buf[ 4];
+  M5 = sc->buf[ 5];
+  M6 = sc->buf[ 6];
+  M7 = sc->buf[ 7];
+  M8 = sc->buf[ 8];
+  M9 = sc->buf[ 9];
+  MA = sc->buf[10];
+  MB = sc->buf[11];
+  MC = sc->buf[12];
+  MD = sc->buf[13];
+  ME = sc->buf[14];
+  MF = sc->buf[15];
+
+  ROUND_B_8WAY(0);
+  ROUND_B_8WAY(1);
+  ROUND_B_8WAY(2);
+  ROUND_B_8WAY(3);
+  ROUND_B_8WAY(4);
+  ROUND_B_8WAY(5);
+  ROUND_B_8WAY(6);
+  ROUND_B_8WAY(7);
+  ROUND_B_8WAY(8);
+  ROUND_B_8WAY(9);
+  ROUND_B_8WAY(0);
+  ROUND_B_8WAY(1);
+  ROUND_B_8WAY(2);
+  ROUND_B_8WAY(3);
+  ROUND_B_8WAY(4);
+  ROUND_B_8WAY(5);
+
+  sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
+  sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
+  sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
+  sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
+  sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
+  sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
+  sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
+  sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
+}
+
+// with final_le forms a full hash in 2 parts from little endian data.
+// all variables hard coded for 80 bytes/lane.
+void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
+                               const void *data )
+{
+   __m512i V0, V1, V2, V3, V4, V5, V6, V7;
+   __m512i V8, V9, VA, VB, VC, VD, VE, VF;
+
+   // initial hash
+   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
+   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
+   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
+   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
+   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
+
+   // fill buffer
+   memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
+   sc->buf[10] = m512_const1_64( 0x8000000000000000ULL );
+   sc->buf[11] = 
+   sc->buf[12] = m512_zero;
+   sc->buf[13] = m512_one_64;
+   sc->buf[14] = m512_zero;
+   sc->buf[15] = m512_const1_64( 80*8 );
+
+   // build working variables
+   V0 = sc->H[0];
+   V1 = sc->H[1];
+   V2 = sc->H[2];
+   V3 = sc->H[3];
+   V4 = sc->H[4];
+   V5 = sc->H[5];
+   V6 = sc->H[6];
+   V7 = sc->H[7];
+   V8 = m512_const1_64( CB0 );
+   V9 = m512_const1_64( CB1 );
+   VA = m512_const1_64( CB2 );
+   VB = m512_const1_64( CB3 );
+   VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
+   VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
+   VE = _mm512_set1_epi64( CB6 );
+   VF = _mm512_set1_epi64( CB7 );
+
+   // round 0
+   GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
+   GB_8WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
+   GB_8WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
+   GB_8WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
+
+   // Do half of G4, skip the nonce
+   // GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF );
+
+   V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( 
+                       _mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 ); 
+   VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 ); 
+   VA = _mm512_add_epi64( VA, VF ); 
+   V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
+   V0 = _mm512_add_epi64( V0, V5 );
+   
+   GB_8WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
+   GB_8WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
+   GB_8WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
+   
+   // round 1
+   // G1   
+//   GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD);
+   V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( CB8 ),
+           sc->buf[ 4] ) );
+
+   // G2
+//   GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
+   V2 = _mm512_add_epi64( V2, V6 ); 
+
+   // G3
+//   GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF);
+   V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
+                 _mm512_set1_epi64( CB6 ), sc->buf[13] ), V7 ) );
+
+   // save midstate for second part
+   midstate[ 0] = V0;
+   midstate[ 1] = V1;
+   midstate[ 2] = V2;
+   midstate[ 3] = V3;
+   midstate[ 4] = V4;
+   midstate[ 5] = V5;
+   midstate[ 6] = V6;
+   midstate[ 7] = V7;
+   midstate[ 8] = V8;
+   midstate[ 9] = V9;
+   midstate[10] = VA;
+   midstate[11] = VB;
+   midstate[12] = VC;
+   midstate[13] = VD;
+   midstate[14] = VE;
+   midstate[15] = VF;
+} 
+
+// pick up where we left off, need the nonce now.
+void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
+                             const __m512i nonce, const __m512i *midstate )
+{
+   __m512i M0, M1, M2, M3, M4, M5, M6, M7;
+   __m512i M8, M9, MA, MB, MC, MD, ME, MF;
+   __m512i V0, V1, V2, V3, V4, V5, V6, V7;
+   __m512i V8, V9, VA, VB, VC, VD, VE, VF;
+   __m512i h[8] __attribute__ ((aligned (64)));
+
+   // Load data with new nonce
+   M0 = sc->buf[ 0];
+   M1 = sc->buf[ 1];
+   M2 = sc->buf[ 2];
+   M3 = sc->buf[ 3];
+   M4 = sc->buf[ 4];
+   M5 = sc->buf[ 5];
+   M6 = sc->buf[ 6];
+   M7 = sc->buf[ 7];
+   M8 = sc->buf[ 8];
+   M9 = nonce;
+   MA = sc->buf[10];
+   MB = sc->buf[11];
+   MC = sc->buf[12];
+   MD = sc->buf[13];
+   ME = sc->buf[14];
+   MF = sc->buf[15];
+
+   V0 = midstate[ 0];
+   V1 = midstate[ 1];
+   V2 = midstate[ 2];
+   V3 = midstate[ 3];
+   V4 = midstate[ 4];
+   V5 = midstate[ 5];
+   V6 = midstate[ 6];
+   V7 = midstate[ 7];
+   V8 = midstate[ 8];
+   V9 = midstate[ 9];
+   VA = midstate[10];
+   VB = midstate[11];
+   VC = midstate[12];
+   VD = midstate[13];
+   VE = midstate[14];
+   VF = midstate[15];
+
+   // finish round 0 with the nonce now available 
+   V0 = _mm512_add_epi64( V0, _mm512_xor_si512(
+                                       _mm512_set1_epi64( CB8 ), M9 ) ); 
+   VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 );
+   VA = _mm512_add_epi64( VA, VF );
+   V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 );
+  
+   // Round 1
+   // G0
+   GB_8WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
+
+   // G1
+//   GB_8WAY(Mx(1, 2), Mx(1, 3), CBx(1, 2), CBx(1, 3), V1, V5, V9, VD);
+//   V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
+
+   V1 = _mm512_add_epi64( V1, V5 );   
+   VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 );
+   V9 = _mm512_add_epi64( V9, VD );
+   V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
+   V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
+                 _mm512_set1_epi64( CBx(1,2) ), Mx(1,3) ), V5 ) );   
+   VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 16 );
+   V9 = _mm512_add_epi64( V9, VD );
+   V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 11 );
+
+   // G2
+//   GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
+//   V2 = _mm512_add_epi64( V2, V6 );
+   V2 = _mm512_add_epi64( V2, _mm512_xor_si512( 
+                 _mm512_set1_epi64( CBF ), M9 ) );
+   VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 );
+   VA = _mm512_add_epi64( VA, VE );
+   V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
+   V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
+                 _mm512_set1_epi64( CB9 ), MF ), V6 ) );
+   VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 16 );
+   VA = _mm512_add_epi64( VA, VE );
+   V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 11 );
+
+   // G3
+//   GB_8WAY(Mx(1, 6), Mx(1, 7), CBx(1, 6), CBx(1, 7), V3, V7, VB, VF);
+//   V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512( 
+//                 _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) ); 
+
+   VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 ); 
+   VB = _mm512_add_epi64( VB, VF ); 
+   V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
+   V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
+                 _mm512_set1_epi64( CBx(1, 6) ), Mx(1, 7) ), V7 ) ); 
+   VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 16 ); 
+   VB = _mm512_add_epi64( VB, VF ); 
+   V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 11 );
+
+   // G4, G5, G6, G7
+   GB_8WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
+   GB_8WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
+   GB_8WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
+   GB_8WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
+
+
+   // remaining rounds  
+   ROUND_B_8WAY(2);
+   ROUND_B_8WAY(3);
+   ROUND_B_8WAY(4);
+   ROUND_B_8WAY(5);
+   ROUND_B_8WAY(6);
+   ROUND_B_8WAY(7);
+   ROUND_B_8WAY(8);
+   ROUND_B_8WAY(9);
+   ROUND_B_8WAY(0);
+   ROUND_B_8WAY(1);
+   ROUND_B_8WAY(2);
+   ROUND_B_8WAY(3);
+   ROUND_B_8WAY(4);
+   ROUND_B_8WAY(5);
+
+   h[0] = mm512_xor3( V8, V0, sc->H[0] );
+   h[1] = mm512_xor3( V9, V1, sc->H[1] );
+   h[2] = mm512_xor3( VA, V2, sc->H[2] );
+   h[3] = mm512_xor3( VB, V3, sc->H[3] );
+   h[4] = mm512_xor3( VC, V4, sc->H[4] );
+   h[5] = mm512_xor3( VD, V5, sc->H[5] );
+   h[6] = mm512_xor3( VE, V6, sc->H[6] );
+   h[7] = mm512_xor3( VF, V7, sc->H[7] );
+
+   // bswap final hash
+   mm512_block_bswap_64( (__m512i*)hash, h );
+}
+
 void blake512_8way_init( blake_8way_big_context *sc )
 {
   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
@@ -678,6 +972,73 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
   mm512_block_bswap_64( (__m512i*)dst, sc->H );
 }
   
+void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
+                        const void *data, size_t len )
+{
+
+// init
+
+   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
+   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
+   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
+   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
+   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
+
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
+
+// update
+
+   memcpy_512( sc->buf, (__m512i*)data, len>>3 );
+   sc->ptr = len;
+   if ( len == 128 )
+   {
+      if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
+            sc->T1 = sc->T1 + 1;
+      blake512_8way_compress_le( sc );
+      sc->ptr = 0;
+   }
+
+// close
+
+   size_t ptr64 = sc->ptr >> 3;
+   unsigned bit_len;
+   uint64_t th, tl;
+
+   bit_len = sc->ptr << 3;
+   sc->buf[ptr64] = m512_const1_64( 0x8000000000000000ULL );
+   tl = sc->T0 + bit_len;
+   th = sc->T1;
+
+   if ( ptr64 == 0 )
+   {
+   sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+   sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
+   }
+   else if ( sc->T0 == 0 )
+   {
+   sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
+   sc->T1 = sc->T1 - 1;
+   }
+   else
+      sc->T0 -= 1024 - bit_len;
+
+   memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
+   sc->buf[13] = m512_one_64;
+   sc->buf[14] = m512_const1_64( th );
+   sc->buf[15] = m512_const1_64( tl );
+
+   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
+       sc->T1 = sc->T1 + 1;
+
+   blake512_8way_compress_le( sc );
+
+   mm512_block_bswap_64( (__m512i*)dst, sc->H );
+}
+
 void
 blake512_8way_update(void *cc, const void *data, size_t len)
 {
@@ -741,14 +1102,10 @@ blake512_8way_close(void *cc, void *dst)
  V9 = m256_const1_64( CB1 );  \
  VA = m256_const1_64( CB2 );  \
  VB = m256_const1_64( CB3 );  \
-  VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
-                         m256_const1_64( CB4 ) );  \
-  VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
-                         m256_const1_64( CB5 ) );  \
-  VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
-                         m256_const1_64( CB6 ) );  \
-  VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
-                         m256_const1_64( CB7 ) );  \
+  VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
+  VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
+  VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
+  VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
  shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
  M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
@@ -869,6 +1226,221 @@ void blake512_4way_compress( blake_4way_big_context *sc )
  sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
 }

+void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
+                               const void *data )
+{
+   __m256i V0, V1, V2, V3, V4, V5, V6, V7;
+   __m256i V8, V9, VA, VB, VC, VD, VE, VF;
+
+   // initial hash
+   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
+   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
+   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
+   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
+   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
+   
+   // fill buffer
+   memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
+   sc->buf[10] = m256_const1_64( 0x8000000000000000ULL );
+   sc->buf[11] = m256_zero;
+   sc->buf[12] = m256_zero;
+   sc->buf[13] = m256_one_64;
+   sc->buf[14] = m256_zero;
+   sc->buf[15] = m256_const1_64( 80*8 );
+
+   // build working variables
+   V0 = sc->H[0];
+   V1 = sc->H[1];
+   V2 = sc->H[2];
+   V3 = sc->H[3];
+   V4 = sc->H[4];
+   V5 = sc->H[5];
+   V6 = sc->H[6];
+   V7 = sc->H[7];
+   V8 = m256_const1_64( CB0 );
+   V9 = m256_const1_64( CB1 );
+   VA = m256_const1_64( CB2 );
+   VB = m256_const1_64( CB3 );
+   VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
+   VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
+   VE = _mm256_set1_epi64x( CB6 );
+   VF = _mm256_set1_epi64x( CB7 );
+
+   // round 0
+   GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
+   GB_4WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
+   GB_4WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
+   GB_4WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
+
+   // G4 skip nonce
+   V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
+                       _mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
+   VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 );
+   VA = _mm256_add_epi64( VA, VF );
+   V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
+   V0 = _mm256_add_epi64( V0, V5 );
+
+   GB_4WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
+   GB_4WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
+   GB_4WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
+
+   // round 1
+   // G1   
+   V1 = _mm256_add_epi64( V1, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ),
+           sc->buf[ 4] ) );
+
+   // G2
+   V2 = _mm256_add_epi64( V2, V6 );
+
+   // G3
+   V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
+                 _mm256_set1_epi64x( CB6 ), sc->buf[13] ), V7 ) );
+
+   // save midstate for second part
+   midstate[ 0] = V0;
+   midstate[ 1] = V1;
+   midstate[ 2] = V2;
+   midstate[ 3] = V3;
+   midstate[ 4] = V4;
+   midstate[ 5] = V5;
+   midstate[ 6] = V6;
+   midstate[ 7] = V7;
+   midstate[ 8] = V8;
+   midstate[ 9] = V9;
+   midstate[10] = VA;
+   midstate[11] = VB;
+   midstate[12] = VC;
+   midstate[13] = VD;
+   midstate[14] = VE;
+   midstate[15] = VF;
+}
+
+void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
+                             const __m256i nonce, const __m256i *midstate )
+{
+   __m256i M0, M1, M2, M3, M4, M5, M6, M7;
+   __m256i M8, M9, MA, MB, MC, MD, ME, MF;
+   __m256i V0, V1, V2, V3, V4, V5, V6, V7;
+   __m256i V8, V9, VA, VB, VC, VD, VE, VF;
+   __m256i h[8] __attribute__ ((aligned (64)));
+
+   // Load data with new nonce
+   M0 = sc->buf[ 0];
+   M1 = sc->buf[ 1];
+   M2 = sc->buf[ 2];
+   M3 = sc->buf[ 3];
+   M4 = sc->buf[ 4];
+   M5 = sc->buf[ 5];
+   M6 = sc->buf[ 6];
+   M7 = sc->buf[ 7];
+   M8 = sc->buf[ 8];
+   M9 = nonce;
+   MA = sc->buf[10];
+   MB = sc->buf[11];
+   MC = sc->buf[12];
+   MD = sc->buf[13];
+   ME = sc->buf[14];
+   MF = sc->buf[15];
+
+   V0 = midstate[ 0];
+   V1 = midstate[ 1];
+   V2 = midstate[ 2];
+   V3 = midstate[ 3];
+   V4 = midstate[ 4];
+   V5 = midstate[ 5];
+   V6 = midstate[ 6];
+   V7 = midstate[ 7];
+   V8 = midstate[ 8];
+   V9 = midstate[ 9];
+   VA = midstate[10];
+   VB = midstate[11];
+   VC = midstate[12];
+   VD = midstate[13];
+   VE = midstate[14];
+   VF = midstate[15];
+
+   // finish round 0, with the nonce now available 
+   V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
+                                       _mm256_set1_epi64x( CB8 ), M9 ) );
+   VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 );
+   VA = _mm256_add_epi64( VA, VF );
+   V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
+
+   // Round 1
+   // G0
+   GB_4WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
+
+   // G1
+   V1 = _mm256_add_epi64( V1, V5 );
+   VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 );
+   V9 = _mm256_add_epi64( V9, VD );
+   V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
+   V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
+                 _mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
+   VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 );
+   V9 = _mm256_add_epi64( V9, VD );
+   V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
+
+   // G2
+   V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
+                 _mm256_set1_epi64x( CBF ), M9 ) );
+   VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 );
+   VA = _mm256_add_epi64( VA, VE );
+   V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
+   V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
+                 _mm256_set1_epi64x( CB9 ), MF ), V6 ) );
+   VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 );
+   VA = _mm256_add_epi64( VA, VE );
+   V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
+
+   // G3
+   VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 );
+   VB = _mm256_add_epi64( VB, VF );
+   V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
+   V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
+                 _mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
+   VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 );
+   VB = _mm256_add_epi64( VB, VF );
+   V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );
+
+   // G4, G5, G6, G7
+   GB_4WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
+   GB_4WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
+   GB_4WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
+   GB_4WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
+
+   ROUND_B_4WAY(2);
+   ROUND_B_4WAY(3);
+   ROUND_B_4WAY(4);
+   ROUND_B_4WAY(5);
+   ROUND_B_4WAY(6);
+   ROUND_B_4WAY(7);
+   ROUND_B_4WAY(8);
+   ROUND_B_4WAY(9);
+   ROUND_B_4WAY(0);
+   ROUND_B_4WAY(1);
+   ROUND_B_4WAY(2);
+   ROUND_B_4WAY(3);
+   ROUND_B_4WAY(4);
+   ROUND_B_4WAY(5);
+
+   h[0] = mm256_xor3( V8, V0, sc->H[0] );
+   h[1] = mm256_xor3( V9, V1, sc->H[1] );
+   h[2] = mm256_xor3( VA, V2, sc->H[2] );
+   h[3] = mm256_xor3( VB, V3, sc->H[3] );
+   h[4] = mm256_xor3( VC, V4, sc->H[4] );
+   h[5] = mm256_xor3( VD, V5, sc->H[5] );
+   h[6] = mm256_xor3( VE, V6, sc->H[6] );
+   h[7] = mm256_xor3( VF, V7, sc->H[7] );
+
+   // bswap final hash
+   mm256_block_bswap_64( (__m256i*)hash, h );
+}
+
+
 void blake512_4way_init( blake_4way_big_context *sc )
 {
   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -70,7 +70,10 @@ void decred_be_build_stratum_request( char *req, struct work *work,
         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
   free(xnonce2str);
 }
+
+#if !defined(min)
 #define min(a,b) (a>b ? (b) :(a))
+#endif

 void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 {
--- a/algo/blake/sph_blake.c
+++ b/algo/blake/sph_blake.c
@@ -630,6 +630,69 @@ static const sph_u64 CB[16] = {
 		H7 ^= S3 ^ V7 ^ VF; \
 	} while (0)

+#define COMPRESS32_LE   do { \
+      sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
+      sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
+      sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
+      sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
+      V0 = H0; \
+      V1 = H1; \
+      V2 = H2; \
+      V3 = H3; \
+      V4 = H4; \
+      V5 = H5; \
+      V6 = H6; \
+      V7 = H7; \
+      V8 = S0 ^ CS0; \
+      V9 = S1 ^ CS1; \
+      VA = S2 ^ CS2; \
+      VB = S3 ^ CS3; \
+      VC = T0 ^ CS4; \
+      VD = T0 ^ CS5; \
+      VE = T1 ^ CS6; \
+      VF = T1 ^ CS7; \
+      M0 = *((uint32_t*)(buf +  0)); \
+      M1 = *((uint32_t*)(buf +  4)); \
+      M2 = *((uint32_t*)(buf +  8)); \
+      M3 = *((uint32_t*)(buf + 12)); \
+      M4 = *((uint32_t*)(buf + 16)); \
+      M5 = *((uint32_t*)(buf + 20)); \
+      M6 = *((uint32_t*)(buf + 24)); \
+      M7 = *((uint32_t*)(buf + 28)); \
+      M8 = *((uint32_t*)(buf + 32)); \
+      M9 = *((uint32_t*)(buf + 36)); \
+      MA = *((uint32_t*)(buf + 40)); \
+      MB = *((uint32_t*)(buf + 44)); \
+      MC = *((uint32_t*)(buf + 48)); \
+      MD = *((uint32_t*)(buf + 52)); \
+      ME = *((uint32_t*)(buf + 56)); \
+      MF = *((uint32_t*)(buf + 60)); \
+      ROUND_S(0); \
+      ROUND_S(1); \
+      ROUND_S(2); \
+      ROUND_S(3); \
+      ROUND_S(4); \
+      ROUND_S(5); \
+      ROUND_S(6); \
+      ROUND_S(7); \
+      if (BLAKE32_ROUNDS == 14) { \
+      ROUND_S(8); \
+      ROUND_S(9); \
+      ROUND_S(0); \
+      ROUND_S(1); \
+      ROUND_S(2); \
+      ROUND_S(3); \
+      } \
+      H0 ^= S0 ^ V0 ^ V8; \
+      H1 ^= S1 ^ V1 ^ V9; \
+      H2 ^= S2 ^ V2 ^ VA; \
+      H3 ^= S3 ^ V3 ^ VB; \
+      H4 ^= S0 ^ V4 ^ VC; \
+      H5 ^= S1 ^ V5 ^ VD; \
+      H6 ^= S2 ^ V6 ^ VE; \
+      H7 ^= S3 ^ V7 ^ VF; \
+   } while (0)
+
 #endif

 #if SPH_64
@@ -843,6 +906,45 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)
 	sc->ptr = ptr;
 }

+static void
+blake32_le(sph_blake_small_context *sc, const void *data, size_t len)
+{
+   unsigned char *buf;
+   size_t ptr;
+   DECL_STATE32
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+
+   if (len < (sizeof sc->buf) - ptr) {
+      memcpy(buf + ptr, data, len);
+      ptr += len;
+      sc->ptr = ptr;
+      return;
+   }
+
+   READ_STATE32(sc);
+   while (len > 0) {
+      size_t clen;
+
+      clen = (sizeof sc->buf) - ptr;
+      if (clen > len)
+         clen = len;
+      memcpy(buf + ptr, data, clen);
+      ptr += clen;
+      data = (const unsigned char *)data + clen;
+      len -= clen;
+      if (ptr == sizeof sc->buf) {
+         if ((T0 = SPH_T32(T0 + 512)) < 512)
+            T1 = SPH_T32(T1 + 1);
+         COMPRESS32_LE;
+         ptr = 0;
+      }
+   }
+   WRITE_STATE32(sc);
+   sc->ptr = ptr;
+}
+
 static void
 blake32_close(sph_blake_small_context *sc,
 	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
@@ -1050,6 +1152,12 @@ sph_blake256(void *cc, const void *data, size_t len)
 	blake32(cc, data, len);
 }

+void
+sph_blake256_update_le(void *cc, const void *data, size_t len)
+{
+   blake32_le(cc, data, len);
+}
+
 /* see sph_blake.h */
 void
 sph_blake256_close(void *cc, void *dst)
--- a/algo/blake/sph_blake.h
+++ b/algo/blake/sph_blake.h
@@ -198,6 +198,7 @@ void sph_blake256_init(void *cc);
 * @param len    the input data length (in bytes)
 */
 void sph_blake256(void *cc, const void *data, size_t len);
+void sph_blake256_update_le(void *cc, const void *data, size_t len);

 /**
 * Terminate the current BLAKE-256 computation and output the result into
--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -594,9 +594,6 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
 #define rb6(x)    mm256_rol_64( x, 43 ) 
 #define rb7(x)    mm256_rol_64( x, 53 ) 

-#define rol_off_64( M, j ) \
-   mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
-
 #define add_elt_b( mj0, mj3, mj10, h, K ) \
  _mm256_xor_si256( h, _mm256_add_epi64( K, \
              _mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
@@ -732,8 +729,23 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
   qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] ); 

   __m256i mj[16];
-   for ( i = 0; i < 16; i++ )
-      mj[i] = rol_off_64( M, i );
+
+   mj[ 0] = mm256_rol_64( M[ 0],  1 );
+   mj[ 1] = mm256_rol_64( M[ 1],  2 );
+   mj[ 2] = mm256_rol_64( M[ 2],  3 );
+   mj[ 3] = mm256_rol_64( M[ 3],  4 );
+   mj[ 4] = mm256_rol_64( M[ 4],  5 );
+   mj[ 5] = mm256_rol_64( M[ 5],  6 );
+   mj[ 6] = mm256_rol_64( M[ 6],  7 );
+   mj[ 7] = mm256_rol_64( M[ 7],  8 );
+   mj[ 8] = mm256_rol_64( M[ 8],  9 );
+   mj[ 9] = mm256_rol_64( M[ 9], 10 );
+   mj[10] = mm256_rol_64( M[10], 11 );
+   mj[11] = mm256_rol_64( M[11], 12 );
+   mj[12] = mm256_rol_64( M[12], 13 );
+   mj[13] = mm256_rol_64( M[13], 14 );
+   mj[14] = mm256_rol_64( M[14], 15 );
+   mj[15] = mm256_rol_64( M[15], 16 );

   qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
              (const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
@@ -1034,9 +1046,6 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #define r8b6(x)    mm512_rol_64( x, 43 )
 #define r8b7(x)    mm512_rol_64( x, 53 )

-#define rol8w_off_64( M, j ) \
-   mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
-
 #define add_elt_b8( mj0, mj3, mj10, h, K ) \
  _mm512_xor_si512( h, _mm512_add_epi64( K, \
              _mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
@@ -1171,41 +1180,73 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
   qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );

   __m512i mj[16];
-   for ( i = 0; i < 16; i++ )
-      mj[i] = rol8w_off_64( M, i );
+   uint64_t K = 16 * 0x0555555555555555ULL;
+ 
+   mj[ 0] = mm512_rol_64( M[ 0],  1 );
+   mj[ 1] = mm512_rol_64( M[ 1],  2 );
+   mj[ 2] = mm512_rol_64( M[ 2],  3 );
+   mj[ 3] = mm512_rol_64( M[ 3],  4 );
+   mj[ 4] = mm512_rol_64( M[ 4],  5 );
+   mj[ 5] = mm512_rol_64( M[ 5],  6 );
+   mj[ 6] = mm512_rol_64( M[ 6],  7 );
+   mj[ 7] = mm512_rol_64( M[ 7],  8 );
+   mj[ 8] = mm512_rol_64( M[ 8],  9 );
+   mj[ 9] = mm512_rol_64( M[ 9], 10 );
+   mj[10] = mm512_rol_64( M[10], 11 );
+   mj[11] = mm512_rol_64( M[11], 12 );
+   mj[12] = mm512_rol_64( M[12], 13 );
+   mj[13] = mm512_rol_64( M[13], 14 );
+   mj[14] = mm512_rol_64( M[14], 15 );
+   mj[15] = mm512_rol_64( M[15], 16 );

   qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
-              (const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
-              (const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
-              (const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
-              (const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
-              (const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
-              (const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
-              (const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
-              (const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
-              (const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
-              (const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
-              (const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
-              (const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
-              (const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
-              (const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
-              (const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+   K += 0x0555555555555555ULL;
   qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
-              (const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
+

   qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
   qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -54,14 +54,12 @@ static void transform_4way( cube_4way_context *sp )
        x5 = _mm512_add_epi32( x1, x5 );
        x6 = _mm512_add_epi32( x2, x6 );
        x7 = _mm512_add_epi32( x3, x7 );
-        y0 = x0;
-        y1 = x1;
-        x0 = mm512_rol_32( x2, 7 );
-        x1 = mm512_rol_32( x3, 7 );
-        x2 = mm512_rol_32( y0, 7 );
-        x3 = mm512_rol_32( y1, 7 );
-        x0 = _mm512_xor_si512( x0, x4 );
-        x1 = _mm512_xor_si512( x1, x5 );
+        y0 = mm512_rol_32( x2, 7 );
+        y1 = mm512_rol_32( x3, 7 );
+        x2 = mm512_rol_32( x0, 7 );
+        x3 = mm512_rol_32( x1, 7 );
+        x0 = _mm512_xor_si512( y0, x4 );
+        x1 = _mm512_xor_si512( y1, x5 );
        x2 = _mm512_xor_si512( x2, x6 );
        x3 = _mm512_xor_si512( x3, x7 );
        x4 = mm512_swap128_64( x4 );
@@ -72,15 +70,13 @@ static void transform_4way( cube_4way_context *sp )
        x5 = _mm512_add_epi32( x1, x5 );
        x6 = _mm512_add_epi32( x2, x6 );
        x7 = _mm512_add_epi32( x3, x7 );
-        y0 = x0;
-        y1 = x2;
-        x0 = mm512_rol_32( x1, 11 );
-        x1 = mm512_rol_32( y0, 11 );
-        x2 = mm512_rol_32( x3, 11 );
-        x3 = mm512_rol_32( y1, 11 );
-        x0 = _mm512_xor_si512( x0, x4 );
+        y0 = mm512_rol_32( x1, 11 );
+        x1 = mm512_rol_32( x0, 11 );
+        y1 = mm512_rol_32( x3, 11 );
+        x3 = mm512_rol_32( x2, 11 );
+        x0 = _mm512_xor_si512( y0, x4 );
        x1 = _mm512_xor_si512( x1, x5 );
-        x2 = _mm512_xor_si512( x2, x6 );
+        x2 = _mm512_xor_si512( y1, x6 );
        x3 = _mm512_xor_si512( x3, x7 );
        x4 = mm512_swap64_32( x4 );
        x5 = mm512_swap64_32( x5 );
@@ -131,83 +127,67 @@ static void transform_4way_2buf( cube_4way_2buf_context *sp )
    {
        x4 = _mm512_add_epi32( x0, x4 );
        y4 = _mm512_add_epi32( y0, y4 );
-        tx0 = x0;
-        ty0 = y0;
        x5 = _mm512_add_epi32( x1, x5 );
        y5 = _mm512_add_epi32( y1, y5 );
-        tx1 = x1;
-        ty1 = y1;
-        x0 = mm512_rol_32( x2, 7 );
-        y0 = mm512_rol_32( y2, 7 );
+        tx0 = mm512_rol_32( x2, 7 );
+        ty0 = mm512_rol_32( y2, 7 );
+        tx1 = mm512_rol_32( x3, 7 );
+        ty1 = mm512_rol_32( y3, 7 );
        x6 = _mm512_add_epi32( x2, x6 );
-        y6 = _mm512_add_epi32( y2, y6 );
-        x1 = mm512_rol_32( x3, 7 );
-        y1 = mm512_rol_32( y3, 7 );
+        y6 = _mm512_add_epi32( y2, y6 ); 
        x7 = _mm512_add_epi32( x3, x7 );
        y7 = _mm512_add_epi32( y3, y7 );
-
-
-        x2 = mm512_rol_32( tx0, 7 );
-        y2 = mm512_rol_32( ty0, 7 );
-        x0 = _mm512_xor_si512( x0, x4 );
-        y0 = _mm512_xor_si512( y0, y4 );
+        x2 = mm512_rol_32( x0, 7 );
+        y2 = mm512_rol_32( y0, 7 );
+        x3 = mm512_rol_32( x1, 7 );
+        y3 = mm512_rol_32( y1, 7 );
+        x0 = _mm512_xor_si512( tx0, x4 );
+        y0 = _mm512_xor_si512( ty0, y4 );
+        x1 = _mm512_xor_si512( tx1, x5 );
+        y1 = _mm512_xor_si512( ty1, y5 );
        x4 = mm512_swap128_64( x4 );
-        x3 = mm512_rol_32( tx1, 7 );
-        y3 = mm512_rol_32( ty1, 7 );
        y4 = mm512_swap128_64( y4 );
-
-        x1 = _mm512_xor_si512( x1, x5 );
-        y1 = _mm512_xor_si512( y1, y5 );
        x5 = mm512_swap128_64( x5 );
+        y5 = mm512_swap128_64( y5 );
        x2 = _mm512_xor_si512( x2, x6 );
        y2 = _mm512_xor_si512( y2, y6 );
-        y5 = mm512_swap128_64( y5 );
        x3 = _mm512_xor_si512( x3, x7 );
        y3 = _mm512_xor_si512( y3, y7 );
-
        x6 = mm512_swap128_64( x6 );
+        y6 = mm512_swap128_64( y6 );
+        x7 = mm512_swap128_64( x7 );
+        y7 = mm512_swap128_64( y7 );
        x4 = _mm512_add_epi32( x0, x4 );
        y4 = _mm512_add_epi32( y0, y4 );
-        y6 = mm512_swap128_64( y6 );
        x5 = _mm512_add_epi32( x1, x5 );
        y5 = _mm512_add_epi32( y1, y5 );
-        x7 = mm512_swap128_64( x7 );
+        tx0 = mm512_rol_32( x1, 11 );
+        ty0 = mm512_rol_32( y1, 11 );
+        tx1 = mm512_rol_32( x3, 11 );
+        ty1 = mm512_rol_32( y3, 11 );
        x6 = _mm512_add_epi32( x2, x6 );
        y6 = _mm512_add_epi32( y2, y6 );
-        tx0 = x0;
-        ty0 = y0;
-        y7 = mm512_swap128_64( y7 );
-        tx1 = x2;
-        ty1 = y2;
-        x0 = mm512_rol_32( x1, 11 );
-        y0 = mm512_rol_32( y1, 11 );
-
        x7 = _mm512_add_epi32( x3, x7 );
        y7 = _mm512_add_epi32( y3, y7 );
-
-        x1 = mm512_rol_32( tx0, 11 );
-        y1 = mm512_rol_32( ty0, 11 );
-        x0 = _mm512_xor_si512( x0, x4 );
-        x4 = mm512_swap64_32( x4 );
-        y0 = _mm512_xor_si512( y0, y4 );
-        x2 = mm512_rol_32( x3, 11 );
-        y4 = mm512_swap64_32( y4 );
-        y2 = mm512_rol_32( y3, 11 );
+        x1 = mm512_rol_32( x0, 11 );
+        y1 = mm512_rol_32( y0, 11 );
+        x3 = mm512_rol_32( x2, 11 );
+        y3 = mm512_rol_32( y2, 11 );
+        x0 = _mm512_xor_si512( tx0, x4 );
+        y0 = _mm512_xor_si512( ty0, y4 );
        x1 = _mm512_xor_si512( x1, x5 );
-        x5 = mm512_swap64_32( x5 );
        y1 = _mm512_xor_si512( y1, y5 );
-        x3 = mm512_rol_32( tx1, 11 );
+        x4 = mm512_swap64_32( x4 );
+        y4 = mm512_swap64_32( y4 );
+        x5 = mm512_swap64_32( x5 );
        y5 = mm512_swap64_32( y5 );
-        y3 = mm512_rol_32( ty1, 11 );
-
-        x2 = _mm512_xor_si512( x2, x6 );
-        x6 = mm512_swap64_32( x6 );
-        y2 = _mm512_xor_si512( y2, y6 );
-        y6 = mm512_swap64_32( y6 );
+        x2 = _mm512_xor_si512( tx1, x6 );
+        y2 = _mm512_xor_si512( ty1, y6 );
        x3 = _mm512_xor_si512( x3, x7 );
-        x7 = mm512_swap64_32( x7 );
        y3 = _mm512_xor_si512( y3, y7 );
-
+        x6 = mm512_swap64_32( x6 );
+        y6 = mm512_swap64_32( y6 );
+        x7 = mm512_swap64_32( x7 );
        y7 = mm512_swap64_32( y7 );
    }

@@ -241,14 +221,6 @@ int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
    sp->rounds    = rounds;
    sp->pos       = 0;

-    h[ 0] = m512_const1_128( iv[0] );
-    h[ 1] = m512_const1_128( iv[1] );
-    h[ 2] = m512_const1_128( iv[2] );
-    h[ 3] = m512_const1_128( iv[3] );
-    h[ 4] = m512_const1_128( iv[4] );
-    h[ 5] = m512_const1_128( iv[5] );
-    h[ 6] = m512_const1_128( iv[6] );
-    h[ 7] = m512_const1_128( iv[7] );
    h[ 0] = m512_const1_128( iv[0] );
    h[ 1] = m512_const1_128( iv[1] );
    h[ 2] = m512_const1_128( iv[2] );
@@ -489,33 +461,29 @@ static void transform_2way( cube_2way_context *sp )
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
        x7 = _mm256_add_epi32( x3, x7 );
-        y0 = x0;
-        y1 = x1;
-        ROL2( x0, x1, x2, x3, 7 );
-        ROL2( x2, x3, y0, y1, 7 );
-        x0 = _mm256_xor_si256( x0, x4 );
+        ROL2( y0, y1, x2, x3, 7 );
+        ROL2( x2, x3, x0, x1, 7 );
+        x0 = _mm256_xor_si256( y0, x4 );
+        x1 = _mm256_xor_si256( y1, x5 );
+        x2 = _mm256_xor_si256( x2, x6 );
+        x3 = _mm256_xor_si256( x3, x7 );
        x4 = mm256_swap128_64( x4 );
-        x1 = _mm256_xor_si256( x1, x5 );
-        x2 = _mm256_xor_si256( x2, x6 );
        x5 = mm256_swap128_64( x5 );
-        x3 = _mm256_xor_si256( x3, x7 );
-        x4 = _mm256_add_epi32( x0, x4 );
        x6 = mm256_swap128_64( x6 );
-        y0 = x0;
-        x5 = _mm256_add_epi32( x1, x5 );
        x7 = mm256_swap128_64( x7 );
+        x4 = _mm256_add_epi32( x0, x4 );
+        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
-        y1 = x2;
-        ROL2( x0, x1, x1, y0, 11 );
        x7 = _mm256_add_epi32( x3, x7 );
-        ROL2( x2, x3, x3, y1, 11 );
-        x0 = _mm256_xor_si256( x0, x4 );
-        x4 = mm256_swap64_32( x4 );
+        ROL2( y0, x1, x1, x0, 11 );
+        ROL2( y1, x3, x3, x2, 11 );
+        x0 = _mm256_xor_si256( y0, x4 );
        x1 = _mm256_xor_si256( x1, x5 );
-        x5 = mm256_swap64_32( x5 );
-        x2 = _mm256_xor_si256( x2, x6 );
-        x6 = mm256_swap64_32( x6 );
+        x2 = _mm256_xor_si256( y1, x6 );
        x3 = _mm256_xor_si256( x3, x7 );
+        x4 = mm256_swap64_32( x4 );
+        x5 = mm256_swap64_32( x5 );
+        x6 = mm256_swap64_32( x6 );
        x7 = mm256_swap64_32( x7 );
    }

@@ -540,14 +508,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
    sp->rounds    = rounds;
    sp->pos       = 0;

-    h[ 0] = m256_const1_128( iv[0] );
-    h[ 1] = m256_const1_128( iv[1] );
-    h[ 2] = m256_const1_128( iv[2] );
-    h[ 3] = m256_const1_128( iv[3] );
-    h[ 4] = m256_const1_128( iv[4] );
-    h[ 5] = m256_const1_128( iv[5] );
-    h[ 6] = m256_const1_128( iv[6] );
-    h[ 7] = m256_const1_128( iv[7] );
    h[ 0] = m256_const1_128( iv[0] );
    h[ 1] = m256_const1_128( iv[1] );
    h[ 2] = m256_const1_128( iv[2] );
@@ -560,7 +520,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
    return 0;
 }

-
 int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
 {
    const int len = size >> 4;
--- a/algo/cubehash/cubehash_sse2.h
+++ b/algo/cubehash/cubehash_sse2.h
@@ -15,11 +15,11 @@

 struct _cubehashParam
 {
+    __m128i _ALIGN(64) x[8];  // aligned for __m512i
    int hashlen;           // __m128i
    int rounds;
    int blocksize;         // __m128i
    int pos;	           // number of __m128i read into x from current block
-    __m128i _ALIGN(64) x[8];  // aligned for __m256i
 };

 typedef struct _cubehashParam cubehashParam;
--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -545,21 +545,23 @@ static const sph_u32 T512[64][16] = {
 #define sE   c7
 #define sF   m7

-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 // Hamsi 8 way AVX512 

-// Tested on i9-9940x movepi64_mask is slow, cmple_epi64_mask with zero
-// produces the same result but is faster.
+// Intel says _mm512_movepi64_mask has (1L/1T) timimg while
+// _mm512_cmplt_epi64_mask as (3L/1T) timing, however, when tested hashing X13
+// on i9-9940x cmplt with zero was 3% faster than movepi. 
+
 #define INPUT_BIG8 \
 do { \
  __m512i db = _mm512_ror_epi64( *buf, 1 ); \
+  const __m512i zero = m512_zero; \
  const uint64_t *tp = (const uint64_t*)T512; \
-  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
  for ( int u = 0; u < 64; u++ ) \
  { \
-     __mmask8 dm = _mm512_cmplt_epi64_mask( db, m512_zero ); \
+     const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
     m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
     m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
     m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
@@ -573,29 +575,6 @@ do { \
  } \
 } while (0)

-/*
-#define INPUT_BIG8 \
-do { \
-  __m512i db = *buf; \
-  const uint64_t *tp = (const uint64_t*)T512;  \
-  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
-  for ( int u = 0; u < 64; u++ ) \
-  { \
-     __m512i dm = mm512_negate_64( _mm512_and_si512( db, m512_one_64 ) ); \
-     m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
-     m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
-     m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
-     m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \
-     m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \
-     m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \
-     m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \
-     m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \
-     tp += 8; \
-     db = _mm512_srli_epi64( db, 1 ); \
-  } \
-} while (0)
-*/
-
 #define SBOX8( a, b, c, d ) \
 do { \
  __m512i t; \
@@ -632,199 +611,192 @@ do { \

 #define READ_STATE_BIG8(sc) \
 do { \
-   c0 = sc->h[0x0]; \
-   c1 = sc->h[0x1]; \
-   c2 = sc->h[0x2]; \
-   c3 = sc->h[0x3]; \
-   c4 = sc->h[0x4]; \
-   c5 = sc->h[0x5]; \
-   c6 = sc->h[0x6]; \
-   c7 = sc->h[0x7]; \
+   c0 = sc->h[0]; \
+   c1 = sc->h[1]; \
+   c2 = sc->h[2]; \
+   c3 = sc->h[3]; \
+   c4 = sc->h[4]; \
+   c5 = sc->h[5]; \
+   c6 = sc->h[6]; \
+   c7 = sc->h[7]; \
 } while (0)

 #define WRITE_STATE_BIG8(sc) \
 do { \
-   sc->h[0x0] = c0; \
-   sc->h[0x1] = c1; \
-   sc->h[0x2] = c2; \
-   sc->h[0x3] = c3; \
-   sc->h[0x4] = c4; \
-   sc->h[0x5] = c5; \
-   sc->h[0x6] = c6; \
-   sc->h[0x7] = c7; \
+   sc->h[0] = c0; \
+   sc->h[1] = c1; \
+   sc->h[2] = c2; \
+   sc->h[3] = c3; \
+   sc->h[4] = c4; \
+   sc->h[5] = c5; \
+   sc->h[6] = c6; \
+   sc->h[7] = c7; \
 } while (0)

-
 #define ROUND_BIG8( alpha ) \
 do { \
   __m512i t0, t1, t2, t3; \
-   s0 = _mm512_xor_si512( s0, alpha[ 0] ); \
-   s1 = _mm512_xor_si512( s1, alpha[ 1] ); \
-   s2 = _mm512_xor_si512( s2, alpha[ 2] ); \
-   s3 = _mm512_xor_si512( s3, alpha[ 3] ); \
-   s4 = _mm512_xor_si512( s4, alpha[ 4] ); \
-   s5 = _mm512_xor_si512( s5, alpha[ 5] ); \
-   s6 = _mm512_xor_si512( s6, alpha[ 6] ); \
-   s7 = _mm512_xor_si512( s7, alpha[ 7] ); \
-   s8 = _mm512_xor_si512( s8, alpha[ 8] ); \
-   s9 = _mm512_xor_si512( s9, alpha[ 9] ); \
-   sA = _mm512_xor_si512( sA, alpha[10] ); \
-   sB = _mm512_xor_si512( sB, alpha[11] ); \
-   sC = _mm512_xor_si512( sC, alpha[12] ); \
-   sD = _mm512_xor_si512( sD, alpha[13] ); \
-   sE = _mm512_xor_si512( sE, alpha[14] ); \
-   sF = _mm512_xor_si512( sF, alpha[15] ); \
+   s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
+   s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
+   s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
+   s3 = _mm512_xor_si512( s3, alpha[ 3] ); /* c1 */ \
+   s4 = _mm512_xor_si512( s4, alpha[ 4] ); /* c2 */ \
+   s5 = _mm512_xor_si512( s5, alpha[ 5] ); /* m2 */ \
+   s6 = _mm512_xor_si512( s6, alpha[ 6] ); /* c3 */ \
+   s7 = _mm512_xor_si512( s7, alpha[ 7] ); /* m3 */ \
+   s8 = _mm512_xor_si512( s8, alpha[ 8] ); /* m4 */ \
+   s9 = _mm512_xor_si512( s9, alpha[ 9] ); /* c4 */ \
+   sA = _mm512_xor_si512( sA, alpha[10] ); /* m5 */ \
+   sB = _mm512_xor_si512( sB, alpha[11] ); /* c5 */ \
+   sC = _mm512_xor_si512( sC, alpha[12] ); /* c6 */ \
+   sD = _mm512_xor_si512( sD, alpha[13] ); /* m6 */ \
+   sE = _mm512_xor_si512( sE, alpha[14] ); /* c7 */ \
+   sF = _mm512_xor_si512( sF, alpha[15] ); /* m7 */ \
 \
-  SBOX8( s0, s4, s8, sC ); \
-  SBOX8( s1, s5, s9, sD ); \
-  SBOX8( s2, s6, sA, sE ); \
-  SBOX8( s3, s7, sB, sF ); \
+  SBOX8( s0, s4, s8, sC ); /* ( m0, c2, m4, c6 ) */ \
+  SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \
+  SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
+  SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
 \
-  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
-                                        _mm512_bslli_epi128( s5, 4 ) ); \
-  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
-                                        _mm512_bslli_epi128( sE, 4 ) ); \
+  s4 = mm512_swap64_32( s4 ); \
+  s5 = mm512_swap64_32( s5 ); \
+  sD = mm512_swap64_32( sD ); \
+  sE = mm512_swap64_32( sE ); \
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
+  t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
  L8( s0, t1, s9, t3 ); \
-  s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
-  s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
-  sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
-  sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
+  s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \
+  s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \
+  sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \
+  sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \
 \
-  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
-                                        _mm512_bslli_epi128( s6, 4 ) ); \
-  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
-                                        _mm512_bslli_epi128( sF, 4 ) ); \
+  s6 = mm512_swap64_32( s6 ); \
+  sF = mm512_swap64_32( sF ); \
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
+  t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
  L8( s1, t1, sA, t3 ); \
-  s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
-  s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
-  sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
-  sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
+  s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
+  s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \
+  sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \
+  sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
 \
-  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
-                                        _mm512_bslli_epi128( s7, 4 ) ); \
-  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
-                                        _mm512_bslli_epi128( sC, 4 ) ); \
+  s7 = mm512_swap64_32( s7 ); \
+  sC = mm512_swap64_32( sC ); \
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
+  t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
  L8( s2, t1, sB, t3 ); \
-  s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
-  s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
-  sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
-  sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
+  s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \
+  s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \
+  sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \
+  sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \
+  s6 = mm512_swap64_32( s6 ); \
+  sF = mm512_swap64_32( sF ); \
 \
-  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
-                                        _mm512_bslli_epi128( s4, 4 ) ); \
-  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
-                                        _mm512_bslli_epi128( sD, 4 ) ); \
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
+  t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
  L8( s3, t1, s8, t3 ); \
-  s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
-  s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
-  sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
-  sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
+  s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \
+  s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \
+  sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \
+  sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \
+  s7 = mm512_swap64_32( s7 ); \
+  sC = mm512_swap64_32( sC ); \
 \
-  t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
+  t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
-  t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
-  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
-                                        _mm512_bslli_epi128( sB, 4 ) ); \
+  t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
+  t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
+  t3 = mm512_swap64_32( t3 ); \
  L8( t0, t1, t2, t3 ); \
+  t3 = mm512_swap64_32( t3 ); \
  s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
-  s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
+  s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
  s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
  s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
-  s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
+  s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
  sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
-  s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
-  sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
+  s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
+  sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
 \
-  t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
-  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
-                                        _mm512_bslli_epi128( sD, 4 ) ); \
-  t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
+  t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
+  t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
+  t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, sE ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
  L8( t0, t1, t2, t3 ); \
-  s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
+  s4 = _mm512_mask_blend_epi32( 0x5555, s4, t0 ); \
  sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
-  s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
-  sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
+  s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
+  sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t1 ); \
  s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
-  sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
+  sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t2 ); \
  s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
  sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
+  s4 = mm512_swap64_32( s4 ); \
+  s5 = mm512_swap64_32( s5 ); \
+  sD = mm512_swap64_32( sD ); \
+  sE = mm512_swap64_32( sE ); \
 } while (0)

 #define P_BIG8 \
 do { \
   __m512i alpha[16]; \
+   const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
   for( int i = 0; i < 16; i++ ) \
      alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
-                            ^ ( (uint64_t*)alpha_n )[0] ); \
+   alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
-                            ^ ( (uint64_t*)alpha_n )[0] ); \
+   alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
-                            ^ ( (uint64_t*)alpha_n )[0] ); \
+   alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
-                            ^ ( (uint64_t*)alpha_n )[0] ); \
+   alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
-                            ^ ( (uint64_t*)alpha_n )[0] ); \
+   alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
 } while (0)

 #define PF_BIG8 \
 do { \
   __m512i alpha[16]; \
+   const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
   for( int i = 0; i < 16; i++ ) \
      alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
 } while (0)

 #define T_BIG8 \
 do { /* order is important */ \
-   c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
-   c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
-   c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
-   c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
-   c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
-   c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
-   c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
-   c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
+   c7 = sc->h[ 7 ] = _mm512_xor_si512( sc->h[ 7 ], sB ); /* c5 */ \
+   c6 = sc->h[ 6 ] = _mm512_xor_si512( sc->h[ 6 ], sA ); /* m5 */ \
+   c5 = sc->h[ 5 ] = _mm512_xor_si512( sc->h[ 5 ], s9 ); /* c4 */ \
+   c4 = sc->h[ 4 ] = _mm512_xor_si512( sc->h[ 4 ], s8 ); /* m4 */ \
+   c3 = sc->h[ 3 ] = _mm512_xor_si512( sc->h[ 3 ], s3 ); /* c1 */ \
+   c2 = sc->h[ 2 ] = _mm512_xor_si512( sc->h[ 2 ], s2 ); /* m1 */ \
+   c1 = sc->h[ 1 ] = _mm512_xor_si512( sc->h[ 1 ], s1 ); /* c0 */ \
+   c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \
 } while (0)

 void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
@@ -861,7 +833,6 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
   WRITE_STATE_BIG8( sc );
 }

-
 void hamsi512_8way_init( hamsi_8way_big_context *sc )
 {
   sc->partial_len = 0;
@@ -911,11 +882,12 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
 #define INPUT_BIG \
 do { \
  __m256i db = *buf; \
+  const __m256i zero = m256_zero; \
  const uint64_t *tp = (const uint64_t*)T512;  \
-  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
-  for ( int u = 0; u < 64; u++ ) \
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
+  for ( int u = 63; u >= 0; u-- ) \
  { \
-     __m256i dm = mm256_negate_64( _mm256_and_si256( db, m256_one_64 ) ); \
+     __m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
     m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
                                          m256_const1_64( tp[0] ) ) ); \
     m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
@@ -933,7 +905,6 @@ do { \
     m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
                                          m256_const1_64( tp[7] ) ) ); \
     tp += 8; \
-     db = _mm256_srli_epi64( db, 1 ); \
  } \
 } while (0)

@@ -982,47 +953,28 @@ do { \

 #define READ_STATE_BIG(sc) \
 do { \
-   c0 = sc->h[0x0]; \
-   c1 = sc->h[0x1]; \
-   c2 = sc->h[0x2]; \
-   c3 = sc->h[0x3]; \
-   c4 = sc->h[0x4]; \
-   c5 = sc->h[0x5]; \
-   c6 = sc->h[0x6]; \
-   c7 = sc->h[0x7]; \
+   c0 = sc->h[0]; \
+   c1 = sc->h[1]; \
+   c2 = sc->h[2]; \
+   c3 = sc->h[3]; \
+   c4 = sc->h[4]; \
+   c5 = sc->h[5]; \
+   c6 = sc->h[6]; \
+   c7 = sc->h[7]; \
 } while (0)

 #define WRITE_STATE_BIG(sc) \
 do { \
-   sc->h[0x0] = c0; \
-   sc->h[0x1] = c1; \
-   sc->h[0x2] = c2; \
-   sc->h[0x3] = c3; \
-   sc->h[0x4] = c4; \
-   sc->h[0x5] = c5; \
-   sc->h[0x6] = c6; \
-   sc->h[0x7] = c7; \
+   sc->h[0] = c0; \
+   sc->h[1] = c1; \
+   sc->h[2] = c2; \
+   sc->h[3] = c3; \
+   sc->h[4] = c4; \
+   sc->h[5] = c5; \
+   sc->h[6] = c6; \
+   sc->h[7] = c7; \
 } while (0)

-/*
-#define s0   m0
-#define s1   c0
-#define s2   m1
-#define s3   c1
-#define s4   c2
-#define s5   m2
-#define s6   c3
-#define s7   m3
-#define s8   m4
-#define s9   c4
-#define sA   m5
-#define sB   c5
-#define sC   c6
-#define sD   m6
-#define sE   c7
-#define sF   m7
-*/
-
 #define ROUND_BIG( alpha ) \
 do { \
   __m256i t0, t1, t2, t3; \
@@ -1048,151 +1000,145 @@ do { \
  SBOX( s2, s6, sA, sE ); \
  SBOX( s3, s7, sB, sF ); \
 \
-  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \
-                           _mm256_bslli_epi128( s5, 4 ), 0xAA ); \
-  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \
-                           _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
+  s4 = mm256_swap64_32( s4 ); \
+  s5 = mm256_swap64_32( s5 ); \
+  sD = mm256_swap64_32( sD ); \
+  sE = mm256_swap64_32( sE ); \
+  t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \
+  t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \
  L( s0, t1, s9, t3 ); \
-  s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
-  s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
-  sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
-  sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \
+  s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \
+  sD = _mm256_blend_epi32( sD, t3, 0x55 ); \
+  sE = _mm256_blend_epi32( sE, t3, 0xaa ); \
 \
-  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
-                           _mm256_bslli_epi128( s6, 4 ), 0xAA ); \
-  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \
-                           _mm256_bslli_epi128( sF, 4 ), 0xAA ); \
+  s6 = mm256_swap64_32( s6 ); \
+  sF = mm256_swap64_32( sF ); \
+  t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \
+  t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
  L( s1, t1, sA, t3 ); \
-  s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
-  s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
-  sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
-  sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
+  s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \
+  sE = _mm256_blend_epi32( sE, t3, 0x55 ); \
+  sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
 \
-  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \
-                           _mm256_bslli_epi128( s7, 4 ), 0xAA ); \
-  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \
-                           _mm256_bslli_epi128( sC, 4 ), 0xAA ); \
+  s7 = mm256_swap64_32( s7 ); \
+  sC = mm256_swap64_32( sC ); \
+  t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \
+  t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \
  L( s2, t1, sB, t3 ); \
-  s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
-  s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
-  sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
-  sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \
+  s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \
+  sF = _mm256_blend_epi32( sF, t3, 0x55 ); \
+  sC = _mm256_blend_epi32( sC, t3, 0xaa ); \
+  s6 = mm256_swap64_32( s6 ); \
+  sF = mm256_swap64_32( sF ); \
 \
-  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \
-                           _mm256_bslli_epi128( s4, 4 ), 0xAA ); \
-  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \
-                           _mm256_bslli_epi128( sD, 4 ), 0xAA ); \
+  t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \
+  t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
  L( s3, t1, s8, t3 ); \
-  s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
-  s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
-  sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
-  sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \
+  s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \
+  sC = _mm256_blend_epi32( sC, t3, 0x55 ); \
+  sD = _mm256_blend_epi32( sD, t3, 0xaa ); \
+  s7 = mm256_swap64_32( s7 ); \
+  sC = mm256_swap64_32( sC ); \
 \
-  t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \
-  t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \
-  t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \
-  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \
-                           _mm256_bslli_epi128( sB, 4 ), 0xAA ); \
+  t0 = _mm256_blend_epi32( s0, mm256_swap64_32( s8 ), 0xaa ); \
+  t1 = _mm256_blend_epi32( s1, s9, 0xaa ); \
+  t2 = _mm256_blend_epi32( mm256_swap64_32( s2 ), sA, 0xaa ); \
+  t3 = _mm256_blend_epi32( s3, sB, 0x55 ); \
+  t3 = mm256_swap64_32( t3 ); \
  L( t0, t1, t2, t3 ); \
+  t3 = mm256_swap64_32( t3 ); \
  s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
-  s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \
+  s8 = _mm256_blend_epi32( s8, mm256_swap64_32( t0 ), 0x55 ); \
  s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
-  s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \
-  s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \
-  sA = _mm256_blend_epi32( sA, t2, 0xAA ); \
-  s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \
-  sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \
+  s9 = _mm256_blend_epi32( s9, t1, 0xaa ); \
+  s2 = _mm256_blend_epi32( s2, mm256_swap64_32( t2 ), 0xaa ); \
+  sA = _mm256_blend_epi32( sA, t2, 0xaa ); \
+  s3 = _mm256_blend_epi32( s3, t3, 0xaa ); \
+  sB = _mm256_blend_epi32( sB, t3, 0x55 ); \
 \
-  t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \
-  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
-                           _mm256_bslli_epi128( sD, 4 ), 0xAA ); \
-  t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
-  t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \
+  t0 = _mm256_blend_epi32( s4, sC, 0xaa ); \
+  t1 = _mm256_blend_epi32( s5, sD, 0xaa ); \
+  t2 = _mm256_blend_epi32( s6, sE, 0xaa ); \
+  t3 = _mm256_blend_epi32( s7, sF, 0xaa ); \
  L( t0, t1, t2, t3 ); \
-  s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \
-  sC = _mm256_blend_epi32( sC, t0, 0xAA ); \
-  s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \
-  sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \
+  s4 = _mm256_blend_epi32( s4, t0, 0x55 ); \
+  sC = _mm256_blend_epi32( sC, t0, 0xaa ); \
+  s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
+  sD = _mm256_blend_epi32( sD, t1, 0xaa ); \
  s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
-  sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \
+  sE = _mm256_blend_epi32( sE, t2, 0xaa ); \
  s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
-  sF = _mm256_blend_epi32( sF, t3, 0xAA ); \
+  sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
+  s4 = mm256_swap64_32( s4 ); \
+  s5 = mm256_swap64_32( s5 ); \
+  sD = mm256_swap64_32( sD ); \
+  sE = mm256_swap64_32( sE ); \
 } while (0)

 #define P_BIG \
 do { \
   __m256i alpha[16]; \
+   const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
   for( int i = 0; i < 16; i++ ) \
      alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
-                            ^ ( (uint64_t*)alpha_n )[0] ); \
+   alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
-                            ^ ( (uint64_t*)alpha_n )[0] ); \
+   alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
-                            ^ ( (uint64_t*)alpha_n )[0] ); \
+   alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
-                            ^ ( (uint64_t*)alpha_n )[0] ); \
+   alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
-                            ^ ( (uint64_t*)alpha_n )[0] ); \
+   alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
 } while (0)

 #define PF_BIG \
 do { \
   __m256i alpha[16]; \
+   const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
   for( int i = 0; i < 16; i++ ) \
      alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \
-                            ^ ( (uint64_t*)alpha_f )[0] ); \
+   alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
 } while (0)

 #define T_BIG \
 do { /* order is important */ \
-   c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \
-   c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \
-   c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \
-   c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \
-   c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \
-   c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \
-   c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \
-   c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \
+   c7 = sc->h[ 7 ] = _mm256_xor_si256( sc->h[ 7 ], sB ); \
+   c6 = sc->h[ 6 ] = _mm256_xor_si256( sc->h[ 6 ], sA ); \
+   c5 = sc->h[ 5 ] = _mm256_xor_si256( sc->h[ 5 ], s9 ); \
+   c4 = sc->h[ 4 ] = _mm256_xor_si256( sc->h[ 4 ], s8 ); \
+   c3 = sc->h[ 3 ] = _mm256_xor_si256( sc->h[ 3 ], s3 ); \
+   c2 = sc->h[ 2 ] = _mm256_xor_si256( sc->h[ 2 ], s2 ); \
+   c1 = sc->h[ 1 ] = _mm256_xor_si256( sc->h[ 1 ], s1 ); \
+   c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \
 } while (0)

 void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
--- a/algo/hodl/sha512-avx.h
+++ b/algo/hodl/sha512-avx.h
@@ -45,6 +45,6 @@ void sha512Compute32b_parallel(
        uint64_t *data[SHA512_PARALLEL_N],
        uint64_t *digest[SHA512_PARALLEL_N]);

-void sha512ProcessBlock(Sha512Context *context);
+void sha512ProcessBlock(Sha512Context contexti[2] );

 #endif
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -53,7 +53,8 @@ static const uint64_t RC[] = {
 #define WRITE_STATE(sc)

 #define MOV64(d, s)      (d = s)
-#define XOR64_IOTA       XOR64
+#define XOR64_IOTA       XOR
+

 #define LPAR   (
 #define RPAR   )
@@ -71,14 +72,15 @@ static const uint64_t RC[] = {
 // Targetted macros, keccak-macros.h is included for each target.

 #define DECL64(x)          __m512i x
-#define XOR64(d, a, b)     (d = _mm512_xor_si512(a,b))
+#define XOR(d, a, b)     (d = _mm512_xor_si512(a,b))
+#define XOR64 XOR
 #define AND64(d, a, b)     (d = _mm512_and_si512(a,b))
 #define OR64(d, a, b)      (d = _mm512_or_si512(a,b))
 #define NOT64(d, s)        (d = _mm512_xor_si512(s,m512_neg1))
 #define ROL64(d, v, n)     (d = mm512_rol_64(v, n))
 #define XOROR(d, a, b, c)  (d = mm512_xoror(a, b, c))
 #define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
-
+#define XOR3( d, a, b, c ) (d = mm512_xor3( a, b, c ))

 #include "keccak-macros.c"

@@ -236,6 +238,7 @@ keccak512_8way_close(void *cc, void *dst)
 #undef INPUT_BUF
 #undef DECL64
 #undef XOR64
+#undef XOR
 #undef AND64
 #undef OR64
 #undef NOT64
@@ -243,7 +246,7 @@ keccak512_8way_close(void *cc, void *dst)
 #undef KECCAK_F_1600
 #undef XOROR
 #undef XORAND
-
+#undef XOR3
 #endif  // AVX512

 // AVX2
@@ -255,13 +258,15 @@ keccak512_8way_close(void *cc, void *dst)
 } while (0)

 #define DECL64(x)        __m256i x
-#define XOR64(d, a, b)   (d = _mm256_xor_si256(a,b))
+#define XOR(d, a, b)    (d = _mm256_xor_si256(a,b))
+#define XOR64 XOR
 #define AND64(d, a, b)   (d = _mm256_and_si256(a,b))
 #define OR64(d, a, b)    (d = _mm256_or_si256(a,b))
 #define NOT64(d, s)      (d = _mm256_xor_si256(s,m256_neg1))
 #define ROL64(d, v, n)   (d = mm256_rol_64(v, n))
 #define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
 #define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
+#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))

 #include "keccak-macros.c"

@@ -421,6 +426,7 @@ keccak512_4way_close(void *cc, void *dst)
 #undef INPUT_BUF
 #undef DECL64
 #undef XOR64
+#undef XOR
 #undef AND64
 #undef OR64
 #undef NOT64
@@ -428,5 +434,6 @@ keccak512_4way_close(void *cc, void *dst)
 #undef KECCAK_F_1600
 #undef XOROR
 #undef XORAND
+#undef XOR3

 #endif  // AVX2
--- a/algo/keccak/keccak-macros.c
+++ b/algo/keccak/keccak-macros.c
@@ -1,6 +1,19 @@
 #ifdef TH_ELT
 #undef TH_ELT
 #endif
+
+#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
+    DECL64(tt0); \
+    DECL64(tt1); \
+    XOR3( tt0, d0, d1, d4 ); \
+    XOR( tt1, d2, d3 ); \
+    XOR( tt0, tt0, tt1 ); \
+    ROL64( tt0, tt0, 1 ); \
+    XOR3( tt1, c0, c1, c4 ); \
+    XOR3( tt0, tt0, c2, c3 ); \
+    XOR( t, tt0, tt1 ); \
+} while (0)
+/*
 #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
                DECL64(tt0); \
                DECL64(tt1); \
@@ -17,7 +30,7 @@
                XOR64(tt2, tt2, tt3); \
                XOR64(t, tt0, tt2); \
        } while (0)
-
+*/
 #ifdef THETA
 #undef THETA
 #endif
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -13,8 +13,7 @@

 #if defined (ALLIUM_16WAY)  

-typedef struct {
-   blake256_16way_context     blake;
+typedef union {
   keccak256_8way_context    keccak;
   cube_4way_2buf_context    cube;
   skein256_8way_context     skein;
@@ -25,41 +24,31 @@ typedef struct {
 #endif
 } allium_16way_ctx_holder;

-static __thread allium_16way_ctx_holder allium_16way_ctx;
-
-bool init_allium_16way_ctx()
-{
-   keccak256_8way_init( &allium_16way_ctx.keccak );
-   skein256_8way_init( &allium_16way_ctx.skein );
-   return true;
-}
-
-void allium_16way_hash( void *state, const void *input )
+static void allium_16way_hash( void *state, const void *midstate_vars, 
+                               const void *midhash, const void *block )
 {
   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
   uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
   uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
-   uint32_t hash0[8] __attribute__ ((aligned (64)));
-   uint32_t hash1[8] __attribute__ ((aligned (64)));
-   uint32_t hash2[8] __attribute__ ((aligned (64)));
-   uint32_t hash3[8] __attribute__ ((aligned (64)));
-   uint32_t hash4[8] __attribute__ ((aligned (64)));
-   uint32_t hash5[8] __attribute__ ((aligned (64)));
-   uint32_t hash6[8] __attribute__ ((aligned (64)));
-   uint32_t hash7[8] __attribute__ ((aligned (64)));
-   uint32_t hash8[8] __attribute__ ((aligned (64)));
-   uint32_t hash9[8] __attribute__ ((aligned (64)));
-   uint32_t hash10[8] __attribute__ ((aligned (64)));
-   uint32_t hash11[8] __attribute__ ((aligned (64)));
-   uint32_t hash12[8] __attribute__ ((aligned (64)));
-   uint32_t hash13[8] __attribute__ ((aligned (64)));
-   uint32_t hash14[8] __attribute__ ((aligned (64)));
-   uint32_t hash15[8] __attribute__ ((aligned (64)));
+   uint32_t hash0[8] __attribute__ ((aligned (32)));
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   uint32_t hash4[8] __attribute__ ((aligned (32)));
+   uint32_t hash5[8] __attribute__ ((aligned (32)));
+   uint32_t hash6[8] __attribute__ ((aligned (32)));
+   uint32_t hash7[8] __attribute__ ((aligned (32)));
+   uint32_t hash8[8] __attribute__ ((aligned (32)));
+   uint32_t hash9[8] __attribute__ ((aligned (32)));
+   uint32_t hash10[8] __attribute__ ((aligned (32)));
+   uint32_t hash11[8] __attribute__ ((aligned (32)));
+   uint32_t hash12[8] __attribute__ ((aligned (32)));
+   uint32_t hash13[8] __attribute__ ((aligned (32)));
+   uint32_t hash14[8] __attribute__ ((aligned (32)));
+   uint32_t hash15[8] __attribute__ ((aligned (32)));
   allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));

-   memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
-   blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
-   blake256_16way_close( &ctx.blake, vhash );
+   blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );

   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -69,7 +58,7 @@ void allium_16way_hash( void *state, const void *input )
   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
                hash15, 256 );
   
-//   rintrlv_8x32_8x64( vhashA, vhash, 256 );
+   keccak256_8way_init( &ctx.keccak );
   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
   keccak256_8way_close( &ctx.keccak, vhashA);
   keccak256_8way_init( &ctx.keccak );
@@ -152,6 +141,7 @@ void allium_16way_hash( void *state, const void *input )
   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
                hash15, 256 );

+   skein256_8way_init( &ctx.skein );
   skein256_8way_update( &ctx.skein, vhashA, 32 );
   skein256_8way_close( &ctx.skein, vhashA );
   skein256_8way_init( &ctx.skein );
@@ -199,6 +189,7 @@ void allium_16way_hash( void *state, const void *input )
   groestl256_full( &ctx.groestl, state+416, hash13, 256 );
   groestl256_full( &ctx.groestl, state+448, hash14, 256 );
   groestl256_full( &ctx.groestl, state+480, hash15, 256 );
+
 #endif
 }

@@ -206,35 +197,72 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
+   uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
+   __m512i block0_hash[8] __attribute__ ((aligned (64)));
+   __m512i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) = 
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t last_nonce = max_nonce - 16;
-   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
+   const __m512i sixteen = m512_const1_32( 16 );

   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   mm512_bswap32_intrlv80_16x32( vdata, pdata );
-   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
-                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
+   // Prehash first block.
+   blake256_transform_le( phash, pdata, 512, 0 );

-   blake256_16way_init( &allium_16way_ctx.blake );
-   blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
+   // Interleave hash for second block prehash.
+   block0_hash[0] = _mm512_set1_epi32( phash[0] );
+   block0_hash[1] = _mm512_set1_epi32( phash[1] );
+   block0_hash[2] = _mm512_set1_epi32( phash[2] );
+   block0_hash[3] = _mm512_set1_epi32( phash[3] );
+   block0_hash[4] = _mm512_set1_epi32( phash[4] );
+   block0_hash[5] = _mm512_set1_epi32( phash[5] );
+   block0_hash[6] = _mm512_set1_epi32( phash[6] );
+   block0_hash[7] = _mm512_set1_epi32( phash[7] );
+
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces, add padding.
+   block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
+   block_buf[ 3] =
+             _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
+   block_buf[ 4] = m512_const1_32( 0x80000000 );
+   block_buf[ 5] =
+   block_buf[ 6] = 
+   block_buf[ 7] = 
+   block_buf[ 8] = 
+   block_buf[ 9] = 
+   block_buf[10] = 
+   block_buf[11] = 
+   block_buf[12] = m512_zero;
+   block_buf[13] = m512_one_32;
+   block_buf[14] = m512_zero;
+   block_buf[15] = m512_const1_32( 80*8 );
+
+   // Partialy prehash second block without touching nonces in block_buf[3].
+   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-     allium_16way_hash( hash, vdata );
+     allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );

     for ( int lane = 0; lane < 16; lane++ ) 
     if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
     {
-         pdata[19] = bswap_32( n + lane );
-         submit_solution( work, hash+(lane<<3), mythr );
+        pdata[19] = n + lane;
+        submit_solution( work, hash+(lane<<3), mythr );
     }
-     *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
+     block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen ); 
     n += 16;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
   pdata[19] = n;
@@ -244,8 +272,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,

 #elif defined (ALLIUM_8WAY)  

-typedef struct {
-   blake256_8way_context     blake;
+typedef union {
   keccak256_4way_context    keccak;
   cube_2way_context         cube;
   skein256_4way_context     skein;
@@ -256,19 +283,11 @@ typedef struct {
 #endif
 } allium_8way_ctx_holder;

-static __thread allium_8way_ctx_holder allium_8way_ctx;
-
-bool init_allium_8way_ctx()
-{
-   keccak256_4way_init( &allium_8way_ctx.keccak );
-   skein256_4way_init( &allium_8way_ctx.skein );
-   return true;
-}
-
-void allium_8way_hash( void *hash, const void *input )
+static void allium_8way_hash( void *hash, const void *midstate_vars,
+                               const void *midhash, const void *block )
 {
   uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
-   uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
+   uint64_t vhashB[4*8] __attribute__ ((aligned (32)));
   uint64_t *hash0 = (uint64_t*)hash;
   uint64_t *hash1 = (uint64_t*)hash+ 4;
   uint64_t *hash2 = (uint64_t*)hash+ 8;
@@ -279,15 +298,14 @@ void allium_8way_hash( void *hash, const void *input )
   uint64_t *hash7 = (uint64_t*)hash+28;
   allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); 

-   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
-   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
-   blake256_8way_close( &ctx.blake, vhashA );
+   blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );

   dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                     vhashA, 256 );
+                 vhashA, 256 );
   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

+   keccak256_4way_init( &ctx.keccak );
   keccak256_4way_update( &ctx.keccak, vhashA, 32 );
   keccak256_4way_close( &ctx.keccak, vhashA );
   keccak256_4way_init( &ctx.keccak );
@@ -306,7 +324,6 @@ void allium_8way_hash( void *hash, const void *input )
   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );

-
   intrlv_2x128( vhashA, hash0, hash1, 256 );
   intrlv_2x128( vhashB, hash2, hash3, 256 );
   cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
@@ -333,6 +350,7 @@ void allium_8way_hash( void *hash, const void *input )
   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

+   skein256_4way_init( &ctx.skein );
   skein256_4way_update( &ctx.skein, vhashA, 32 );
   skein256_4way_close( &ctx.skein, vhashA );
   skein256_4way_init( &ctx.skein );
@@ -341,8 +359,8 @@ void allium_8way_hash( void *hash, const void *input )

 #if defined(__VAES__)

-   uint64_t vhashC[4*2] __attribute__ ((aligned (64)));
-   uint64_t vhashD[4*2] __attribute__ ((aligned (64)));
+   uint64_t vhashC[4*2] __attribute__ ((aligned (32)));
+   uint64_t vhashD[4*2] __attribute__ ((aligned (32)));
   
   rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 );
   groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
@@ -377,36 +395,72 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint64_t hash[4*8] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
+   __m256i block0_hash[8] __attribute__ ((aligned (64)));
+   __m256i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint64_t *ptarget = (uint64_t*)work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
-   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;  
   const bool bench = opt_benchmark;
+   const __m256i eight = m256_const1_32( 8 );

-   mm256_bswap32_intrlv80_8x32( vdata, pdata );
-   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0 );

-   blake256_8way_init( &allium_8way_ctx.blake );
-   blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
+   block0_hash[0] = _mm256_set1_epi32( phash[0] );
+   block0_hash[1] = _mm256_set1_epi32( phash[1] );
+   block0_hash[2] = _mm256_set1_epi32( phash[2] );
+   block0_hash[3] = _mm256_set1_epi32( phash[3] );
+   block0_hash[4] = _mm256_set1_epi32( phash[4] );
+   block0_hash[5] = _mm256_set1_epi32( phash[5] );
+   block0_hash[6] = _mm256_set1_epi32( phash[6] );
+   block0_hash[7] = _mm256_set1_epi32( phash[7] );
+
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces and add padding.
+   block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
+   block_buf[ 3] =
+            _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
+   block_buf[ 4] = m256_const1_32( 0x80000000 );
+   block_buf[ 5] =
+   block_buf[ 6] =
+   block_buf[ 7] =
+   block_buf[ 8] =
+   block_buf[ 9] =
+   block_buf[10] =
+   block_buf[11] =
+   block_buf[12] = m256_zero;
+   block_buf[13] = m256_one_32;
+   block_buf[14] = m256_zero;
+   block_buf[15] = m256_const1_32( 80*8 );
+
+   // Partialy prehash second block without touching nonces
+   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-     allium_8way_hash( hash, vdata );
+     allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );

     for ( int lane = 0; lane < 8; lane++ )
     {
        const uint64_t *lane_hash = hash + (lane<<2);
        if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
        {
-           pdata[19] = bswap_32( n + lane );
+           pdata[19] = n + lane;
           submit_solution( work, lane_hash, mythr );
        }
     }
     n += 8;
-     *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
+     block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
   } while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -132,11 +132,11 @@ bool register_lyra2z_algo( algo_gate_t* gate )
 #if defined(LYRA2Z_16WAY)
  gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_16way;
-  gate->hash       = (void*)&lyra2z_16way_hash;
+//  gate->hash       = (void*)&lyra2z_16way_hash;
 #elif defined(LYRA2Z_8WAY)
  gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_8way;
-  gate->hash       = (void*)&lyra2z_8way_hash;
+//  gate->hash       = (void*)&lyra2z_8way_hash;
 #elif defined(LYRA2Z_4WAY)
  gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_4way;
@@ -175,13 +175,9 @@ bool register_lyra2h_algo( algo_gate_t* gate )
 bool register_allium_algo( algo_gate_t* gate )
 {
 #if defined (ALLIUM_16WAY)
-  gate->miner_thread_init = (void*)&init_allium_16way_ctx;
  gate->scanhash  = (void*)&scanhash_allium_16way;
-  gate->hash      = (void*)&allium_16way_hash;
 #elif defined (ALLIUM_8WAY)
-  gate->miner_thread_init = (void*)&init_allium_8way_ctx;
  gate->scanhash  = (void*)&scanhash_allium_8way;
-  gate->hash      = (void*)&allium_8way_hash;
 #else
  gate->miner_thread_init = (void*)&init_allium_ctx;
  gate->scanhash  = (void*)&scanhash_allium;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -99,14 +99,14 @@ bool init_lyra2rev2_ctx();

 #if defined(LYRA2Z_16WAY)

-void lyra2z_16way_hash( void *state, const void *input );
+//void lyra2z_16way_hash( void *state, const void *input );
 int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_16way_thread_init();

 #elif defined(LYRA2Z_8WAY)

-void lyra2z_8way_hash( void *state, const void *input );
+//void lyra2z_8way_hash( void *state, const void *input );
 int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_8way_thread_init();
@@ -163,17 +163,13 @@ bool register_allium_algo( algo_gate_t* gate );

 #if defined(ALLIUM_16WAY)

-void allium_16way_hash( void *state, const void *input );
 int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-bool init_allium_16way_ctx();

 #elif defined(ALLIUM_8WAY)

-void allium_8way_hash( void *state, const void *input );
 int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-bool init_allium_8way_ctx();

 #else

--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -14,42 +14,32 @@ bool lyra2z_16way_thread_init()
 return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
 }

-static __thread blake256_16way_context l2z_16way_blake_mid;
-
-void lyra2z_16way_midstate( const void* input )
-{
-       blake256_16way_init( &l2z_16way_blake_mid );
-       blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
-}
-
-void lyra2z_16way_hash( void *state, const void *input )
+static void lyra2z_16way_hash( void *state, const void *midstate_vars,
+                        const void *midhash, const void *block )
 {
    uint32_t vhash[8*16] __attribute__ ((aligned (128)));
-    uint32_t hash0[8] __attribute__ ((aligned (64)));
-    uint32_t hash1[8] __attribute__ ((aligned (64)));
-    uint32_t hash2[8] __attribute__ ((aligned (64)));
-    uint32_t hash3[8] __attribute__ ((aligned (64)));
-    uint32_t hash4[8] __attribute__ ((aligned (64)));
-    uint32_t hash5[8] __attribute__ ((aligned (64)));
-    uint32_t hash6[8] __attribute__ ((aligned (64)));
-    uint32_t hash7[8] __attribute__ ((aligned (64)));
-    uint32_t hash8[8] __attribute__ ((aligned (64)));
-    uint32_t hash9[8] __attribute__ ((aligned (64)));
-    uint32_t hash10[8] __attribute__ ((aligned (64)));
-    uint32_t hash11[8] __attribute__ ((aligned (64)));
-    uint32_t hash12[8] __attribute__ ((aligned (64)));
-    uint32_t hash13[8] __attribute__ ((aligned (64)));
-    uint32_t hash14[8] __attribute__ ((aligned (64)));
-    uint32_t hash15[8] __attribute__ ((aligned (64)));
-    blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
+    uint32_t hash0[8] __attribute__ ((aligned (32)));
+    uint32_t hash1[8] __attribute__ ((aligned (32)));
+    uint32_t hash2[8] __attribute__ ((aligned (32)));
+    uint32_t hash3[8] __attribute__ ((aligned (32)));
+    uint32_t hash4[8] __attribute__ ((aligned (32)));
+    uint32_t hash5[8] __attribute__ ((aligned (32)));
+    uint32_t hash6[8] __attribute__ ((aligned (32)));
+    uint32_t hash7[8] __attribute__ ((aligned (32)));
+    uint32_t hash8[8] __attribute__ ((aligned (32)));
+    uint32_t hash9[8] __attribute__ ((aligned (32)));
+    uint32_t hash10[8] __attribute__ ((aligned (32)));
+    uint32_t hash11[8] __attribute__ ((aligned (32)));
+    uint32_t hash12[8] __attribute__ ((aligned (32)));
+    uint32_t hash13[8] __attribute__ ((aligned (32)));
+    uint32_t hash14[8] __attribute__ ((aligned (32)));
+    uint32_t hash15[8] __attribute__ ((aligned (32)));

-    memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
-    blake256_16way_update( &ctx_blake, input + (64*16), 16 );
-    blake256_16way_close( &ctx_blake, vhash );
+    blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );

    dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
              hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
-               vhash, 256 );
+              vhash, 256 );

    intrlv_2x256( vhash, hash0, hash1, 256 );
    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
@@ -97,40 +87,74 @@ void lyra2z_16way_hash( void *state, const void *input )
 int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint64_t hash[4*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
+   __m512i block0_hash[8] __attribute__ ((aligned (64)));
+   __m512i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (64))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t last_nonce = max_nonce - 16;
-   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
+   const __m512i sixteen = m512_const1_32( 16 );

-   if ( bench )   ptarget[7] = 0x0000ff;
+   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   mm512_bswap32_intrlv80_16x32( vdata, pdata );
-   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0 );
+
+   block0_hash[0] = _mm512_set1_epi32( phash[0] );
+   block0_hash[1] = _mm512_set1_epi32( phash[1] );
+   block0_hash[2] = _mm512_set1_epi32( phash[2] );
+   block0_hash[3] = _mm512_set1_epi32( phash[3] );
+   block0_hash[4] = _mm512_set1_epi32( phash[4] );
+   block0_hash[5] = _mm512_set1_epi32( phash[5] );
+   block0_hash[6] = _mm512_set1_epi32( phash[6] );
+   block0_hash[7] = _mm512_set1_epi32( phash[7] );
+
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces and add padding.
+   block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
+   block_buf[ 3] =
+             _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
-   lyra2z_16way_midstate( vdata );
+   block_buf[ 4] = m512_const1_32( 0x80000000 );
+   block_buf[ 5] =
+   block_buf[ 6] =
+   block_buf[ 7] =
+   block_buf[ 8] =
+   block_buf[ 9] =
+   block_buf[10] =
+   block_buf[11] =
+   block_buf[12] = m512_zero;
+   block_buf[13] = m512_one_32;
+   block_buf[14] = m512_zero;
+   block_buf[15] = m512_const1_32( 80*8 );
+
+   // Partialy prehash second block without touching nonces in block_buf[3].
+   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-      lyra2z_16way_hash( hash, vdata );
-
-      for ( int lane = 0; lane < 16; lane++ )
-      {
-        const uint64_t *lane_hash = hash + (lane<<2);
-        if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
-        {
-           pdata[19] = bswap_32( n + lane );
-           submit_solution( work, lane_hash, mythr );
-        }
-      }
-      *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
-      n += 16;
-   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+     lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );

+     for ( int lane = 0; lane < 16; lane++ )
+     if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
+     {
+        pdata[19] = n + lane;
+        submit_solution( work, hash+(lane<<3), mythr );
+     }
+     block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
+     n += 16;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
@@ -145,30 +169,20 @@ bool lyra2z_8way_thread_init()
 return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
 }

-static __thread blake256_8way_context l2z_8way_blake_mid;
-
-void lyra2z_8way_midstate( const void* input )
-{
-       blake256_8way_init( &l2z_8way_blake_mid );
-       blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
-}
-
-void lyra2z_8way_hash( void *state, const void *input )
+static void lyra2z_8way_hash( void *state, const void *midstate_vars,
+                       const void *midhash, const void *block )
 {
     uint32_t hash0[8] __attribute__ ((aligned (64)));
-     uint32_t hash1[8] __attribute__ ((aligned (64)));
-     uint32_t hash2[8] __attribute__ ((aligned (64)));
-     uint32_t hash3[8] __attribute__ ((aligned (64)));
-     uint32_t hash4[8] __attribute__ ((aligned (64)));
-     uint32_t hash5[8] __attribute__ ((aligned (64)));
-     uint32_t hash6[8] __attribute__ ((aligned (64)));
-     uint32_t hash7[8] __attribute__ ((aligned (64)));
+     uint32_t hash1[8] __attribute__ ((aligned (32)));
+     uint32_t hash2[8] __attribute__ ((aligned (32)));
+     uint32_t hash3[8] __attribute__ ((aligned (32)));
+     uint32_t hash4[8] __attribute__ ((aligned (32)));
+     uint32_t hash5[8] __attribute__ ((aligned (32)));
+     uint32_t hash6[8] __attribute__ ((aligned (32)));
+     uint32_t hash7[8] __attribute__ ((aligned (32)));
     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
-     blake256_8way_context ctx_blake __attribute__ ((aligned (64)));

-     memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
-     blake256_8way_update( &ctx_blake, input + (64*8), 16 );
-     blake256_8way_close( &ctx_blake, vhash );
+     blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );

     dintrlv_8x32( hash0, hash1, hash2, hash3,
                   hash4, hash5, hash6, hash7, vhash, 256 );
@@ -182,7 +196,6 @@ void lyra2z_8way_hash( void *state, const void *input )
     LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
     LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );

-
     memcpy( state,     hash0, 32 );
     memcpy( state+ 32, hash1, 32 );
     memcpy( state+ 64, hash2, 32 );
@@ -197,43 +210,78 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint64_t hash[4*8] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
+   __m256i block0_hash[8] __attribute__ ((aligned (64)));
+   __m256i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t phash[8] __attribute__ ((aligned (32))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+   uint64_t *ptarget = (uint64_t*)work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
-   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
+   const __m256i eight = m256_const1_32( 8 );

-   if ( bench )  ptarget[7] = 0x0000ff;
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0 );

-   mm256_bswap32_intrlv80_8x32( vdata, pdata );
-   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
-   lyra2z_8way_midstate( vdata );
+   block0_hash[0] = _mm256_set1_epi32( phash[0] );
+   block0_hash[1] = _mm256_set1_epi32( phash[1] );
+   block0_hash[2] = _mm256_set1_epi32( phash[2] );
+   block0_hash[3] = _mm256_set1_epi32( phash[3] );
+   block0_hash[4] = _mm256_set1_epi32( phash[4] );
+   block0_hash[5] = _mm256_set1_epi32( phash[5] );
+   block0_hash[6] = _mm256_set1_epi32( phash[6] );
+   block0_hash[7] = _mm256_set1_epi32( phash[7] );
+
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces and add padding.
+   block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
+   block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
+   block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
+   block_buf[ 3] =
+            _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
+   block_buf[ 4] = m256_const1_32( 0x80000000 );
+   block_buf[ 5] =
+   block_buf[ 6] =
+   block_buf[ 7] =
+   block_buf[ 8] =
+   block_buf[ 9] =
+   block_buf[10] =
+   block_buf[11] =
+   block_buf[12] = m256_zero;
+   block_buf[13] = m256_one_32;
+   block_buf[14] = m256_zero;
+   block_buf[15] = m256_const1_32( 80*8 );
+
+   // Partialy prehash second block without touching nonces
+   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-      lyra2z_8way_hash( hash, vdata );
+     lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );

-      for ( int lane = 0; lane < 8; lane++ )
-      {
+     for ( int lane = 0; lane < 8; lane++ )
+     {
        const uint64_t *lane_hash = hash + (lane<<2);
        if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
        {
-           pdata[19] = bswap_32( n + lane );
+           pdata[19] = n + lane;
           submit_solution( work, lane_hash, mythr );
        }
-      }
-      *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
-      n += 8;
-   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
+     }
+     n += 8;
+     block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
+   } while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }

-
 #elif defined(LYRA2Z_4WAY)


--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -261,7 +261,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
 // overlap it's unified.
 // As a result normal is Nrows-2 / Nrows.
 // for 4 rows: 1 unified, 2 overlap, 1 normal.
-// for 8 rows: 1 unified, 2 overlap, 56 normal.
+// for 8 rows: 1 unified, 2 overlap, 5 normal.

 static inline void reducedDuplexRow_2way_normal( uint64_t *State,
                   uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
@@ -283,6 +283,15 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
   for ( i = 0; i < nCols; i++ )
   {
     //Absorbing "M[prev] [+] M[row*]"
+     io0 = _mm512_load_si512( inout0    );
+     io1 = _mm512_load_si512( inout0 +1 );
+     io2 = _mm512_load_si512( inout0 +2 );
+
+     io0 = _mm512_mask_load_epi64( io0, 0xf0, inout1    );
+     io1 = _mm512_mask_load_epi64( io1, 0xf0, inout1 +1 );
+     io2 = _mm512_mask_load_epi64( io2, 0xf0, inout1 +2 );
+
+/*
     io0 = _mm512_mask_blend_epi64( 0xf0,
                                    _mm512_load_si512( (__m512i*)inout0 ),
                                    _mm512_load_si512( (__m512i*)inout1 ) );
@@ -292,6 +301,7 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
     io2 = _mm512_mask_blend_epi64( 0xf0,
                                    _mm512_load_si512( (__m512i*)inout0 +2 ),
                                    _mm512_load_si512( (__m512i*)inout1 +2 ) );
+*/

     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
@@ -359,6 +369,15 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
   for ( i = 0; i < nCols; i++ )
   {
     //Absorbing "M[prev] [+] M[row*]"
+     io0.v512 = _mm512_load_si512( inout0    );
+     io1.v512 = _mm512_load_si512( inout0 +1 );
+     io2.v512 = _mm512_load_si512( inout0 +2 );
+
+     io0.v512 = _mm512_mask_load_epi64( io0.v512, 0xf0, inout1    );
+     io1.v512 = _mm512_mask_load_epi64( io1.v512, 0xf0, inout1 +1 );
+     io2.v512 = _mm512_mask_load_epi64( io2.v512, 0xf0, inout1 +2 );
+
+/*
     io0.v512 = _mm512_mask_blend_epi64( 0xf0,
                                  _mm512_load_si512( (__m512i*)inout0 ),
                                  _mm512_load_si512( (__m512i*)inout1 ) );
@@ -368,27 +387,12 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
     io2.v512 = _mm512_mask_blend_epi64( 0xf0,
                                  _mm512_load_si512( (__m512i*)inout0 +2 ),
                                  _mm512_load_si512( (__m512i*)inout1 +2 ) );
+*/

     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) );
     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) );
     state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) );
     
-/* 
-     io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
-                                  _mm512_load_si512( (__m512i*)inout0 ),
-                                  _mm512_load_si512( (__m512i*)inout1 ) );
-     io.v512[1] = _mm512_mask_blend_epi64( 0xf0,
-                                  _mm512_load_si512( (__m512i*)inout0 +1 ),
-                                  _mm512_load_si512( (__m512i*)inout1 +1 ) );
-     io.v512[2] = _mm512_mask_blend_epi64( 0xf0,
-                                  _mm512_load_si512( (__m512i*)inout0 +2 ),
-                                  _mm512_load_si512( (__m512i*)inout1 +2 ) );
-
-     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
-     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
-     state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
-*/
-
     //Applies the reduced-round transformation f to the sponge's state
     LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );

@@ -415,22 +419,6 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
          io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] );
       }

-/*
-       if ( rowOut == rowInOut0 )
-       {
-          io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
-          io.v512[1] = _mm512_mask_blend_epi64( 0x0f, io.v512[1], out[1] );
-          io.v512[2] = _mm512_mask_blend_epi64( 0x0f, io.v512[2], out[2] );
-
-       }
-       if ( rowOut == rowInOut1 )
-       {
-          io.v512[0] = _mm512_mask_blend_epi64( 0xf0, io.v512[0], out[0] );
-          io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
-          io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
-       }
-*/
-
       //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
       t0 = _mm512_permutex_epi64( state0, 0x93 );
       t1 = _mm512_permutex_epi64( state1, 0x93 );
@@ -444,12 +432,23 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
                                 _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
     }

+/*     
+      casti_m256i( inout0, 0 ) = _mm512_castsi512_si256( io0.v512 );
+      casti_m256i( inout0, 2 ) = _mm512_castsi512_si256( io1.v512 );
+      casti_m256i( inout0, 4 ) = _mm512_castsi512_si256( io2.v512 );
+     _mm512_mask_store_epi64( inout1,    0xf0, io0.v512 );
+     _mm512_mask_store_epi64( inout1 +1, 0xf0, io1.v512 );
+     _mm512_mask_store_epi64( inout1 +2, 0xf0, io2.v512 );
+*/
+
+      
      casti_m256i( inout0, 0 ) = io0.v256lo;
      casti_m256i( inout1, 1 ) = io0.v256hi;
      casti_m256i( inout0, 2 ) = io1.v256lo;
      casti_m256i( inout1, 3 ) = io1.v256hi;
      casti_m256i( inout0, 4 ) = io2.v256lo;
      casti_m256i( inout1, 5 ) = io2.v256hi;
+
 /*     
     _mm512_mask_store_epi64( inout0,    0x0f, io.v512[0] );
     _mm512_mask_store_epi64( inout1,    0xf0, io.v512[0] );
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -64,14 +64,14 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   uint32_t vhashA[16<<3] __attribute__ ((aligned (64)));
   uint32_t vhashB[16<<3] __attribute__ ((aligned (64)));
   uint32_t vhashC[16<<3] __attribute__ ((aligned (64)));
-   uint32_t hash0 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash1 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash2 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash3 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash4 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash5 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash6 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash7 [16]    __attribute__ ((aligned (64)));
+   uint32_t hash0 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash1 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash2 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash3 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash4 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash5 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash6 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash7 [16]    __attribute__ ((aligned (32)));
   hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
   __mmask8 vh_mask;
   const __m512i vmask = m512_const1_64( 24 );
@@ -639,13 +639,13 @@ typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;

 extern void hmq1725_4way_hash(void *state, const void *input)
 {
-   uint32_t hash0 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash1 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash2 [16]    __attribute__ ((aligned (64)));
-   uint32_t hash3 [16]    __attribute__ ((aligned (64)));
   uint32_t vhash [16<<2] __attribute__ ((aligned (64)));
   uint32_t vhashA[16<<2] __attribute__ ((aligned (64)));
   uint32_t vhashB[16<<2] __attribute__ ((aligned (64)));
+   uint32_t hash0 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash1 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash2 [16]    __attribute__ ((aligned (32)));
+   uint32_t hash3 [16]    __attribute__ ((aligned (32)));
   hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
   __m256i vh_mask;     
   int h_mask;
--- a/algo/ripemd/sph_ripemd.c
+++ b/algo/ripemd/sph_ripemd.c
@@ -35,6 +35,7 @@

 #include "sph_ripemd.h"

+#if 0
 /*
 * Round functions for RIPEMD (original).
 */
@@ -46,6 +47,7 @@ static const sph_u32 oIV[5] = {
 	SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
 	SPH_C32(0x98BADCFE), SPH_C32(0x10325476)
 };
+#endif

 /*
 * Round functions for RIPEMD-128 and RIPEMD-160.
@@ -63,6 +65,8 @@ static const sph_u32 IV[5] = {

 #define ROTL    SPH_ROTL32

+#if 0
+
 /* ===================================================================== */
 /*
 * RIPEMD (original hash, deprecated).
@@ -539,6 +543,8 @@ sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4])
 #undef RIPEMD128_IN
 }

+#endif
+
 /* ===================================================================== */
 /*
 * RIPEMD-160.
--- a/algo/ripemd/sph_ripemd.h
+++ b/algo/ripemd/sph_ripemd.h
@@ -84,6 +84,7 @@
 * can be cloned by copying the context (e.g. with a simple
 * <code>memcpy()</code>).
 */
+#if 0
 typedef struct {
 #ifndef DOXYGEN_IGNORE
 	unsigned char buf[64];    /* first field, for alignment */
@@ -204,6 +205,8 @@ void sph_ripemd128_close(void *cc, void *dst);
 */
 void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]);

+#endif
+
 /* ===================================================================== */

 /**
--- a/algo/scrypt/scrypt.c
+++ b/algo/scrypt/scrypt.c
@@ -34,6 +34,7 @@
 #include "algo/sha/sha-hash-4way.h"
 #include "algo/sha/sha256-hash.h"
 #include <mm_malloc.h>
+#include "malloc-huge.h"

 static const uint32_t keypad[12] = {
 	0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
@@ -1487,11 +1488,19 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,

 bool scrypt_miner_thread_init( int thr_id )
 {
-   scratchbuf = _mm_malloc( scratchbuf_size, 128 );
+   scratchbuf = malloc_hugepages( scratchbuf_size );
   if ( scratchbuf )
-      return true;
+   {
+      if ( opt_debug )
+         applog( LOG_NOTICE, "Thread %u is using huge pages", thr_id );
+   }
+   else
+       scratchbuf = _mm_malloc( scratchbuf_size, 128 );
+   
+   if ( scratchbuf ) return true;
+   
   applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id );
-   return false; 
+   return false;
 }

 bool register_scrypt_algo( algo_gate_t* gate )
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -62,8 +62,8 @@ extern "C"{
 #if defined(__AVX2__)

 #define DECL_STATE8   \
-   __m256i A00, A01, A02, A03, A04, A05, A06, A07, \
-           A08, A09, A0A, A0B; \
+   __m256i A0, A1, A2, A3, A4, A5, A6, A7, \
+           A8, A9, AA, AB; \
   __m256i B0, B1, B2, B3, B4, B5, B6, B7, \
           B8, B9, BA, BB, BC, BD, BE, BF; \
   __m256i C0, C1, C2, C3, C4, C5, C6, C7, \
@@ -78,18 +78,18 @@ extern "C"{
 { \
   if ( (state)->state_loaded ) \
   { \
-      A00 = (state)->A[0]; \
-      A01 = (state)->A[1]; \
-      A02 = (state)->A[2]; \
-      A03 = (state)->A[3]; \
-      A04 = (state)->A[4]; \
-      A05 = (state)->A[5]; \
-      A06 = (state)->A[6]; \
-      A07 = (state)->A[7]; \
-      A08 = (state)->A[8]; \
-      A09 = (state)->A[9]; \
-      A0A = (state)->A[10]; \
-      A0B = (state)->A[11]; \
+      A0 = (state)->A[0]; \
+      A1 = (state)->A[1]; \
+      A2 = (state)->A[2]; \
+      A3 = (state)->A[3]; \
+      A4 = (state)->A[4]; \
+      A5 = (state)->A[5]; \
+      A6 = (state)->A[6]; \
+      A7 = (state)->A[7]; \
+      A8 = (state)->A[8]; \
+      A9 = (state)->A[9]; \
+      AA = (state)->A[10]; \
+      AB = (state)->A[11]; \
      B0 = (state)->B[0]; \
      B1 = (state)->B[1]; \
      B2 = (state)->B[2]; \
@@ -126,18 +126,18 @@ extern "C"{
   else \
   { \
       (state)->state_loaded = true; \
-       A00 = m256_const1_64( 0x20728DFD20728DFD ); \
-       A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
-       A02 = m256_const1_64( 0xE782B699E782B699 ); \
-       A03 = m256_const1_64( 0x5530463255304632 ); \
-       A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
-       A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
-       A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
-       A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
-       A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
-       A09 = m256_const1_64( 0x8BD144108BD14410 ); \
-       A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
-       A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
+       A0 = m256_const1_64( 0x20728DFD20728DFD ); \
+       A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
+       A2 = m256_const1_64( 0xE782B699E782B699 ); \
+       A3 = m256_const1_64( 0x5530463255304632 ); \
+       A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
+       A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
+       A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
+       A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
+       A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
+       A9 = m256_const1_64( 0x8BD144108BD14410 ); \
+       AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
+       AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
       B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
       B1 = m256_const1_64( 0x07B385F307B385F3 ); \
       B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
@@ -176,18 +176,18 @@ extern "C"{
 } while (0)

 #define WRITE_STATE8(state)   do { \
-      (state)->A[0] = A00; \
-      (state)->A[1] = A01; \
-      (state)->A[2] = A02; \
-      (state)->A[3] = A03; \
-      (state)->A[4] = A04; \
-      (state)->A[5] = A05; \
-      (state)->A[6] = A06; \
-      (state)->A[7] = A07; \
-      (state)->A[8] = A08; \
-      (state)->A[9] = A09; \
-      (state)->A[10] = A0A; \
-      (state)->A[11] = A0B; \
+      (state)->A[0] = A0; \
+      (state)->A[1] = A1; \
+      (state)->A[2] = A2; \
+      (state)->A[3] = A3; \
+      (state)->A[4] = A4; \
+      (state)->A[5] = A5; \
+      (state)->A[6] = A6; \
+      (state)->A[7] = A7; \
+      (state)->A[8] = A8; \
+      (state)->A[9] = A9; \
+      (state)->A[10] = AA; \
+      (state)->A[11] = AB; \
      (state)->B[0] = B0; \
      (state)->B[1] = B1; \
      (state)->B[2] = B2; \
@@ -286,8 +286,8 @@ do { \

 #define XOR_W8 \
 do { \
-   A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \
-   A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \
+   A0 = _mm256_xor_si256( A0, _mm256_set1_epi32( Wlow ) ); \
+   A1 = _mm256_xor_si256( A1, _mm256_set1_epi32( Whigh ) ); \
 } while (0)

 #define SWAP_BC8 \
@@ -321,60 +321,60 @@ do { \
 } while (0)

 #define PERM_STEP_0_8   do { \
-      PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \
-      PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \
-      PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \
-      PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \
-      PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \
-      PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \
-      PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \
-      PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \
-      PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \
-      PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \
-      PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \
-      PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \
-      PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \
-      PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \
-      PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \
-      PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \
+      PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \
+      PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \
+      PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \
+      PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \
+      PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \
+      PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \
+      PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \
+      PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \
+      PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \
+      PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \
+      PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \
+      PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \
+      PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \
+      PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \
+      PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \
+      PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \
   } while (0)

 #define PERM_STEP_1_8   do { \
-      PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \
-      PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \
-      PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \
-      PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \
-      PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \
-      PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \
-      PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \
-      PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \
-      PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \
-      PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \
-      PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \
-      PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \
-      PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \
-      PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \
-      PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \
-      PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \
+      PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \
+      PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \
+      PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \
+      PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \
+      PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \
+      PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \
+      PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \
+      PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \
+      PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \
+      PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \
+      PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \
+      PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \
+      PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \
+      PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \
+      PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \
+      PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \
   } while (0)

 #define PERM_STEP_2_8   do { \
-      PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \
-      PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \
-      PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \
-      PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \
-      PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \
-      PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \
-      PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \
-      PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \
-      PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \
-      PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \
-      PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \
-      PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \
-      PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \
-      PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \
-      PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \
-      PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \
+      PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \
+      PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \
+      PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \
+      PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \
+      PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \
+      PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \
+      PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \
+      PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \
+      PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \
+      PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \
+      PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \
+      PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \
+      PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \
+      PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \
+      PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \
+      PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \
   } while (0)

 #define APPLY_P8 \
@@ -398,42 +398,42 @@ do { \
    PERM_STEP_0_8; \
    PERM_STEP_1_8; \
    PERM_STEP_2_8; \
-    A0B = _mm256_add_epi32( A0B, C6 ); \
-    A0A = _mm256_add_epi32( A0A, C5 ); \
-    A09 = _mm256_add_epi32( A09, C4 ); \
-    A08 = _mm256_add_epi32( A08, C3 ); \
-    A07 = _mm256_add_epi32( A07, C2 ); \
-    A06 = _mm256_add_epi32( A06, C1 ); \
-    A05 = _mm256_add_epi32( A05, C0 ); \
-    A04 = _mm256_add_epi32( A04, CF ); \
-    A03 = _mm256_add_epi32( A03, CE ); \
-    A02 = _mm256_add_epi32( A02, CD ); \
-    A01 = _mm256_add_epi32( A01, CC ); \
-    A00 = _mm256_add_epi32( A00, CB ); \
-    A0B = _mm256_add_epi32( A0B, CA ); \
-    A0A = _mm256_add_epi32( A0A, C9 ); \
-    A09 = _mm256_add_epi32( A09, C8 ); \
-    A08 = _mm256_add_epi32( A08, C7 ); \
-    A07 = _mm256_add_epi32( A07, C6 ); \
-    A06 = _mm256_add_epi32( A06, C5 ); \
-    A05 = _mm256_add_epi32( A05, C4 ); \
-    A04 = _mm256_add_epi32( A04, C3 ); \
-    A03 = _mm256_add_epi32( A03, C2 ); \
-    A02 = _mm256_add_epi32( A02, C1 ); \
-    A01 = _mm256_add_epi32( A01, C0 ); \
-    A00 = _mm256_add_epi32( A00, CF ); \
-    A0B = _mm256_add_epi32( A0B, CE ); \
-    A0A = _mm256_add_epi32( A0A, CD ); \
-    A09 = _mm256_add_epi32( A09, CC ); \
-    A08 = _mm256_add_epi32( A08, CB ); \
-    A07 = _mm256_add_epi32( A07, CA ); \
-    A06 = _mm256_add_epi32( A06, C9 ); \
-    A05 = _mm256_add_epi32( A05, C8 ); \
-    A04 = _mm256_add_epi32( A04, C7 ); \
-    A03 = _mm256_add_epi32( A03, C6 ); \
-    A02 = _mm256_add_epi32( A02, C5 ); \
-    A01 = _mm256_add_epi32( A01, C4 ); \
-    A00 = _mm256_add_epi32( A00, C3 ); \
+    AB = _mm256_add_epi32( AB, C6 ); \
+    AA = _mm256_add_epi32( AA, C5 ); \
+    A9 = _mm256_add_epi32( A9, C4 ); \
+    A8 = _mm256_add_epi32( A8, C3 ); \
+    A7 = _mm256_add_epi32( A7, C2 ); \
+    A6 = _mm256_add_epi32( A6, C1 ); \
+    A5 = _mm256_add_epi32( A5, C0 ); \
+    A4 = _mm256_add_epi32( A4, CF ); \
+    A3 = _mm256_add_epi32( A3, CE ); \
+    A2 = _mm256_add_epi32( A2, CD ); \
+    A1 = _mm256_add_epi32( A1, CC ); \
+    A0 = _mm256_add_epi32( A0, CB ); \
+    AB = _mm256_add_epi32( AB, CA ); \
+    AA = _mm256_add_epi32( AA, C9 ); \
+    A9 = _mm256_add_epi32( A9, C8 ); \
+    A8 = _mm256_add_epi32( A8, C7 ); \
+    A7 = _mm256_add_epi32( A7, C6 ); \
+    A6 = _mm256_add_epi32( A6, C5 ); \
+    A5 = _mm256_add_epi32( A5, C4 ); \
+    A4 = _mm256_add_epi32( A4, C3 ); \
+    A3 = _mm256_add_epi32( A3, C2 ); \
+    A2 = _mm256_add_epi32( A2, C1 ); \
+    A1 = _mm256_add_epi32( A1, C0 ); \
+    A0 = _mm256_add_epi32( A0, CF ); \
+    AB = _mm256_add_epi32( AB, CE ); \
+    AA = _mm256_add_epi32( AA, CD ); \
+    A9 = _mm256_add_epi32( A9, CC ); \
+    A8 = _mm256_add_epi32( A8, CB ); \
+    A7 = _mm256_add_epi32( A7, CA ); \
+    A6 = _mm256_add_epi32( A6, C9 ); \
+    A5 = _mm256_add_epi32( A5, C8 ); \
+    A4 = _mm256_add_epi32( A4, C7 ); \
+    A3 = _mm256_add_epi32( A3, C6 ); \
+    A2 = _mm256_add_epi32( A2, C5 ); \
+    A1 = _mm256_add_epi32( A1, C4 ); \
+    A0 = _mm256_add_epi32( A0, C3 ); \
 } while (0)

 #define INCR_W8   do { \
@@ -660,8 +660,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)


 #define DECL_STATE   \
-	__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
-	        A08, A09, A0A, A0B; \
+	__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
+	        A8, A9, AA, AB; \
 	__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
 	        B8, B9, BA, BB, BC, BD, BE, BF; \
 	__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
@@ -676,18 +676,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 { \
   if ( (state)->state_loaded ) \
   { \
-      A00 = (state)->A[0]; \
-		A01 = (state)->A[1]; \
-		A02 = (state)->A[2]; \
-		A03 = (state)->A[3]; \
-		A04 = (state)->A[4]; \
-		A05 = (state)->A[5]; \
-		A06 = (state)->A[6]; \
-		A07 = (state)->A[7]; \
-		A08 = (state)->A[8]; \
-		A09 = (state)->A[9]; \
-		A0A = (state)->A[10]; \
-		A0B = (state)->A[11]; \
+      A0 = (state)->A[0]; \
+		A1 = (state)->A[1]; \
+		A2 = (state)->A[2]; \
+		A3 = (state)->A[3]; \
+		A4 = (state)->A[4]; \
+		A5 = (state)->A[5]; \
+		A6 = (state)->A[6]; \
+		A7 = (state)->A[7]; \
+		A8 = (state)->A[8]; \
+		A9 = (state)->A[9]; \
+		AA = (state)->A[10]; \
+		AB = (state)->A[11]; \
 		B0 = (state)->B[0]; \
 		B1 = (state)->B[1]; \
 		B2 = (state)->B[2]; \
@@ -724,18 +724,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
   else \
   { \
       (state)->state_loaded = true; \
-       A00 = m128_const1_64( 0x20728DFD20728DFD ); \
-       A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
-       A02 = m128_const1_64( 0xE782B699E782B699 ); \
-       A03 = m128_const1_64( 0x5530463255304632 ); \
-       A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
-       A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
-       A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
-       A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
-       A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
-       A09 = m128_const1_64( 0x8BD144108BD14410 ); \
-       A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
-       A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
+       A0 = m128_const1_64( 0x20728DFD20728DFD ); \
+       A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
+       A2 = m128_const1_64( 0xE782B699E782B699 ); \
+       A3 = m128_const1_64( 0x5530463255304632 ); \
+       A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
+       A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
+       A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
+       A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
+       A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
+       A9 = m128_const1_64( 0x8BD144108BD14410 ); \
+       AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
+       AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
       B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \
       B1 = m128_const1_64( 0x07B385F307B385F3 ); \
       B2 = m128_const1_64( 0xE7442C26E7442C26 ); \
@@ -774,18 +774,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 } while (0)

 #define WRITE_STATE(state)   do { \
-		(state)->A[0] = A00; \
-		(state)->A[1] = A01; \
-		(state)->A[2] = A02; \
-		(state)->A[3] = A03; \
-		(state)->A[4] = A04; \
-		(state)->A[5] = A05; \
-		(state)->A[6] = A06; \
-		(state)->A[7] = A07; \
-		(state)->A[8] = A08; \
-		(state)->A[9] = A09; \
-		(state)->A[10] = A0A; \
-		(state)->A[11] = A0B; \
+		(state)->A[0] = A0; \
+		(state)->A[1] = A1; \
+		(state)->A[2] = A2; \
+		(state)->A[3] = A3; \
+		(state)->A[4] = A4; \
+		(state)->A[5] = A5; \
+		(state)->A[6] = A6; \
+		(state)->A[7] = A7; \
+		(state)->A[8] = A8; \
+		(state)->A[9] = A9; \
+		(state)->A[10] = AA; \
+		(state)->A[11] = AB; \
 		(state)->B[0] = B0; \
 		(state)->B[1] = B1; \
 		(state)->B[2] = B2; \
@@ -884,8 +884,8 @@ do { \

 #define XOR_W \
 do { \
-   A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
-   A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
+   A0 = _mm_xor_si128( A0, _mm_set1_epi32( Wlow ) ); \
+   A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
 } while (0)


@@ -940,60 +940,60 @@ do { \
 } while (0)

 #define PERM_STEP_0   do { \
-		PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
-		PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
-		PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
-		PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
-		PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
-		PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
-		PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
-		PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
-		PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
-		PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
-		PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
-		PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
-		PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
-		PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
-		PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
-		PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
+		PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A2, A1, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A3, A2, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A4, A3, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A5, A4, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A6, A5, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A7, A6, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A8, A7, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A9, A8, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(AA, A9, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(AB, AA, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A0, AB, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A1, A0, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A2, A1, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A3, A2, BF, BC, B8, B5, C9, MF); \
 	} while (0)

 #define PERM_STEP_1   do { \
-		PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
-		PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
-		PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
-		PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
-		PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
-		PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
-		PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
-		PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
-		PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
-		PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
-		PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
-		PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
-		PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
-		PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
-		PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
-		PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
+		PERM_ELT(A4, A3, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A5, A4, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A6, A5, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A7, A6, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A8, A7, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A9, A8, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(AA, A9, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(AB, AA, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A0, AB, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A1, A0, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A2, A1, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A3, A2, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A4, A3, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A5, A4, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A6, A5, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A7, A6, BF, BC, B8, B5, C9, MF); \
 	} while (0)

 #define PERM_STEP_2   do { \
-		PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
-		PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
-		PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
-		PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
-		PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
-		PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
-		PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
-		PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
-		PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
-		PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
-		PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
-		PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
-		PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
-		PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
-		PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
-		PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
+		PERM_ELT(A8, A7, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A9, A8, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(AA, A9, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(AB, AA, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A0, AB, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A1, A0, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A2, A1, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A3, A2, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A4, A3, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A5, A4, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A6, A5, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A7, A6, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A8, A7, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A9, A8, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(AA, A9, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(AB, AA, BF, BC, B8, B5, C9, MF); \
 	} while (0)

 #define APPLY_P \
@@ -1017,42 +1017,42 @@ do { \
    PERM_STEP_0; \
    PERM_STEP_1; \
    PERM_STEP_2; \
-    A0B = _mm_add_epi32( A0B, C6 ); \
-    A0A = _mm_add_epi32( A0A, C5 ); \
-    A09 = _mm_add_epi32( A09, C4 ); \
-    A08 = _mm_add_epi32( A08, C3 ); \
-    A07 = _mm_add_epi32( A07, C2 ); \
-    A06 = _mm_add_epi32( A06, C1 ); \
-    A05 = _mm_add_epi32( A05, C0 ); \
-    A04 = _mm_add_epi32( A04, CF ); \
-    A03 = _mm_add_epi32( A03, CE ); \
-    A02 = _mm_add_epi32( A02, CD ); \
-    A01 = _mm_add_epi32( A01, CC ); \
-    A00 = _mm_add_epi32( A00, CB ); \
-    A0B = _mm_add_epi32( A0B, CA ); \
-    A0A = _mm_add_epi32( A0A, C9 ); \
-    A09 = _mm_add_epi32( A09, C8 ); \
-    A08 = _mm_add_epi32( A08, C7 ); \
-    A07 = _mm_add_epi32( A07, C6 ); \
-    A06 = _mm_add_epi32( A06, C5 ); \
-    A05 = _mm_add_epi32( A05, C4 ); \
-    A04 = _mm_add_epi32( A04, C3 ); \
-    A03 = _mm_add_epi32( A03, C2 ); \
-    A02 = _mm_add_epi32( A02, C1 ); \
-    A01 = _mm_add_epi32( A01, C0 ); \
-    A00 = _mm_add_epi32( A00, CF ); \
-    A0B = _mm_add_epi32( A0B, CE ); \
-    A0A = _mm_add_epi32( A0A, CD ); \
-    A09 = _mm_add_epi32( A09, CC ); \
-    A08 = _mm_add_epi32( A08, CB ); \
-    A07 = _mm_add_epi32( A07, CA ); \
-    A06 = _mm_add_epi32( A06, C9 ); \
-    A05 = _mm_add_epi32( A05, C8 ); \
-    A04 = _mm_add_epi32( A04, C7 ); \
-    A03 = _mm_add_epi32( A03, C6 ); \
-    A02 = _mm_add_epi32( A02, C5 ); \
-    A01 = _mm_add_epi32( A01, C4 ); \
-    A00 = _mm_add_epi32( A00, C3 ); \
+    AB = _mm_add_epi32( AB, C6 ); \
+    AA = _mm_add_epi32( AA, C5 ); \
+    A9 = _mm_add_epi32( A9, C4 ); \
+    A8 = _mm_add_epi32( A8, C3 ); \
+    A7 = _mm_add_epi32( A7, C2 ); \
+    A6 = _mm_add_epi32( A6, C1 ); \
+    A5 = _mm_add_epi32( A5, C0 ); \
+    A4 = _mm_add_epi32( A4, CF ); \
+    A3 = _mm_add_epi32( A3, CE ); \
+    A2 = _mm_add_epi32( A2, CD ); \
+    A1 = _mm_add_epi32( A1, CC ); \
+    A0 = _mm_add_epi32( A0, CB ); \
+    AB = _mm_add_epi32( AB, CA ); \
+    AA = _mm_add_epi32( AA, C9 ); \
+    A9 = _mm_add_epi32( A9, C8 ); \
+    A8 = _mm_add_epi32( A8, C7 ); \
+    A7 = _mm_add_epi32( A7, C6 ); \
+    A6 = _mm_add_epi32( A6, C5 ); \
+    A5 = _mm_add_epi32( A5, C4 ); \
+    A4 = _mm_add_epi32( A4, C3 ); \
+    A3 = _mm_add_epi32( A3, C2 ); \
+    A2 = _mm_add_epi32( A2, C1 ); \
+    A1 = _mm_add_epi32( A1, C0 ); \
+    A0 = _mm_add_epi32( A0, CF ); \
+    AB = _mm_add_epi32( AB, CE ); \
+    AA = _mm_add_epi32( AA, CD ); \
+    A9 = _mm_add_epi32( A9, CC ); \
+    A8 = _mm_add_epi32( A8, CB ); \
+    A7 = _mm_add_epi32( A7, CA ); \
+    A6 = _mm_add_epi32( A6, C9 ); \
+    A5 = _mm_add_epi32( A5, C8 ); \
+    A4 = _mm_add_epi32( A4, C7 ); \
+    A3 = _mm_add_epi32( A3, C6 ); \
+    A2 = _mm_add_epi32( A2, C5 ); \
+    A1 = _mm_add_epi32( A1, C4 ); \
+    A0 = _mm_add_epi32( A0, C3 ); \
 } while (0)

 #define INCR_W   do { \
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -18,10 +18,13 @@ static const uint32_t IV512[] =
        0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
 };

-
+/*
 #define mm256_ror2x256hi_1x32( a, b ) \
   _mm256_blend_epi32( mm256_shuflr128_32( a ), \
                       mm256_shuflr128_32( b ), 0x88 )
+*/
+
+//#define mm256_ror2x256hi_1x32( a, b ) _mm256_alignr_epi8( b, a, 4 )

 #if defined(__VAES__)

@@ -127,24 +130,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
     
     // round 2, 6, 10

-     k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
+     k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ), zero );
-     k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
+     k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
-     k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
+     k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
-     k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
+     k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

     p2 = _mm256_xor_si256( p2, x );

-     k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
+     k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ), zero );
-     k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
+     k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
-     k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
+     k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
-     k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
+     k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

     p0 = _mm256_xor_si256( p0, x );
@@ -183,24 +186,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )

     // round 4, 8, 12

-     k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
+     k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
-     k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
+     k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
-     k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
+     k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
-     k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
+     k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );

     p0 = _mm256_xor_si256( p0, x );

-     k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
+     k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero );
-     k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
+     k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
-     k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
+     k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
-     k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
+     k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

     p2 = _mm256_xor_si256( p2, x );
--- a/algo/shavite/shavite-hash-4way.c
+++ b/algo/shavite/shavite-hash-4way.c
@@ -11,10 +11,6 @@ static const uint32_t IV512[] =
        0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
 };

-#define mm512_ror2x512hi_1x32( a, b ) \
-   _mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \
-                                    mm512_shuflr128_32( b ) )
-
 static void
 c512_4way( shavite512_4way_context *ctx, const void *msg )
 {
@@ -106,24 +102,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
     
     // round 2, 6, 10

-     K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
+     K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
-     K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
+     K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
-     K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
+     K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
-     K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
+     K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

     P2 = _mm512_xor_si512( P2, X );

-     K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
+     K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
-     K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
+     K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
-     K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
+     K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
-     K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
+     K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

     P0 = _mm512_xor_si512( P0, X );
@@ -162,24 +158,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )

     // round 4, 8, 12

-     K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
+     K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
-     K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
+     K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
-     K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
+     K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
-     K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
+     K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

     P0 = _mm512_xor_si512( P0, X );

-     K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
+     K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
-     K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
+     K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
-     K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
+     K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
-     K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
+     K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

     P2 = _mm512_xor_si512( P2, X );
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -59,30 +59,6 @@ static const sph_u32 IV512[] = {
 	C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
 };

-// Partially rotate elements in two 128 bit vectors a & b as one 256 bit vector
-// and return the rotated 128 bit vector a.
-// a[3:0] = { b[0], a[3], a[2], a[1] }
-#if defined(__SSSE3__)
-
-#define mm128_ror256hi_1x32( a, b )  _mm_alignr_epi8( b, a, 4 )
-
-#else  // SSE2
-
-#define mm128_ror256hi_1x32( a, b ) \
-   _mm_or_si128( _mm_srli_si128( a,  4 ), \
-                 _mm_slli_si128( b, 12 ) )
-
-#endif
-
-/*
-#if defined(__AVX2__)
-// 2 way version of above
-// a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] }
-#define mm256_ror2x256hi_1x32( a, b ) \
-   _mm256_blend_epi32( mm256_ror256_1x32( a ), \
-                       mm256_rol256_3x32( b ), 0x88 )
-#endif
-*/

 static void
 c512( sph_shavite_big_context *sc, const void *msg )
@@ -190,31 +166,31 @@ c512( sph_shavite_big_context *sc, const void *msg )

      // round 2, 6, 10

-      k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
+      k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
      x = _mm_xor_si128( p3, k00 );
      x = _mm_aesenc_si128( x, zero );
-      k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
+      k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, zero );
-      k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
+      k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, zero );
-      k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
+      k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, zero );

      p2 = _mm_xor_si128( p2, x );

-      k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
+      k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
      x = _mm_xor_si128( p1, k10 );
      x = _mm_aesenc_si128( x, zero );
-      k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
+      k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, zero );
-      k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
+      k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, zero );
-      k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
+      k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, zero );

@@ -262,31 +238,31 @@ c512( sph_shavite_big_context *sc, const void *msg )

      // round 4, 8, 12

-      k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
+      k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
      x = _mm_xor_si128( p1, k00 );
      x = _mm_aesenc_si128( x, zero );
-      k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
+      k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, zero );
-      k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
+      k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, zero );
-      k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
+      k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, zero );

      p0 = _mm_xor_si128( p0, x );

-      k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
+      k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
      x = _mm_xor_si128( p3, k10 );
      x = _mm_aesenc_si128( x, zero );
-      k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
+      k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, zero );
-      k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
+      k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, zero );
-      k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
+      k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, zero );

--- a/algo/shavite/sph_shavite.c
+++ b/algo/shavite/sph_shavite.c
@@ -35,7 +35,7 @@

 #include "sph_shavite.h"

-#if !defined(__AES__)
+#if !(defined(__AES__) && defined(__SSSE3__))

 #ifdef __cplusplus
 extern "C"{
--- a/algo/shavite/sph_shavite.h
+++ b/algo/shavite/sph_shavite.h
@@ -263,7 +263,7 @@ void sph_shavite384_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);

 //Don't call these directly from application code, use the macros below.
-#ifdef __AES__
+#if defined(__AES__) && defined(__SSSE3__)

 void sph_shavite512_aesni_init(void *cc);
 void sph_shavite512_aesni(void *cc, const void *data, size_t len);
--- a/algo/sm3/sph_sm3.h
+++ b/algo/sm3/sph_sm3.h
@@ -74,7 +74,7 @@ typedef struct {

 void sm3_init(sm3_ctx_t *ctx);
 void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len);
-void sm3_final(sm3_ctx_t *ctx, unsigned char digest[SM3_DIGEST_LENGTH]);
+void sm3_final(sm3_ctx_t *ctx, unsigned char *digest);
 void sm3_compress(uint32_t digest[8], const unsigned char block[SM3_BLOCK_SIZE]);
 void sm3(const unsigned char *data, size_t datalen,
 	unsigned char digest[SM3_DIGEST_LENGTH]);
--- a/algo/verthash/Verthash.c
+++ b/algo/verthash/Verthash.c
@@ -10,6 +10,7 @@
 #include "algo-gate-api.h"
 #include "Verthash.h"
 #include "mm_malloc.h"
+#include "malloc-huge.h"

 //-----------------------------------------------------------------------------
 // Verthash info management
@@ -84,10 +85,17 @@ int verthash_info_init(verthash_info_t* info, const char* file_name)
    }

    // Allocate data
-    info->data = (uint8_t *)_mm_malloc( fileSize, 64 );
-    if (!info->data)
+    info->data = (uint8_t *)malloc_hugepages( fileSize );
+    if ( info->data )
    {
-        fclose(fileMiningData);
+       if ( !opt_quiet ) applog( LOG_INFO, "Verthash data is using huge pages");
+    }
+    else
+       info->data = (uint8_t *)_mm_malloc( fileSize, 64 );
+
+    if ( !info->data )
+    {
+        fclose( fileMiningData );
        // Memory allocation fatal error.
        return 2;
    }
--- a/algo/verthash/tiny_sha3/sha3-4way.c
+++ b/algo/verthash/tiny_sha3/sha3-4way.c
@@ -29,16 +29,11 @@ void sha3_4way_keccakf( __m256i st[25] )
   for ( r = 0; r < KECCAKF_ROUNDS; r++ )
   {
      // Theta
-      bc[0] = _mm256_xor_si256( st[0],
-                           mm256_xor4( st[5], st[10], st[15], st[20] ) );
-      bc[1] = _mm256_xor_si256( st[1],
-                           mm256_xor4( st[6], st[11], st[16], st[21] ) );
-      bc[2] = _mm256_xor_si256( st[2],
-                           mm256_xor4( st[7], st[12], st[17], st[22] ) );
-      bc[3] = _mm256_xor_si256( st[3],
-                           mm256_xor4( st[8], st[13], st[18], st[23] ) );
-      bc[4] = _mm256_xor_si256( st[4],
-                           mm256_xor4( st[9], st[14], st[19], st[24] ) );
+      bc[0] = mm256_xor3( st[0], st[5], mm256_xor3( st[10], st[15], st[20] ) );
+      bc[1] = mm256_xor3( st[1], st[6], mm256_xor3( st[11], st[16], st[21] ) );
+      bc[2] = mm256_xor3( st[2], st[7], mm256_xor3( st[12], st[17], st[22] ) );
+      bc[3] = mm256_xor3( st[3], st[8], mm256_xor3( st[13], st[18], st[23] ) );
+      bc[4] = mm256_xor3( st[4], st[9], mm256_xor3( st[14], st[19], st[24] ) );

      for ( i = 0; i < 5; i++ )
      {
@@ -89,17 +84,13 @@ void sha3_4way_keccakf( __m256i st[25] )
      //  Chi
      for ( j = 0; j < 25; j += 5 )
      {
-         memcpy( bc, &st[ j ], 5*32 );
-         st[ j   ] = _mm256_xor_si256( st[ j   ],
-                                       _mm256_andnot_si256( bc[1], bc[2] ) );
-         st[ j+1 ] = _mm256_xor_si256( st[ j+1 ],
-                                       _mm256_andnot_si256( bc[2], bc[3] ) );
-         st[ j+2 ] = _mm256_xor_si256( st[ j+2 ],
-                                       _mm256_andnot_si256( bc[3], bc[4] ) );
-         st[ j+3 ] = _mm256_xor_si256( st[ j+3 ],
-                                       _mm256_andnot_si256( bc[4], bc[0] ) );
-         st[ j+4 ] = _mm256_xor_si256( st[ j+4 ],
-                                       _mm256_andnot_si256( bc[0], bc[1] ) );
+         bc[0] = st[j];
+         bc[1] = st[j+1];
+         st[ j   ] = mm256_xorandnot( st[ j   ], st[j+1], st[j+2] );
+         st[ j+1 ] = mm256_xorandnot( st[ j+1 ], st[j+2], st[j+3] );
+         st[ j+2 ] = mm256_xorandnot( st[ j+2 ], st[j+3], st[j+4] );
+         st[ j+3 ] = mm256_xorandnot( st[ j+3 ], st[j+4], bc[0] );
+         st[ j+4 ] = mm256_xorandnot( st[ j+4 ], bc[0], bc[1] );
      }

      //  Iota
--- a/algo/verthash/verthash-gate.c
+++ b/algo/verthash/verthash-gate.c
@@ -127,7 +127,7 @@ bool register_verthash_algo( algo_gate_t* gate )
 {
  opt_target_factor = 256.0;
  gate->scanhash  = (void*)&scanhash_verthash;
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = SSE42_OPT | AVX2_OPT;
   
  const char *verthash_data_file = opt_data_file ? opt_data_file
                                                 : default_verthash_data_file;
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -16,7 +16,8 @@

 #if defined (X16R_8WAY)

-// Perform midstate prehash of hash functions with block size <= 72 bytes.
+// Perform midstate prehash of hash functions with block size <= 72 bytes,
+// 76 bytes for hash functions that operate on 32 bit data.

 void x16r_8way_prehash( void *vdata, void *pdata )
 {
@@ -44,18 +45,36 @@ void x16r_8way_prehash( void *vdata, void *pdata )
         skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
      break;
      case LUFFA:
+      {
+         hashState_luffa ctx_luffa;
         mm128_bswap32_80( edata, pdata );
-         intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
-         luffa_4way_init( &x16r_ctx.luffa, 512 );
-         luffa_4way_update( &x16r_ctx.luffa, vdata2, 64 );
-         rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );            
+         init_luffa( &ctx_luffa, 512 );
+         update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
+         intrlv_4x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
+                  ctx_luffa.buffer, ctx_luffa.buffer, ctx_luffa.buffer, 512 );
+         intrlv_4x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
+                  ctx_luffa.chainv, ctx_luffa.chainv, ctx_luffa.chainv, 1280 );
+         x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
+         x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
+      }
      break;
      case CUBEHASH:
+      {
+         cubehashParam ctx_cube;
         mm128_bswap32_80( edata, pdata );
-         intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
-         cube_4way_init( &x16r_ctx.cube, 512, 16, 32 );
-         cube_4way_update( &x16r_ctx.cube, vdata2, 64 );
-         rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );            
+         cubehashInit( &ctx_cube, 512, 16, 32 );
+         cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
+         x16r_ctx.cube.hashlen = ctx_cube.hashlen;
+         x16r_ctx.cube.rounds = ctx_cube.rounds;
+         x16r_ctx.cube.blocksize = ctx_cube.blocksize;
+         x16r_ctx.cube.pos = ctx_cube.pos;
+         intrlv_4x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, ctx_cube.x,
+                                        ctx_cube.x, 1024 );
+      }
      break;
      case HAMSI:
         mm512_bswap32_intrlv80_8x64( vdata, pdata );
@@ -94,14 +113,14 @@ void x16r_8way_prehash( void *vdata, void *pdata )
 int x16r_8way_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*8] __attribute__ ((aligned (128)));
-   uint32_t hash0[20] __attribute__ ((aligned (64)));
-   uint32_t hash1[20] __attribute__ ((aligned (64)));
-   uint32_t hash2[20] __attribute__ ((aligned (64)));
-   uint32_t hash3[20] __attribute__ ((aligned (64)));
-   uint32_t hash4[20] __attribute__ ((aligned (64)));
-   uint32_t hash5[20] __attribute__ ((aligned (64)));
-   uint32_t hash6[20] __attribute__ ((aligned (64)));
-   uint32_t hash7[20] __attribute__ ((aligned (64)));
+   uint32_t hash0[20] __attribute__ ((aligned (16)));
+   uint32_t hash1[20] __attribute__ ((aligned (16)));
+   uint32_t hash2[20] __attribute__ ((aligned (16)));
+   uint32_t hash3[20] __attribute__ ((aligned (16)));
+   uint32_t hash4[20] __attribute__ ((aligned (16)));
+   uint32_t hash5[20] __attribute__ ((aligned (16)));
+   uint32_t hash6[20] __attribute__ ((aligned (16)));
+   uint32_t hash7[20] __attribute__ ((aligned (16)));
   x16r_8way_context_overlay ctx;
   memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
@@ -476,7 +495,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t bedata1[2] __attribute__((aligned(64)));
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -500,7 +519,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
      s_ntime = ntime;

      if ( opt_debug && !thr_id )
-          applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+          applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
   }

   x16r_8way_prehash( vdata, pdata );
@@ -552,18 +571,33 @@ void x16r_4way_prehash( void *vdata, void *pdata )
         skein512_4way_prehash64( &x16r_ctx.skein, vdata );
      break;
      case LUFFA:
+      {
+         hashState_luffa ctx_luffa;
         mm128_bswap32_80( edata, pdata );
-         intrlv_2x128( vdata2, edata, edata, 640 );
-         luffa_2way_init( &x16r_ctx.luffa, 512 );
-         luffa_2way_update( &x16r_ctx.luffa, vdata2, 64 );
-         rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
-         break;
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+         init_luffa( &ctx_luffa, 512 );
+         update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
+         intrlv_2x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
+                                              ctx_luffa.buffer, 512 );
+         intrlv_2x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
+                                              ctx_luffa.chainv, 1280 );
+         x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
+         x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
+      }
+      break;
      case CUBEHASH:
+      {
+         cubehashParam ctx_cube;
         mm128_bswap32_80( edata, pdata );
-         intrlv_2x128( vdata2, edata, edata, 640 );
-         cube_2way_init( &x16r_ctx.cube, 512, 16, 32 );
-         cube_2way_update( &x16r_ctx.cube, vdata2, 64 );
-         rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+         cubehashInit( &ctx_cube, 512, 16, 32 );
+         cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
+         x16r_ctx.cube.hashlen = ctx_cube.hashlen;
+         x16r_ctx.cube.rounds = ctx_cube.rounds;
+         x16r_ctx.cube.blocksize = ctx_cube.blocksize;
+         x16r_ctx.cube.pos = ctx_cube.pos;
+         intrlv_2x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, 1024 );
+      }
      break;
      case HAMSI:
         mm256_bswap32_intrlv80_4x64( vdata, pdata );
@@ -596,10 +630,10 @@ void x16r_4way_prehash( void *vdata, void *pdata )
 int x16r_4way_hash_generic( void* output, const void* input, int thrid )
 {
   uint32_t vhash[20*4] __attribute__ ((aligned (128)));
-   uint32_t hash0[20] __attribute__ ((aligned (64)));
-   uint32_t hash1[20] __attribute__ ((aligned (64)));
-   uint32_t hash2[20] __attribute__ ((aligned (64)));
-   uint32_t hash3[20] __attribute__ ((aligned (64)));
+   uint32_t hash0[20] __attribute__ ((aligned (32)));
+   uint32_t hash1[20] __attribute__ ((aligned (32)));
+   uint32_t hash2[20] __attribute__ ((aligned (32)));
+   uint32_t hash3[20] __attribute__ ((aligned (32)));
   x16r_4way_context_overlay ctx;
   memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
@@ -890,7 +924,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t bedata1[2] __attribute__((aligned(64)));
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -913,7 +947,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-         applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
+         applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
   }

   x16r_4way_prehash( vdata, pdata );
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -24,15 +24,15 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
   if ( bench )   ptarget[7] = 0x0cff;

   static __thread uint32_t s_ntime = UINT32_MAX;
-   uint32_t ntime = bswap_32( pdata[17] );
-   if ( s_ntime != ntime )
+   uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
+   if ( s_ntime != masked_ntime )
   {
-      x16rt_getTimeHash( ntime, &timeHash );
+      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
-      s_ntime = ntime;
-      if ( opt_debug && !thr_id )
-          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
-                               x16r_hash_order, ntime, timeHash );
+      s_ntime = masked_ntime;
+      if ( !thr_id )
+          applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
+                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
   }

   x16r_8way_prehash( vdata, pdata );
@@ -78,15 +78,15 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
   if ( bench )  ptarget[7] = 0x0cff;

   static __thread uint32_t s_ntime = UINT32_MAX;
-   uint32_t ntime = bswap_32( pdata[17] );
-   if ( s_ntime != ntime )
+   uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
+   if ( s_ntime != masked_ntime )
   {
-      x16rt_getTimeHash( ntime, &timeHash );
+      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
-      s_ntime = ntime;
-      if ( opt_debug && !thr_id )
-          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
-                               x16r_hash_order, ntime, timeHash );
+      s_ntime = masked_ntime;
+      if ( !thr_id )
+          applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
+                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
   }

   x16r_4way_prehash( vdata, pdata );
--- a/algo/x16/x16rt.c
+++ b/algo/x16/x16rt.c
@@ -20,15 +20,15 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
   mm128_bswap32_80( edata, pdata );

   static __thread uint32_t s_ntime = UINT32_MAX;
-   uint32_t ntime = swab32( pdata[17] );
-   if ( s_ntime != ntime )
+   uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80;
+   if ( s_ntime != masked_ntime )
   {
-      x16rt_getTimeHash( ntime, &timeHash );
+      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
-      s_ntime = ntime;
+      s_ntime = masked_ntime;
      if ( opt_debug && !thr_id )
          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
-                               x16r_hash_order, ntime, timeHash );
+                        x16r_hash_order, swab32( pdata[17] ), timeHash );
   }
   
   x16r_prehash( edata, pdata );
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -45,14 +45,14 @@ static __thread x16rv2_8way_context_overlay x16rv2_ctx;
 int x16rv2_8way_hash( void* output, const void* input, int thrid )
 {
   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
-   uint32_t hash0[24] __attribute__ ((aligned (64)));
-   uint32_t hash1[24] __attribute__ ((aligned (64)));
-   uint32_t hash2[24] __attribute__ ((aligned (64)));
-   uint32_t hash3[24] __attribute__ ((aligned (64)));
-   uint32_t hash4[24] __attribute__ ((aligned (64)));
-   uint32_t hash5[24] __attribute__ ((aligned (64)));
-   uint32_t hash6[24] __attribute__ ((aligned (64)));
-   uint32_t hash7[24] __attribute__ ((aligned (64)));
+   uint32_t hash0[24] __attribute__ ((aligned (32)));
+   uint32_t hash1[24] __attribute__ ((aligned (32)));
+   uint32_t hash2[24] __attribute__ ((aligned (32)));
+   uint32_t hash3[24] __attribute__ ((aligned (32)));
+   uint32_t hash4[24] __attribute__ ((aligned (32)));
+   uint32_t hash5[24] __attribute__ ((aligned (32)));
+   uint32_t hash6[24] __attribute__ ((aligned (32)));
+   uint32_t hash7[24] __attribute__ ((aligned (32)));
   x16rv2_8way_context_overlay ctx;
   memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
@@ -706,11 +706,11 @@ inline void padtiger512( uint32_t* hash )

 int x16rv2_4way_hash( void* output, const void* input, int thrid )
 {
-   uint32_t hash0[20] __attribute__ ((aligned (64)));
-   uint32_t hash1[20] __attribute__ ((aligned (64)));
-   uint32_t hash2[20] __attribute__ ((aligned (64)));
-   uint32_t hash3[20] __attribute__ ((aligned (64)));
   uint32_t vhash[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash0[20] __attribute__ ((aligned (32)));
+   uint32_t hash1[20] __attribute__ ((aligned (32)));
+   uint32_t hash2[20] __attribute__ ((aligned (32)));
+   uint32_t hash3[20] __attribute__ ((aligned (32)));
   x16rv2_4way_context_overlay ctx;
   memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
   void *in0 = (void*) hash0;
@@ -1054,8 +1054,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
   uint32_t hash[4*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   uint32_t bedata1[2] __attribute__((aligned(64)));
+   uint32_t edata[20];
+   uint32_t bedata1[2];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -1068,7 +1068,6 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,

   if ( bench )  ptarget[7] = 0x0fff;
   
-
   bedata1[0] = bswap_32( pdata[1] );
   bedata1[1] = bswap_32( pdata[2] );

--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
@@ -63,14 +63,14 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
-     uint64_t hash0[8] __attribute__ ((aligned (64)));
-     uint64_t hash1[8] __attribute__ ((aligned (64)));
-     uint64_t hash2[8] __attribute__ ((aligned (64)));
-     uint64_t hash3[8] __attribute__ ((aligned (64)));
-     uint64_t hash4[8] __attribute__ ((aligned (64)));
-     uint64_t hash5[8] __attribute__ ((aligned (64)));
-     uint64_t hash6[8] __attribute__ ((aligned (64)));
-     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (32)));
+     uint64_t hash1[8] __attribute__ ((aligned (32)));
+     uint64_t hash2[8] __attribute__ ((aligned (32)));
+     uint64_t hash3[8] __attribute__ ((aligned (32)));
+     uint64_t hash4[8] __attribute__ ((aligned (32)));
+     uint64_t hash5[8] __attribute__ ((aligned (32)));
+     uint64_t hash6[8] __attribute__ ((aligned (32)));
+     uint64_t hash7[8] __attribute__ ((aligned (32)));
     sonoa_8way_context_overlay ctx;

 // 1
@@ -1150,13 +1150,13 @@ typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;

 int sonoa_4way_hash( void *state, const void *input, int thr_id )
 {
-     uint64_t hash0[8] __attribute__ ((aligned (64)));
-     uint64_t hash1[8] __attribute__ ((aligned (64)));
-     uint64_t hash2[8] __attribute__ ((aligned (64)));
-     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (32)));
+     uint64_t hash1[8] __attribute__ ((aligned (32)));
+     uint64_t hash2[8] __attribute__ ((aligned (32)));
+     uint64_t hash3[8] __attribute__ ((aligned (32)));
     sonoa_4way_context_overlay ctx;

 // 1
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -58,23 +58,27 @@ union _x17_8way_context_overlay
 } __attribute__ ((aligned (64)));
 typedef union _x17_8way_context_overlay x17_8way_context_overlay;

+static __thread __m512i x17_8way_midstate[16] __attribute__((aligned(64)));
+static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
+
 int x17_8way_hash( void *state, const void *input, int thr_id )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
-     uint64_t hash0[8] __attribute__ ((aligned (64)));
-     uint64_t hash1[8] __attribute__ ((aligned (64)));
-     uint64_t hash2[8] __attribute__ ((aligned (64)));
-     uint64_t hash3[8] __attribute__ ((aligned (64)));
-     uint64_t hash4[8] __attribute__ ((aligned (64)));
-     uint64_t hash5[8] __attribute__ ((aligned (64)));
-     uint64_t hash6[8] __attribute__ ((aligned (64)));
-     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (32)));
+     uint64_t hash1[8] __attribute__ ((aligned (32)));
+     uint64_t hash2[8] __attribute__ ((aligned (32)));
+     uint64_t hash3[8] __attribute__ ((aligned (32)));
+     uint64_t hash4[8] __attribute__ ((aligned (32)));
+     uint64_t hash5[8] __attribute__ ((aligned (32)));
+     uint64_t hash6[8] __attribute__ ((aligned (32)));
+     uint64_t hash7[8] __attribute__ ((aligned (32)));
     x17_8way_context_overlay ctx;

-     blake512_8way_full( &ctx.blake, vhash, input, 80 );
-
+     blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
+                             x17_8way_midstate );
+     
     bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );

 #if defined(__VAES__)
@@ -122,9 +126,6 @@ int x17_8way_hash( void *state, const void *input, int thr_id )

     cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
     
-//     cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
-//     cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
-
 #if defined(__VAES__)

     shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
@@ -237,6 +238,61 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
     return 1;
 }

+int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash32[8*8] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   __m128i edata[5] __attribute__ ((aligned (64)));
+   uint32_t *hash32_d7 = &(hash32[7*8]);
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   __m512i  *noncev = (__m512i*)vdata + 9;
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const uint32_t targ32_d7 = ptarget[7];
+   const __m512i eight = m512_const1_64( 8 );
+   const bool bench = opt_benchmark;
+
+   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
+   edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
+   edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
+   edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
+   edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
+
+   mm512_intrlv80_8x64( vdata, edata );
+
+   *noncev = mm512_intrlv_blend_32( *noncev,
+                           _mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
+                                             0, n+3, 0, n+2, 0, n+1, 0, n ) );
+   blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
+   
+   do
+   {
+      if ( likely( x17_8way_hash( hash32, vdata, thr_id ) ) )
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) )
+      {
+         extr_lane_8x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) ) )
+         {
+            pdata[19] =  n + lane;
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      *noncev = _mm512_add_epi32( *noncev, eight );
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+
+
 #elif defined(X17_4WAY)

 union _x17_4way_context_overlay
@@ -271,10 +327,10 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
     uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
-     uint64_t hash0[8] __attribute__ ((aligned (64)));
-     uint64_t hash1[8] __attribute__ ((aligned (64)));
-     uint64_t hash2[8] __attribute__ ((aligned (64)));
-     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (32)));
+     uint64_t hash1[8] __attribute__ ((aligned (32)));
+     uint64_t hash2[8] __attribute__ ((aligned (32)));
+     uint64_t hash3[8] __attribute__ ((aligned (32)));
     x17_4way_context_overlay ctx;

     blake512_4way_full( &ctx.blake, vhash, input, 80 );
--- a/algo/x17/x17-gate.c
+++ b/algo/x17/x17-gate.c
@@ -3,7 +3,7 @@
 bool register_x17_algo( algo_gate_t* gate )
 {
 #if defined (X17_8WAY)
-  gate->scanhash  = (void*)&scanhash_8way_64in_32out;
+  gate->scanhash  = (void*)&scanhash_x17_8way;
  gate->hash      = (void*)&x17_8way_hash;
 #elif defined (X17_4WAY)
  gate->scanhash  = (void*)&scanhash_4way_64in_32out;
--- a/algo/x17/x17-gate.h
+++ b/algo/x17/x17-gate.h
@@ -14,10 +14,15 @@ bool register_x17_algo( algo_gate_t* gate );

 #if defined(X17_8WAY)

+int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );
+
 int x17_8way_hash( void *state, const void *input, int thr_id );

 #elif defined(X17_4WAY)

+int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );
 int x17_4way_hash( void *state, const void *input, int thr_id );

 #endif
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -62,14 +62,14 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
     uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
     uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
     uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
-     uint64_t hash0[16] __attribute__ ((aligned (64)));
-     uint64_t hash1[16] __attribute__ ((aligned (64)));
-     uint64_t hash2[16] __attribute__ ((aligned (64)));
-     uint64_t hash3[16] __attribute__ ((aligned (64)));
-     uint64_t hash4[16] __attribute__ ((aligned (64)));
-     uint64_t hash5[16] __attribute__ ((aligned (64)));
-     uint64_t hash6[16] __attribute__ ((aligned (64)));
-     uint64_t hash7[16] __attribute__ ((aligned (64)));
+     uint64_t hash0[16] __attribute__ ((aligned (32)));
+     uint64_t hash1[16] __attribute__ ((aligned (32)));
+     uint64_t hash2[16] __attribute__ ((aligned (32)));
+     uint64_t hash3[16] __attribute__ ((aligned (32)));
+     uint64_t hash4[16] __attribute__ ((aligned (32)));
+     uint64_t hash5[16] __attribute__ ((aligned (32)));
+     uint64_t hash6[16] __attribute__ ((aligned (32)));
+     uint64_t hash7[16] __attribute__ ((aligned (32)));
     const int dataLen = 128;
     xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));

@@ -430,13 +430,13 @@ typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;

 int xevan_4way_hash( void *output, const void *input, int thr_id )
 {
-     uint64_t hash0[16] __attribute__ ((aligned (64)));
-     uint64_t hash1[16] __attribute__ ((aligned (64)));
-     uint64_t hash2[16] __attribute__ ((aligned (64)));
-     uint64_t hash3[16] __attribute__ ((aligned (64)));
     uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
     uint64_t vhashA[16<<2] __attribute__ ((aligned (64)));
     uint64_t vhashB[16<<2] __attribute__ ((aligned (64)));
+     uint64_t hash0[16] __attribute__ ((aligned (32)));
+     uint64_t hash1[16] __attribute__ ((aligned (32)));
+     uint64_t hash2[16] __attribute__ ((aligned (32)));
+     uint64_t hash3[16] __attribute__ ((aligned (32)));
     const int dataLen = 128;
     xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));

--- a/algo/x22/x22i-4way.c
+++ b/algo/x22/x22i-4way.c
@@ -21,7 +21,6 @@
 #include "algo/tiger/sph_tiger.h"
 #include "algo/lyra2/lyra2.h"
 #include "algo/gost/sph_gost.h"
-#include "algo/swifftx/swifftx.h"
 #if defined(__VAES__)
  #include "algo/groestl/groestl512-hash-4way.h"
  #include "algo/shavite/shavite-hash-4way.h"
--- a/algo/x22/x22i-gate.c
+++ b/algo/x22/x22i-gate.c
@@ -50,6 +50,7 @@ bool register_x25x_algo( algo_gate_t* gate )
 #endif
  gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
                        AVX512_OPT | VAES_OPT;
+  InitializeSWIFFTX();
  return true;
 };

--- a/algo/x22/x22i-gate.h
+++ b/algo/x22/x22i-gate.h
@@ -5,6 +5,7 @@
 #include "simd-utils.h"
 #include <stdint.h>
 #include <unistd.h>
+#include "algo/swifftx/swifftx.h"

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define X22I_8WAY 1
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -24,7 +24,6 @@
 #include "algo/tiger/sph_tiger.h"
 #include "algo/lyra2/lyra2.h"
 #include "algo/gost/sph_gost.h"
-#include "algo/swifftx/swifftx.h"
 #include "algo/panama/panama-hash-4way.h"
 #include "algo/lanehash/lane.h"
 #if defined(__VAES__)
@@ -102,6 +101,9 @@ union _x25x_8way_ctx_overlay
 };
 typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay;

+static __thread __m512i x25x_8way_midstate[16] __attribute__((aligned(64)));
+static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
+
 int x25x_8way_hash( void *output, const void *input, int thrid )
 {
   uint64_t vhash[8*8] __attribute__ ((aligned (128)));
@@ -118,9 +120,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
   uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
   x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64)));

-   blake512_8way_init( &ctx.blake );
-   blake512_8way_update( &ctx.blake, input, 80 );
-   blake512_8way_close( &ctx.blake, vhash );
+   blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
+                                                x25x_8way_midstate );
+
   dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0],
                     hash4[0], hash5[0], hash6[0], hash7[0], vhash );

@@ -271,7 +273,6 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
   intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10],
                           hash4[10], hash5[10], hash6[10], hash7[10] );

-   
 #else

   init_echo( &ctx.echo, 512 );
@@ -558,6 +559,7 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[8*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   __m128i edata[5] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hashd7 = &(hash[7*8]);
   uint32_t *pdata = work->data;
@@ -569,15 +571,22 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
   const int thr_id = mythr->id;
   const uint32_t targ32 = ptarget[7];
   const bool bench = opt_benchmark;
-
+   const __m512i eight = m512_const1_64( 8 );
   if ( bench )  ptarget[7] = 0x08ff;

-   InitializeSWIFFTX();
+   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) ); 
+   edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );   
+   edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );   
+   edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );   
+   edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );   
+
+   mm512_intrlv80_8x64( vdata, edata );
+
+   *noncev = mm512_intrlv_blend_32( *noncev,
+                           _mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
+                                             0, n+3, 0, n+2, 0, n+1, 0, n ) );
+   blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata ); 

-   mm512_bswap32_intrlv80_8x64( vdata, pdata );
-   *noncev = mm512_intrlv_blend_32(
-              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
   do
   {
      if ( x25x_8way_hash( hash, vdata, thr_id ) );
@@ -588,12 +597,11 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
         extr_lane_8x32( lane_hash, hash, lane, 256 );
         if ( likely( valid_hash( lane_hash, ptarget ) ) )
         {
-            pdata[19] = bswap_32( n + lane );
+            pdata[19] = n + lane;
            submit_solution( work, lane_hash, mythr );
         }
      }
-      *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+      *noncev = _mm512_add_epi32( *noncev, eight );
      n += 8;
   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
@@ -637,8 +645,12 @@ union _x25x_4way_ctx_overlay
    panama_4way_context     panama;
    blake2s_4way_state      blake2s;
 };
+
 typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay;

+static __thread __m256i x25x_4way_midstate[16] __attribute__((aligned(64)));
+static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
+
 int x25x_4way_hash( void *output, const void *input, int thrid )
 {
   uint64_t vhash[8*4] __attribute__ ((aligned (128)));
@@ -651,7 +663,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
   uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
   x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));

-   blake512_4way_full( &ctx.blake, vhash, input, 80 );
+   blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
+                                                x25x_4way_midstate );
+   
   dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );

   bmw512_4way_init( &ctx.bmw );
@@ -905,6 +919,7 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   __m128i edata[5] __attribute__ ((aligned (64)));
   uint32_t *hashd7 = &(hash[ 7*4 ]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -914,15 +929,23 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const uint32_t targ32 = ptarget[7];
+   const __m256i four = m256_const1_64( 4 );
   const bool bench = opt_benchmark;

   if ( bench ) ptarget[7] = 0x08ff;

-   InitializeSWIFFTX();
+   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
+   edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
+   edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
+   edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
+   edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );

-   mm256_bswap32_intrlv80_4x64( vdata, pdata );
-   *noncev = mm256_intrlv_blend_32(
-                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
+   mm256_intrlv80_4x64( vdata, edata );
+
+   *noncev = mm256_intrlv_blend_32( *noncev,
+                           _mm256_set_epi32( 0, n+3, 0, n+2, 0, n+1, 0, n ) );
+   blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata );
+   
   do
   {
      if ( x25x_4way_hash( hash, vdata, thr_id ) )
@@ -932,12 +955,11 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
         extr_lane_4x32( lane_hash, hash, lane, 256 );
         if ( valid_hash( lane_hash, ptarget ) )
         {
-            pdata[19] = bswap_32( n + lane );
+            pdata[19] = n + lane;
            submit_solution( work, lane_hash, mythr );
         }
      }
-      *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+      *noncev = _mm256_add_epi32( *noncev, four );
      n += 4;
   } while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -36,8 +36,8 @@ mv cpuminer cpuminer-avx2-sha-vaes
 # AVX2 SHA AES: AMD Zen1
 make clean || echo done
 rm -f config.status
-CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl
-#CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl
+#CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl
+CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl
 make -j 8
 strip -s cpuminer
 mv cpuminer cpuminer-avx2-sha
--- a/build-msys2.sh
+++ b/build-msys2.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+#
+# Compile on Windows using MSYS2 and MinGW.
+
+make distclean || echo clean
+rm -f config.status
+./autogen.sh || echo done
+CFLAGS="-O3 --param=evrp-mode=legacy -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
+make -j 4
+strip -s cpuminer
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.2.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.9.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.19.2'
-PACKAGE_STRING='cpuminer-opt 3.19.2'
+PACKAGE_VERSION='3.19.9'
+PACKAGE_STRING='cpuminer-opt 3.19.9'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.19.2 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.19.9 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.19.2:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.19.9:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.19.2
+cpuminer-opt configure 3.19.9
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.19.2, which was
+It was created by cpuminer-opt $as_me 3.19.9, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.19.2'
+ VERSION='3.19.9'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.19.2, which was
+This file was extended by cpuminer-opt $as_me 3.19.9, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.19.2
+cpuminer-opt config.status 3.19.9
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.19.2])
+AC_INIT([cpuminer-opt], [3.19.9])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -105,8 +105,9 @@ bool opt_randomize = false;
 static int opt_retries = -1;
 static int opt_fail_pause = 10;
 static int opt_time_limit = 0;
+static unsigned int time_limit_stop = 0;
 int opt_timeout = 300;
-static int opt_scantime = 5;
+static int opt_scantime = 0;
 const int min_scantime = 1;
 //static const bool opt_time = true;
 enum algos opt_algo = ALGO_NULL;
@@ -127,6 +128,12 @@ char *short_url = NULL;
 char *coinbase_address;
 char *opt_data_file = NULL;
 bool opt_verify = false;
+static bool opt_stratum_keepalive = false;
+static struct timeval stratum_keepalive_timer;
+// Stratum typically times out in 5 minutes or 300 seconds
+#define stratum_keepalive_timeout 180  // 3 minutes
+static struct timeval stratum_reset_time;
+

 // pk_buffer_size is used as a version selector by b58 code, therefore
 // it must be set correctly to work.
@@ -187,7 +194,6 @@ int default_api_listen = 4048;
 static struct   timeval session_start;
 static struct   timeval five_min_start;
 static uint64_t session_first_block = 0;
-static double   latency_sum = 0.;
 static uint64_t submit_sum  = 0;
 static uint64_t accept_sum  = 0;
 static uint64_t stale_sum  = 0;
@@ -336,6 +342,7 @@ void get_currentalgo(char* buf, int sz)

 void proper_exit(int reason)
 {
+   if (opt_debug) applog(LOG_INFO,"Program exit");
 #ifdef WIN32
 	if (opt_background) {
 		HWND hcon = GetConsoleWindow();
@@ -1092,7 +1099,7 @@ void report_summary_log( bool force )
   sprintf_et( et_str, et.tv_sec );
   sprintf_et( upt_str, uptime.tv_sec );

-   applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
+   applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], rpc_url );
   applog2( LOG_NOTICE, "Periodic Report     %s        %s", et_str, upt_str );
   applog2( LOG_INFO, "Share rate        %.2f/min     %.2f/min",
            submit_rate, safe_div( (double)submitted_share_count*60.,
@@ -1143,7 +1150,7 @@ void report_summary_log( bool force )
               solved, solved_block_count );
   }
   if ( stratum_errors )
-      applog2( LOG_INFO, "Stratum errors               %7d", stratum_errors );
+      applog2( LOG_INFO, "Stratum resets               %7d", stratum_errors );

   applog2( LOG_INFO, "Hi/Lo Share Diff  %.5g /  %.5g",
            highest_share, lowest_share );
@@ -1274,7 +1281,6 @@ static int share_result( int result, struct work *work,
      else          reject_sum++;
   }
   submit_sum++;
-   latency_sum += latency;

   pthread_mutex_unlock( &stats_lock );

@@ -1290,9 +1296,9 @@ static int share_result( int result, struct work *work,
     else              rcol = CL_LRD;
   }

-   applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)",
+   applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s%s, %.3f sec (%dms)",
           my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
-           bres, share_time, latency );
+           bres, CL_N, share_time, latency );

   if ( unlikely( opt_debug || !result || solved ) )
   {
@@ -2110,7 +2116,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
   {
      unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
                                             g_work->xnonce2_len );
-      applog( LOG_INFO, "Extranonce2 %s, Block %d, Job %s",
+      applog( LOG_INFO, "Extranonce2 0x%s, Block %d, Job %s",
                        xnonce2str, sctx->block_height, g_work->job_id );
      free( xnonce2str );
   }
@@ -2197,8 +2203,6 @@ static void *miner_thread( void *userdata )
 //                      : 0;
   uint32_t end_nonce = 0xffffffffU / opt_n_threads  * (thr_id + 1) - 0x20;

-   time_t   firstwork_time = 0;
-   int  i;
   memset( &work, 0, sizeof(work) );
 
   /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE
@@ -2242,7 +2246,7 @@ static void *miner_thread( void *userdata )

   if ( !algo_gate.miner_thread_init( thr_id ) )
   {
-      applog( LOG_ERR, "FAIL: thread %u failed to initialize", thr_id );
+      applog( LOG_ERR, "FAIL: thread %d failed to initialize", thr_id );
      exit (1);
   }

@@ -2270,22 +2274,34 @@ static void *miner_thread( void *userdata )
          {
             while ( unlikely( stratum_down ) )
                sleep( 1 );
-             if ( *nonceptr >= end_nonce )
-                stratum_gen_work( &stratum, &g_work );
+             if ( unlikely( ( *nonceptr >= end_nonce )
+                         && !work_restart[thr_id].restart ) )
+             {
+                if ( opt_extranonce )
+                   stratum_gen_work( &stratum, &g_work );
+                else
+                {
+                   if ( !thr_id )
+                   {
+                      applog( LOG_WARNING, "nonce range exhausted, extranonce not subscribed" );
+                      applog( LOG_WARNING, "waiting for new work...");
+                   }
+                   while ( !work_restart[thr_id].restart )
+                      sleep ( 1 );
+                }
+             }
          }
-          else
+          else if ( !opt_benchmark ) // GBT or getwork
          {
             pthread_rwlock_wrlock( &g_work_lock );

-             if ( ( ( time(NULL) - g_work_time )
-                 >= ( have_longpoll ? LP_SCANTIME : opt_scantime ) )
+             if ( ( ( time(NULL) - g_work_time ) >= opt_scantime )
               || ( *nonceptr >= end_nonce ) )
             {
                if ( unlikely( !get_work( mythr, &g_work ) ) )
                {
                   pthread_rwlock_unlock( &g_work_lock );
-		             applog( LOG_ERR, "work retrieval failed, exiting "
-		                              "mining thread %d", thr_id );
+		             applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
 		             goto out;
 	             }
                g_work_time = time(NULL);
@@ -2308,25 +2324,14 @@ static void *miner_thread( void *userdata )
       if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
          continue;

-// LP_SCANTIME overrides opt_scantime option, is this right?
-
-       // adjust max_nonce to meet target scan time. Stratum and longpoll
-       // can go longer because they can rely on restart_threads to signal
-       // an early abort. get_work on the other hand can't rely on
-       // restart_threads so need a much shorter scantime
-       if ( have_stratum )
-          max64 = 60 * thr_hashrates[thr_id];
-       else if ( have_longpoll )
-          max64 = LP_SCANTIME * thr_hashrates[thr_id];
-       else  // getwork inline
-          max64 = opt_scantime * thr_hashrates[thr_id];   
+       // opt_scantime expressed in hashes
+       max64 = opt_scantime * thr_hashrates[thr_id];

       // time limit
-       if ( unlikely( opt_time_limit && firstwork_time ) )
+       if ( unlikely( opt_time_limit ) )
       {
-          int passed = (int)( time(NULL) - firstwork_time );
-          int remain = (int)( opt_time_limit - passed );
-          if ( remain < 0 )
+          unsigned int now = (unsigned int)time(NULL);
+          if ( now >= time_limit_stop )
          {
             if ( thr_id != 0 )
             {
@@ -2338,14 +2343,16 @@ static void *miner_thread( void *userdata )
                char rate[32];
                format_hashrate( global_hashrate, rate );
                applog( LOG_NOTICE, "Benchmark: %s", rate );
-                fprintf(stderr, "%llu\n", (unsigned long long)global_hashrate);
             }
             else
-                applog( LOG_NOTICE,
-	          "Mining timeout of %ds reached, exiting...", opt_time_limit);
-	       proper_exit(0);
+                applog( LOG_NOTICE, "Mining timeout of %ds reached, exiting...",
+                        opt_time_limit);
+
+             proper_exit(0);
          }
-          if ( remain < max64 ) max64 = remain;
+          // else
+          if ( time_limit_stop - now < opt_scantime )
+              max64 = ( time_limit_stop - now ) * thr_hashrates[thr_id] ;
       }

       // Select nonce range based on max64, the estimated number of hashes
@@ -2361,8 +2368,6 @@ static void *miner_thread( void *userdata )
          max_nonce = work_nonce + (uint32_t)max64;

       // init time
-       if ( firstwork_time == 0 )
-          firstwork_time = time(NULL);
       hashes_done = 0;
       gettimeofday( (struct timeval *) &tv_start, NULL );

@@ -2435,7 +2440,7 @@ static void *miner_thread( void *userdata )
       {
          double hashrate  = 0.;
          pthread_mutex_lock( &stats_lock );
-          for ( i = 0; i < opt_n_threads; i++ )
+          for ( int i = 0; i < opt_n_threads; i++ )
              hashrate  += thr_hashrates[i];
          global_hashrate  = hashrate;
          pthread_mutex_unlock( &stats_lock );
@@ -2729,6 +2734,18 @@ void std_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
          sctx->job.final_sapling_hash );
 }

+// Loop is out of order:
+//
+//   connect/reconnect
+//   handle message
+//   get new message
+//
+// change to
+//   connect/reconnect
+//   get new message
+//   handle message
+
+
 static void *stratum_thread(void *userdata )
 {
   struct thr_info *mythr = (struct thr_info *) userdata;
@@ -2737,7 +2754,7 @@ static void *stratum_thread(void *userdata )
   stratum.url = (char*) tq_pop(mythr->q, NULL);
   if (!stratum.url)
      goto out;
-   applog( LOG_BLUE, "Stratum connect %s", short_url );
+   applog( LOG_BLUE, "Stratum connect %s", stratum.url );

   while (1)
   {
@@ -2746,6 +2763,7 @@ static void *stratum_thread(void *userdata )
      if ( unlikely( stratum_need_reset ) )
      {
          stratum_need_reset = false;
+          gettimeofday( &stratum_reset_time, NULL );
          stratum_down = true;
          stratum_errors++;
          stratum_disconnect( &stratum );
@@ -2756,7 +2774,7 @@ static void *stratum_thread(void *userdata )
 	          applog(LOG_BLUE, "Connection changed to %s", short_url);
          }
          else 
-	          applog(LOG_WARNING, "Stratum connection reset");
+	          applog(LOG_BLUE, "Stratum connection reset");
          // reset stats queue as well
          restart_threads();
          if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0;
@@ -2788,15 +2806,12 @@ static void *stratum_thread(void *userdata )
         {
            stratum_down = false;
            applog(LOG_BLUE,"Stratum connection established" );
+            if ( stratum.new_job )   // prime first job
+               stratum_gen_work( &stratum, &g_work );
         }
      }

-      report_summary_log( ( stratum_diff != stratum.job.diff )
-                       && ( stratum_diff != 0. ) );
-      
-      if ( stratum.new_job )
-         stratum_gen_work( &stratum, &g_work );
-
+      // Wait for new message from server
      if ( likely( stratum_socket_full( &stratum, opt_timeout ) ) )
      {
         if ( likely( s = stratum_recv_line( &stratum ) ) )
@@ -2819,6 +2834,53 @@ static void *stratum_thread(void *userdata )
 //         stratum_disconnect( &stratum );
      }

+      report_summary_log( ( stratum_diff != stratum.job.diff )
+                       && ( stratum_diff != 0. ) );
+
+      if ( !stratum_need_reset )
+      {
+         // Is keepalive needed? Mutex would normally be required but that
+         // would block any attempt to submit a share. A share is more
+         // important even if it messes up the keepalive.
+
+         if ( opt_stratum_keepalive )
+         {
+            struct timeval now, et;
+            gettimeofday( &now, NULL );
+            // any shares submitted since last keepalive?
+            if ( last_submit_time.tv_sec > stratum_keepalive_timer.tv_sec )
+               memcpy( &stratum_keepalive_timer, &last_submit_time,
+                       sizeof (struct timeval) );
+
+            timeval_subtract( &et, &now, &stratum_keepalive_timer );
+
+            if ( et.tv_sec > stratum_keepalive_timeout )
+            {
+                double diff = stratum.job.diff * 0.5;
+                stratum_keepalive_timer = now;
+                if ( !opt_quiet )
+                   applog( LOG_BLUE,
+                           "Stratum keepalive requesting lower difficulty" );
+                stratum_suggest_difficulty( &stratum, diff );
+            }
+
+            if ( last_submit_time.tv_sec > stratum_reset_time.tv_sec )
+              timeval_subtract( &et, &now, &last_submit_time );
+            else
+              timeval_subtract( &et, &now, &stratum_reset_time );
+
+            if ( et.tv_sec > stratum_keepalive_timeout + 60 )
+            {
+               applog( LOG_NOTICE, "No shares submitted, resetting stratum connection" );
+               stratum_need_reset = true;
+               stratum_keepalive_timer = now;
+            }
+         } // stratum_keepalive
+
+         if ( stratum.new_job && !stratum_need_reset )
+            stratum_gen_work( &stratum, &g_work );
+
+      } // stratum_need_reset
   }  // loop
 out:
  return NULL;
@@ -2990,8 +3052,8 @@ static bool cpu_capability( bool display_only )
     use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
     use_sha    = cpu_has_sha    && sw_has_sha    && algo_has_sha;
     use_vaes   = cpu_has_vaes   && sw_has_vaes   && algo_has_vaes;
-     use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 ||
-                   use_sha || use_vaes );
+     use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512
+                || use_avx2 || use_sha || use_vaes );

     // Display best options
     printf( "\nStarting miner with" );
@@ -3273,6 +3335,7 @@ void parse_arg(int key, char *arg )
 			if ( strncasecmp( arg, "http://", 7 )
           && strncasecmp( arg, "https://", 8 )
           && strncasecmp( arg, "stratum+tcp://", 14 )
+           && strncasecmp( arg, "stratum+ssl://", 14 )
           && strncasecmp( arg, "stratum+tcps://", 15 ) )
         {
            fprintf(stderr, "unknown protocol -- '%s'\n", arg);
@@ -3407,7 +3470,8 @@ void parse_arg(int key, char *arg )
      break;
 	case 1021:  // cpu-priority
 		v = atoi(arg);
-		if (v < 0 || v > 5)	/* sanity check */
+      applog(LOG_NOTICE,"--cpu-priority is deprecated and will be removed from a future release");
+      if (v < 0 || v > 5)	/* sanity check */
 			show_usage_and_exit(1);
 		opt_priority = v;
 		break;
@@ -3443,14 +3507,18 @@ void parse_arg(int key, char *arg )
 		break;
 	case 1024:
 		opt_randomize = true;
-		break;
+      applog(LOG_NOTICE,"--randomize is deprecated and will be removed from a future release");
+      break;
   case 1027:  // data-file
      opt_data_file = strdup( arg );
      break;
   case 1028:  // verify
      opt_verify = true;
      break;
-	case 'V':
+   case 1029:  // stratum-keepalive
+      opt_stratum_keepalive = true;
+      break;
+   case 'V':
      display_cpu_capability();
      exit(0);
 	case 'h':
@@ -3625,6 +3693,17 @@ int main(int argc, char *argv[])
      show_usage_and_exit(1);
   }

+   if ( !opt_scantime )
+   {
+      if      ( have_stratum )  opt_scantime = 30;
+      else if ( have_longpoll ) opt_scantime = LP_SCANTIME;
+      else                      opt_scantime = 5;
+   }
+
+   if ( opt_time_limit )
+      time_limit_stop = (unsigned int)time(NULL) + opt_time_limit;
+
+
   // need to register to get algo optimizations for cpu capabilities
   // but that causes registration logs before cpu capabilities is output.
   // Would need to split register function into 2 parts. First part sets algo
@@ -3690,6 +3769,7 @@ int main(int argc, char *argv[])
   flags = CURL_GLOBAL_ALL;
   if ( !opt_benchmark )
     if ( strncasecmp( rpc_url, "https:", 6 )
+       && strncasecmp( rpc_url, "stratum+ssl://", 14 )
       && strncasecmp( rpc_url, "stratum+tcps://", 15 ) )
         flags &= ~CURL_GLOBAL_SSL;

@@ -3833,6 +3913,8 @@ int main(int argc, char *argv[])
      if ( opt_debug )
         applog(LOG_INFO,"Creating stratum thread");

+      stratum.new_job = false;  // just to make sure
+
      /* init stratum thread info */
 		stratum_thr_id = opt_n_threads + 2;
 		thr = &thr_info[stratum_thr_id];
@@ -3899,6 +3981,8 @@ int main(int argc, char *argv[])
   gettimeofday( &last_submit_time, NULL );
   memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
   memcpy( &session_start, &last_submit_time, sizeof (struct timeval) );
+   memcpy( &stratum_keepalive_timer, &last_submit_time, sizeof (struct timeval) );
+   memcpy( &stratum_reset_time, &last_submit_time, sizeof (struct timeval) );
   memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) );
   pthread_mutex_unlock( &stats_lock );

--- a/malloc-huge.c
+++ b/malloc-huge.c
@@ -0,0 +1,36 @@
+#include "malloc-huge.h"
+#include "miner.h"
+
+#define HUGEPAGE_SIZE_2M  (2 * 1024 * 1024)
+
+void *malloc_hugepages( size_t size )
+{
+#if !(defined(MAP_HUGETLB) && defined(MAP_ANON))
+//   applog( LOG_WARNING, "Huge pages not available",size);
+   return NULL;
+#else
+
+   if ( size < HUGEPAGE_MIN_ALLOC )
+   {
+//	   applog( LOG_WARNING, "Block too small for huge pages: %lu bytes",size);
+	   return NULL;
+   }
+
+   const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE_2M - 1;
+   void *p = NULL;
+   int flags =
+   #ifdef MAP_NOCORE
+                MAP_NOCORE |
+   #endif
+		          MAP_HUGETLB | MAP_ANON | MAP_PRIVATE;
+
+   // round size up to next page boundary
+   size = ( size + hugepage_mask ) & (~hugepage_mask);
+   
+   p = mmap( NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0 );
+   if ( p == MAP_FAILED )
+      p = NULL;
+   return p;
+#endif
+}
+
--- a/malloc-huge.h
+++ b/malloc-huge.h
@@ -0,0 +1,24 @@
+#if !(defined(MALLOC_HUGE__))
+#define MALLOC_HUGE__
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef __unix__
+#include <sys/mman.h>
+#endif
+
+#if defined(MAP_HUGETLB)
+
+// Minimum block size 6 MiB to use huge pages
+#define HUGEPAGE_MIN_ALLOC    (6 * 1024 * 1024)
+
+#endif
+
+// Attempt to allocate memory backed by 2 MiB pages, returns NULL on failure.
+void *malloc_hugepages( size_t size );
+
+#endif
+
--- a/miner.h
+++ b/miner.h
@@ -466,6 +466,7 @@ void stratum_disconnect(struct stratum_ctx *sctx);
 bool stratum_subscribe(struct stratum_ctx *sctx);
 bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
 bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
+bool stratum_suggest_difficulty( struct stratum_ctx *sctx, double diff );


 extern bool aes_ni_supported;
@@ -811,7 +812,7 @@ Options:\n\
                          lyra2z330     Lyra2 330 rows\n\
                          m7m           Magi (XMG)\n\
                          myr-gr        Myriad-Groestl\n\
-                          minotaur      Ringcoin (RNG)\n\
+                          minotaur\n\
                          neoscrypt     NeoScrypt(128, 2, 1)\n\
                          nist5         Nist5\n\
                          pentablake    5 x blake512\n\
@@ -823,6 +824,7 @@ Options:\n\
                          qubit         Qubit\n\
                          scrypt        scrypt(1024, 1, 1) (default)\n\
                          scrypt:N      scrypt(N, 1, 1)\n\
+                          scryptn2      scrypt(1048576, 1,1)\n\
                          sha256d       Double SHA-256\n\
                          sha256q       Quad SHA-256, Pyrite (PYE)\n\
                          sha256t       Triple SHA-256, Onecoin (OC)\n\
@@ -885,10 +887,10 @@ Options:\n\
  -T, --timeout=N       timeout for long poll and stratum (default: 300 seconds)\n\
  -s, --scantime=N      upper bound on time spent scanning current work when\n\
                          long polling is unavailable, in seconds (default: 5)\n\
-      --randomize       Randomize scan range start to reduce duplicates\n\
-  -f, --diff-factor=N   Divide req. difficulty by this factor (std is 1.0)\n\
+      --randomize       randomize scan range (deprecated)\n\
+  -f, --diff-factor=N   divide req. difficulty by this factor (std is 1.0)\n\
  -m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\
-      --hash-meter      Display thread hash rates\n\
+      --hash-meter      display thread hash rates\n\
      --coinbase-addr=ADDR  payout address for solo mining\n\
      --coinbase-sig=TEXT  data to insert in the coinbase when possible\n\
      --no-longpoll     disable long polling support\n\
@@ -909,15 +911,16 @@ Options:\n\
  -B, --background      run the miner in the background\n\
      --benchmark       run in offline benchmark mode\n\
      --cpu-affinity    set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\
-      --cpu-priority    set process priority (default: 0 idle, 2 normal to 5 highest)\n\
+      --cpu-priority    set process priority (default: 0 idle, 2 normal to 5 highest) (deprecated)\n\
  -b, --api-bind=address[:port]   IP address for the miner API, default port is 4048)\n\
-      --api-remote      Allow remote control\n\
-      --max-temp=N      Only mine if cpu temp is less than specified value (linux)\n\
-      --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\
-      --max-diff=N      Only mine if net difficulty is less than specified value\n\
+      --api-remote      allow remote control\n\
+      --max-temp=N      only mine if cpu temp is less than specified value (linux)\n\
+      --max-rate=N[KMG] only mine if net hashrate is less than specified value\n\
+      --max-diff=N      only mine if net difficulty is less than specified value\n\
  -c, --config=FILE     load a JSON-format configuration file\n\
      --data-file=FILE  path and name of data file\n\
      --verify          enable additional time consuming start up tests\n\
+      --stratum-keepalive  prevent disconnects when difficulty is too high\n\
  -V, --version         display version and CPU information and exit\n\
  -h, --help            display this help text and exit\n\
 ";
@@ -987,6 +990,7 @@ static struct option const options[] = {
        { "userpass", 1, NULL, 'O' },
        { "data-file", 1, NULL, 1027 },
        { "verify", 0, NULL, 1028 },
+        { "stratum-keepalive", 0, NULL, 1029 },
        { "version", 0, NULL, 'V' },
        { 0, 0, 0, 0 }
 };
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -272,9 +272,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

 #endif

+// Mask making
+
+// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
+// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
+
+#define mm_movmask_64( v ) \
+   _mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) )
+
+#define mm_movmask_32( v ) \
+   _mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) )


-// Diagonal blend: d = s3[3], s2[2], s1[1], s0[0] ||
+// Diagonal blend

 // Blend 4 32 bit elements from 4 vectors

@@ -284,7 +294,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
  mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \
                  _mm_blend_epi32( s1, s0, 0x1 ), 0x3 )

-#elif defined(__SSE4_1)
+#elif defined(__SSE4_1__)

 #define mm128_diagonal_32( v3, v2, v1, v0 ) \
  mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \
@@ -401,6 +411,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_rol_16( v, c ) \
   _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )

+// Limited 2 input shuffle, combines shuffle with blend. The destination low
+// half is always taken from src a, and the high half from src b.
+#define mm128_shuffle2_64( a, b, c ) \
+   _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
+                                     _mm_castsi128_pd( b ), c ) ); 
+
+#define mm128_shuffle2_32( a, b, c ) \
+   _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
+                                     _mm_castsi128_ps( b ), c ) ); 
+
+
 //
 // Rotate vector elements accross all lanes

@@ -532,9 +553,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
 #if defined(__SSSE3__)

 // Function macro with two inputs and one output, inputs are preserved.
-// Returns modified first arg.
 // Two input functions are not available without SSSE3. Use procedure
-// belowe instead.
+// macros below instead.

 #define mm128_shufl2r_64( v1, v2 )     _mm_alignr_epi8( v2, v1, 8 )
 #define mm128_shufl2l_64( v1, v2 )     _mm_alignr_epi8( v1, v2, 8 )
@@ -548,12 +568,11 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
 #define mm128_shufl2r_8( v1, v2 )      _mm_alignr_epi8( v2, v1, 8 )
 #define mm128_shufl2l_8( v1, v2 )      _mm_alignr_epi8( v1, v2, 8 )

-// Procedure macroswith 2 inputs and 2 outputs, inputs are destroyed.
-// Returns both modified args in place.
+// Procedure macros with 2 inputs and 2 outputs, inputs args are overwritten.

 // These macros retain the vrol/vror name for now to avoid
 // confusion with the shufl2r/shuffle2l function macros above.
-// These may be renamed to something like shufl2r2 for 2 1nputs and
+// These may be renamed to something like shufl2r2 for 2 nputs and
 // 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.

 #define mm128_vror256_64( v1, v2 ) \
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -233,6 +233,18 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )

 #endif

+// Mask making
+
+// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
+// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
+
+#define mm256_movmask_64( v ) \
+   _mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
+
+#define mm256_movmask_32( v ) \
+   _mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) )
+
+
 // Diagonal blending

 // Blend 4 64 bit elements from 4 vectors
@@ -405,6 +417,16 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //
 // Rotate elements within each 128 bit lane of 256 bit vector.

+// Limited 2 input shuffle
+#define mm256_shuffle2_64( a, b, c ) \
+   _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( a ), \
+                                           _mm256_castsi256_pd( b ), c ) ); 
+
+#define mm256_shuffle2_32( a, b, c ) \
+   _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
+                                           _mm256_castsi256_ps( b ), c ) ); 
+
+
 #define mm256_swap128_64( v )  _mm256_shuffle_epi32( v, 0x4e )
 #define mm256_shuflr128_64 mm256_swap128_64
 #define mm256_shufll128_64 mm256_swap128_64
@@ -420,8 +442,14 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 #define mm256_shuflr64_32 mm256_swap64_32
 #define mm256_shufll64_32 mm256_swap64_32

-//
-// Swap bytes in vector elements, endian bswap.
+// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
+// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
+// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
+// needed for a shuffle that crosses 128 bit lanes. BSWAP doesn't therefore the
+// AVX2 version will work here. The bswap control vector is coded to work
+// with both versions, bit 4 is ignored in AVX2. 
+
+// Reverse byte order in elements, endian bswap.
 #define mm256_bswap_64( v ) \
   _mm256_shuffle_epi8( v, \
         m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
@@ -485,20 +513,6 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
   v2 = _mm256_xor_si256( v1, v2 ); \
   v1 = _mm256_xor_si256( v1, v2 );

-#define mm256_vror512_128( v1, v2 ) \
-do { \
-   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
-   v1 = _mm256_permute2x128( v2, v1, 0x21 ); \
-   v2 = t; \
-} while(0)
-
-#define mm256_vrol512_128( v1, v2 ) \
-do { \
-   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
-   v2 = _mm256_permute2x128( v2, v1, 0x21 ); \
-   v1 = t; \
-} while(0)
-
 #endif // __AVX2__
 #endif // SIMD_256_H__

--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -15,13 +15,14 @@

 //  AVX512 intrinsics have a few changes from previous conventions.
 //
-//    cmp instruction now returns a bitmask isnstead of a vector mask.
+//    cmp instruction now returns a bitmask instead of a vector mask.
 //    This eliminates the need for the blendv instruction.
 //
 //    The new rotate instructions require the count to be an 8 bit
 //    immediate value only. Compilation fails if a variable is used.
 //    The documentation is the same as for shift and it works with
-//    variables.
+//    variables. The inconsistency is likely due to compiler optimizations
+//    that can eliminate the variable in some instances.
 //
 //    _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
 //    usually shuffles accross all lanes.
@@ -317,6 +318,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 // AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
 // elements and can be called directly. But they only accept immediate 8
 // for control arg. 
+// The workaround is a fraud, just a fluke of the compiler's optimizer.
+// It fails without -O3. The compiler seems to unroll shift loops, eliminating
+// the variable control, better than rotate loops. 
 //
 // _mm512_rol_epi64,  _mm512_ror_epi64,  _mm512_rol_epi32,  _mm512_ror_epi32
 // _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
@@ -429,21 +433,9 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
  casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
 } while(0)

-//
-// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
-//

-// rename plan change ror to vror for Vector ROtate Right,
-// and vrol for Vector ROtate Left, not to be confused with
-//variable rotate rorv, rolv,
-// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate
-// operation. 1xNN notaion ia also removed and replaced with simpler NN.
-// Swap will still have its own mnemonic and will be aliased as both
-// left and right shuffles.
-
-// Shift elements right or left in 512 bit vector, filling with zeros.
-// Multiple element shifts can be combined into a single larger
-// element shift.
+// Cross-lane shuffles implementing rotate & shift of elements within a vector.
+//

 #define mm512_shiftr_256( v ) \
  _mm512_alignr_epi64( _mm512_setzero, v, 4 )
@@ -493,7 +485,7 @@ static inline __m512i mm512_shufll_32( const __m512i v )
 static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
 { return _mm512_alignr_epi64( v, v, n ); }

-static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
+static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 { return _mm512_alignr_epi32( v, v, n ); }

 #define mm512_shuflr_16( v ) \
@@ -529,7 +521,7 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
 // 128 bit lane shift is handled by bslli bsrli.

 // Swap hi & lo 128 bits in each 256 bit lane
-#define mm512_swap256_128( v )   _mm512_permutex_epi64( v, 0x4e )
+#define mm512_swap256_128( v )      _mm512_permutex_epi64( v, 0x4e )
 #define mm512_shuflr256_128 mm512_swap256_128
 #define mm512_shufll256_128 mm512_swap256_128

@@ -581,8 +573,19 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
                     0x0e0d0c0b0a090807, 0x060504030201001f ) )

 //
-// Shuffle-roate elements within 128 bit lanes of 512 bit vector.
+// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
 
+// Limited 2 input, 1 output shuffle, combines shuffle with blend.
+// Like most shuffles it's limited to 128 bit lanes and like some shuffles
+// destination elements must come from a specific source. 
+#define mm512_shuffle2_64( a, b, c ) \
+   _mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
+                                           _mm512_castsi512_pd( b ), c ) ); 
+
+#define mm512_shuffle2_32( a, b, c ) \
+   _mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( a ), \
+                                           _mm512_castsi512_ps( b ), c ) ); 
+
 // Swap 64 bits in each 128 bit lane
 #define mm512_swap128_64( v )   _mm512_shuffle_epi32( v, 0x4e )
 #define mm512_shuflr128_64  mm512_swap128_64
@@ -610,11 +613,8 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 // shufl2r is 2 input ...
 // Drop macros? They can easilly be rebuilt using shufl2 functions

-// Shuffle concatenated { v1, v2 ) right or left by 256 bits and return
-// rotated v1 
-// visually confusing for shif2r because of arg order. First arg is always
-// the target for modification, either update by reference or by function
-// return.
+// 2 input, 1 output
+// Rotate concatenated { v1, v2 ) right or left and return v1. 
 #define mm512_shufl2r_256( v1, v2 )    _mm512_alignr_epi64( v2, v1, 4 )
 #define mm512_shufl2l_256( v1, v2 )    _mm512_alignr_epi64( v1, v2, 4 )

@@ -627,76 +627,5 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 #define mm512_shufl2r_32( v1, v2 )     _mm512_alignr_epi32( v2, v1, 1 )
 #define mm512_shufl2l_32( v1, v2 )     _mm512_alignr_epi32( v1, v2, 1 )

-// Rotate elements from 2 512 bit vectors in place, source arguments
-//  are overwritten.
-
-#define mm512_swap1024_512( v1, v2 ) \
-   v1 = _mm512_xor_si512( v1, v2 ); \
-   v2 = _mm512_xor_si512( v1, v2 ); \
-   v1 = _mm512_xor_si512( v1, v2 );
-#define mm512_shufl2l_512 mm512_swap1024_512 \
-#define mm512_shufl2r_512 mm512_swap1024_512 \
-
-// Deprecated, will be removed. Use shufl2 functions instead. Leave them as is
-// for now.
-//  Rotate elements from 2 512 bit vectors in place, both source arguments
-//  are updated.
-
-#define mm512_vror1024_256( v1, v2 ) \
-do { \
-   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
-   v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
-   v2 = t; \
-} while(0)
-
-#define mm512_vrol1024_256( v1, v2 ) \
-do { \
-   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
-   v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
-   v1 = t; \
-} while(0)
-
-#define mm512_vror1024_128( v1, v2 ) \
-do { \
-   __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
-   v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
-   v2 = t; \
-} while(0)
-
-#define mm512_vrol1024_128( v1, v2 ) \
-do { \
-   __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
-   v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
-   v1 = t; \
-} while(0)
-
-#define mm512_vror1024_64( v1, v2 ) \
-do { \
-   __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
-   v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
-   v2 = t; \
-} while(0)
-
-#define mm512_vrol1024_64( v1, v2 ) \
-do { \
-   __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
-   v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
-   v1 = t; \
-} while(0)
-
-#define mm512_vror1024_32( v1, v2 ) \
-do { \
-   __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
-   v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
-   v2 = t; \
-} while(0)
-
-#define mm512_vrol1024_32( v1, v2 ) \
-do { \
-   __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
-   v2 = _mm512_alignr_epi32( v2, v1, 15 ); \
-   v1 = t; \
-} while(0)
-
 #endif // AVX512
 #endif // SIMD_512_H__
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -209,7 +209,7 @@ static inline void cpu_getname(char *outbuf, size_t maxsz)
 {
   memset(outbuf, 0, maxsz);
 #ifdef WIN32
-   char brand[0xC0] = { 0 };
+   char brand[256] = { 0 };
   int output[4] = { 0 }, ext;
   cpuid(0x80000000, output);
   ext = output[0];
@@ -502,6 +502,28 @@ static inline bool has_vaes()
 #endif
 }

+static inline bool has_vbmi()
+{
+#ifdef __arm__
+    return false;
+#else
+    int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, cpu_info );
+    return cpu_info[ ECX_Reg ] & AVX512VBMI_Flag;
+#endif
+}
+
+static inline bool has_vbmi2()
+{
+#ifdef __arm__
+    return false;
+#else
+    int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, cpu_info );
+    return cpu_info[ ECX_Reg ] & AVX512VBMI2_Flag;
+#endif
+}
+
 // AMD only
 static inline bool has_xop()
 {
--- a/util.c
+++ b/util.c
@@ -1542,11 +1542,20 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
 		free(sctx->url);
 		sctx->url = strdup(url);
 	}
-	free(sctx->curl_url);
+
+   free(sctx->curl_url);
 	sctx->curl_url = (char*) malloc(strlen(url));
-	sprintf( sctx->curl_url, "http%s", strstr( url, "s://" ) 
-                              ? strstr( url, "s://" )
-                              : strstr (url, "://"  ) );
+
+   // replace the stratum protocol prefix with http, https for ssl
+   sprintf( sctx->curl_url, "%s%s",
+            ( strstr( url, "s://" ) || strstr( url, "ssl://" ) )
+               ? "https" : "http", strstr( url, "://" ) );
+
+
+
+//   sprintf( sctx->curl_url, "http%s", strstr( url, "s://" ) 
+//                              ? strstr( url, "s://" )
+//                              : strstr (url, "://"  ) );

 	if (opt_protocol)
 		curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
@@ -1658,7 +1667,7 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i
 	pthread_mutex_unlock(&sctx->work_lock);

   if ( !opt_quiet ) /* pool dynamic change */
-      applog( LOG_INFO, "Stratum extranonce1= %s, extranonce2 size= %d",
+      applog( LOG_INFO, "Stratum extranonce1 0x%s, extranonce2 size %d",
         xnonce1, xn2_size);

 	return true;
@@ -1846,6 +1855,25 @@ out:
 	return ret;
 }

+bool stratum_suggest_difficulty( struct stratum_ctx *sctx, double diff )
+{
+   char *s;
+   s = (char*) malloc( 80 );
+   bool rc = true;
+
+   // response is handled seperately, what ID?
+   sprintf( s, "{\"id\": 1, \"method\": \"mining.suggest_difficulty\", \"params\": [\"%f\"]}", diff );
+   if ( !stratum_send_line( sctx, s ) )
+   {
+      applog(LOG_WARNING,"stratum.suggest_difficulty send failed");
+      rc = false;
+   } 
+   free ( s );
+   return rc;
+}
+
+
+
 /**
 * Extract bloc height     L H... here len=3, height=0x1333e8
 * "...0000000000ffffffff2703e83313062f503253482f043d61105408"
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -16,8 +16,8 @@ export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
 export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
 # used by GCC
 export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
-# support for Windows CPU groups, AES sometimes not included in -march
-export DEFAULT_CFLAGS="-O3 -maes -Wall -D_WIN32_WINNT=0x0601"
+# Support for Windows 7 CPU groups, AES sometimes not included in -march
+export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
 export DEFAULT_CFLAGS_OLD="-O3 -Wall"

 # make link to local gmp header file.
@@ -26,8 +26,8 @@ ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
 # make release directory and copy selected DLLs.

 rm -rf release > /dev/null
-
 mkdir release
+
 cp README.txt release/
 cp README.md release/
 cp RELEASE_NOTES release/
@@ -45,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
 ./clean-all.sh || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="$DEFAULT_CFLAGS -march=icelake-client" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
@@ -53,7 +53,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
 # AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake 
 make clean || echo clean
 rm -f config.status
-CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512.exe
@@ -61,7 +61,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
 # AVX2 SHA VAES: Intel Alderlake, AMD Zen3
 make clean || echo done
 rm -f config.status
-CFLAGS="$DEFAULT_CFLAGS -mavx2 -msha -mvaes" ./configure $CONFIGURE_ARGS
+CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
@@ -69,7 +69,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
 # AVX2 AES SHA: AMD Zen1
 make clean || echo clean
 rm -f config.status
-CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2-sha.exe
@@ -77,7 +77,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe
 # AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake
 make clean || echo clean
 rm -f config.status
-CFLAGS="$DEFAULT_CFLAGS -march=core-avx2" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2.exe
@@ -85,7 +85,7 @@ mv cpuminer.exe release/cpuminer-avx2.exe
 # AVX AES: Intel Sandybridge, Ivybridge
 make clean || echo clean
 rm -f config.status
-CFLAGS="$DEFAULT_CFLAGS_OLD -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS 
+CFLAGS="-march=corei7-avx -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS 
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx.exe
@@ -93,7 +93,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
 # SSE4.2 AES: Intel Westmere
 make clean || echo clean
 rm -f config.status
-CFLAGS="$DEFAULT_CFLAGS_OLD -march=westmere -maes" ./configure $CONFIGURE_ARGS
+CFLAGS="-march=westmere -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-aes-sse42.exe
@@ -118,9 +118,16 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
 # Generic SSE2
 make clean || echo clean
 rm -f config.status
-CFLAGS="$DEFAULT_CFLAGS_OLD -msse2" ./configure $CONFIGURE_ARGS
+CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-sse2.exe
 make clean || echo clean

+# Native with CPU groups ennabled
+make clean || echo clean
+rm -f config.status
+CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
+make -j 8
+strip -s cpuminer.exe
+
Author	SHA1	Message	Date
Jay D Dee	f552f2b1e8	v3.19.9	2022-07-10 11:04:00 -04:00
Jay D Dee	26b8927632	v3.19.8	2022-05-27 18:12:30 -04:00
Jay D Dee	db76d3865f	v3.19.7	2022-04-02 12:44:57 -04:00
Jay D Dee	5b678d2481	v3.19.6	2022-02-21 23:14:24 -05:00
Jay D Dee	90137b391e	v3.19.5	2022-01-30 20:59:54 -05:00
Jay D Dee	8727d79182	v3.19.4	2022-01-12 21:08:25 -05:00
Jay D Dee	17ccbc328f	v3.19.3	2022-01-07 12:07:38 -05:00