From 2cd1507c2e59c592f40be02d723a974644357808 Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Wed, 29 Sep 2021 17:31:16 -0400
Subject: [PATCH] v3.18.0

---
 Makefile.am | 3 +
 RELEASE_NOTES | 31 +
 algo-gate-api.h | 4 +
 algo/argon2/argon2d/blake2/blamka-round-opt.h | 18 +-
 algo/bmw/bmw512-hash-4way.c | 471 +-
 algo/cubehash/cube-hash-2way.c | 246 +-
 algo/cubehash/cube-hash-2way.h | 44 +-
 algo/cubehash/cubehash_sse2.c | 12 +-
 algo/groestl/aes_ni/hash-groestl.h | 3 +-
 algo/groestl/aes_ni/hash-groestl256.h | 3 +-
 algo/groestl/myr-groestl.c | 7 +-
 algo/hamsi/hamsi-hash-4way.c | 218 +-
 algo/hodl/hodl-wolf.c | 1 +
 algo/keccak/keccak-gate.c | 1 +
 algo/keccak/keccak-hash-4way.c | 14 +-
 algo/lyra2/allium-4way.c | 53 +-
 algo/lyra2/sponge.h | 30 +-
 algo/m7m/m7m.c | 24 +-
 algo/ripemd/lbry.c | 27 +-
 algo/scrypt/neoscrypt.c | 8 +-
 algo/scrypt/scrypt-core-4way.c | 3981 +++++++++++++++++
 algo/scrypt/scrypt-core-4way.h | 70 +
 algo/scrypt/scrypt-core-ref.c | 206 +
 algo/scrypt/scrypt.c | 1476 ++++--
 algo/sha/hmac-sha256-hash.c | 50 +-
 algo/sha/hmac-sha256-hash.h | 8 +-
 algo/sha/sha-hash-4way.h | 14 +-
 algo/sha/sha2.c | 8 +-
 algo/sha/sha256-hash-2way-ni.c | 348 +-
 algo/sha/sha256-hash-4way.c | 473 +-
 algo/sha/sha256-hash-opt.c | 192 +-
 algo/sha/sha256-hash-opt.h | 18 -
 algo/sha/sha256-hash.c | 142 +
 algo/sha/sha256-hash.h | 56 +
 algo/sha/sha256d-4way.c | 31 +-
 algo/sha/sha256d.c | 8 +
 algo/sha/sha256d.h | 7 +
 algo/sha/sha256q.c | 30 +-
 algo/sha/sha256t-4way.c | 23 +-
 algo/sha/sha256t.c | 118 +-
 algo/sha/sha512-hash-4way.c | 150 +-
 algo/sha/sph_sha2.c | 210 +-
 algo/sha/sph_sha2.h | 7 +
 algo/shavite/shavite-hash-2way.c | 52 +-
 algo/shavite/shavite-hash-4way.c | 54 +-
 algo/shavite/sph-shavite-aesni.c | 52 +-
 algo/skein/skein-4way.c | 21 +-
 algo/skein/skein.c | 13 +-
 algo/verthash/Verthash.c | 8 +-
 algo/verthash/verthash-gate.c | 4 +-
 algo/whirlpool/whirlpool.c | 2 +-
 algo/x16/x16r-4way.c | 143 +-
 algo/x16/x16r-gate.c | 1 +
 algo/x16/x16r-gate.h | 5 +-
 algo/x16/x21s-4way.c | 22 +-
 algo/x16/x21s.c | 8 +-
 algo/x17/x17-4way.c | 9 +-
 algo/x22/x22i-4way.c | 58 +-
 algo/x22/x22i.c | 6 +-
 algo/x22/x25x-4way.c | 56 +-
 algo/x22/x25x.c | 8 +-
 algo/yespower/crypto/blake2b-yp.c | 8 +-
 algo/yespower/yescrypt-r8g.c | 4 +-
 algo/yespower/yespower-gate.c | 13 +-
 algo/yespower/yespower-opt.c | 19 +-
 algo/yespower/yespower.h | 6 +-
 build-allarch.sh | 2 +-
 configure | 20 +-
 configure.ac | 2 +-
 cpu-miner.c | 136 +-
 miner.h | 37 +-
 simd-utils.h | 2 +
 simd-utils/intrlv.h | 2 +-
 simd-utils/simd-128.h | 177 +-
 simd-utils/simd-256.h | 159 +-
 simd-utils/simd-512.h | 204 +-
 simd-utils/simd-64.h | 10 +-
 simd-utils/simd-int.h | 13 +-
 sysinfos.c | 2 +-
 util.c | 60 +-
 80 files changed, 8145 insertions(+), 2097 deletions(-)
 create mode 100644 algo/scrypt/scrypt-core-4way.c
 create mode 100644 algo/scrypt/scrypt-core-4way.h
 create mode 100644 algo/scrypt/scrypt-core-ref.c
 delete mode 100644 algo/sha/sha256-hash-opt.h
 create mode 100644 algo/sha/sha256-hash.c
 create mode 100644 algo/sha/sha256-hash.h
 create mode 100644 algo/sha/sha256d.c
 create mode 100644 algo/sha/sha256d.h

diff --git a/Makefile.am b/Makefile.am
index a4adc3b..a4163b3 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -158,7 +158,9 @@ cpuminer_SOURCES = \
   algo/ripemd/lbry.c \
   algo/ripemd/lbry-4way.c \
   algo/scrypt/scrypt.c \
+  algo/scrypt/scrypt-core-4way.c \
   algo/scrypt/neoscrypt.c \
+  algo/sha/sha256-hash.c \
   algo/sha/sph_sha2.c \
   algo/sha/sph_sha2big.c \
   algo/sha/sha256-hash-4way.c \
@@ -167,6 +169,7 @@ cpuminer_SOURCES = \
   algo/sha/sha256-hash-2way-ni.c \
   algo/sha/hmac-sha256-hash.c \
   algo/sha/hmac-sha256-hash-4way.c \
+  algo/sha/sha256d.c \
   algo/sha/sha2.c \
   algo/sha/sha256t-gate.c \
   algo/sha/sha256t-4way.c \
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 3f6b080..056491f 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,6 +65,37 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.18.0
+
+Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
+ - AVX512 & SHA support for SHA256, AVX512 has priority,
+ - up to 50% increase in hashrate,
+ - memory requirements reduced 30-60% depending on CPU architecture,
+ - memory usage displayed at startup,
+ - scrypt, default N=1024 (LTC), will likely perform slower.
+
+Improved stale share detection and handling for Scrypt with large N factor:
+ - abort and discard partially computed hash when new work is detected,
+ - quicker response to new job, less time wasted mining stale job.
+
+Improved stale share handling for all algorithms:
+ - report possible stale share when new work received with a previously
+   submitted share still pending,
+ - when new work is detected report the submission of an already completed,
+   otherwise valid, but likely stale, share,
+ - fixed incorrect block height in stale share log.
+
+Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2.
+
+When stratum disconnects miner threads go to idle until reconnected.
+
+Colour changes to some logs.
+
+Some low level function name changes for clarity and consistency.
+
+The reference hashrate in the summary log and the benchmark total hashrate
+are now the mean hashrate for the session.
+
 v3.17.1
 
 Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES.
diff --git a/algo-gate-api.h b/algo-gate-api.h
index 8d61d26..56594d5 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -1,3 +1,6 @@
+#ifndef __ALGO_GATE_API_H__
+#define __ALGO_GATE_API_H__ 1
+
 #include
 #include
 #include
@@ -319,3 +322,4 @@ void exec_hash_function( int algo, void *output, const void *pdata );
 // algo name if valid alias, NULL if invalid alias or algo.
void get_algo_alias( char **algo_or_alias ); +#endif diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h index 8156331..809961c 100644 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h @@ -328,7 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { #include -#define ror64(x, n) _mm512_ror_epi64((x), (n)) +#define ROR64(x, n) _mm512_ror_epi64((x), (n)) static __m512i muladd(__m512i x, __m512i y) { @@ -344,8 +344,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ror64(D0, 32); \ - D1 = ror64(D1, 32); \ + D0 = ROR64(D0, 32); \ + D1 = ROR64(D1, 32); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -353,8 +353,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ror64(B0, 24); \ - B1 = ror64(B1, 24); \ + B0 = ROR64(B0, 24); \ + B1 = ROR64(B1, 24); \ } while ((void)0, 0) #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -365,8 +365,8 @@ static __m512i muladd(__m512i x, __m512i y) D0 = _mm512_xor_si512(D0, A0); \ D1 = _mm512_xor_si512(D1, A1); \ \ - D0 = ror64(D0, 16); \ - D1 = ror64(D1, 16); \ + D0 = ROR64(D0, 16); \ + D1 = ROR64(D1, 16); \ \ C0 = muladd(C0, D0); \ C1 = muladd(C1, D1); \ @@ -374,8 +374,8 @@ static __m512i muladd(__m512i x, __m512i y) B0 = _mm512_xor_si512(B0, C0); \ B1 = _mm512_xor_si512(B1, C1); \ \ - B0 = ror64(B0, 63); \ - B1 = ror64(B1, 63); \ + B0 = ROR64(B0, 63); \ + B1 = ROR64(B1, 63); \ } while ((void)0, 0) #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index 3587cc4..9ab4f89 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -594,22 +594,15 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) #define rb6(x) mm256_rol_64( x, 43 ) #define rb7(x) mm256_rol_64( x, 53 ) -#define rol_off_64( M, j, off ) \ - mm256_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) +#define rol_off_64( M, j ) \ + mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 ) -#define add_elt_b( M, H, j ) \ - _mm256_xor_si256( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \ - rol_off_64( M, j, 3 ) ), \ - rol_off_64( M, j, 10 ) ), \ - _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) +#define add_elt_b( mj0, mj3, mj10, h, K ) \ + _mm256_xor_si256( h, _mm256_add_epi64( K, \ + _mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) ) - -#define expand1b( qt, M, H, i ) \ - _mm256_add_epi64( mm256_add4_64( \ +#define expand1_b( qt, i ) \ + mm256_add4_64( \ mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \ sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \ mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \ @@ -617,11 +610,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \ sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \ mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \ - sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) + sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ) -#define expand2b( qt, M, H, i) \ - _mm256_add_epi64( mm256_add4_64( \ +#define expand2_b( qt, i) \ + mm256_add4_64( \ mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \ qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \ mm256_add4_64( qt[ (i)-12 ], rb3( qt[ 
(i)-11 ] ), \ @@ -629,159 +621,98 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst ) mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \ qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \ mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \ - sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \ - add_elt_b( M, H, (i)-16 ) ) - - + sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ) #define Wb0 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \ + _mm256_add_epi64( mh[13], mh[14] ) ) #define Wb1 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_xor_si256( M[11], H[11] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \ + _mm256_sub_epi64( mh[14], mh[15] ) ) #define Wb2 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \ + _mm256_sub_epi64( mh[12], mh[15] ) ) #define Wb3 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \ + _mm256_sub_epi64( mh[10], \ + mh[13] ) ) #define Wb4 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \ + _mm256_add_epi64( mh[11], mh[14] ) ) #define Wb5 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_xor_si256( M[10], H[10] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \ + _mm256_sub_epi64( mh[12], mh[15] ) ) #define Wb6 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \ + _mm256_sub_epi64( mh[11], mh[13] ) ) #define Wb7 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \ + _mm256_add_epi64( mh[12], mh[14] ) ) #define Wb8 \ 
_mm256_add_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[13], mh[15] ) ) #define Wb9 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \ - _mm256_xor_si256( M[14], H[14] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 7], mh[14] ) ) #define Wb10 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 1], H[ 1] ) ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \ - _mm256_xor_si256( M[15], H[15] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \ + _mm256_sub_epi64( mh[ 7], mh[15] ) ) #define Wb11 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \ - _mm256_xor_si256( M[ 0], H[ 0] ) ), \ - _mm256_xor_si256( M[ 2], H[ 2] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \ - _mm256_xor_si256( M[ 9], H[ 9] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \ + _mm256_sub_epi64( mh[ 5], mh[ 9] ) ) #define Wb12 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \ - _mm256_xor_si256( M[ 3], H[ 3] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \ - _mm256_xor_si256( M[10], H[10] ) ) ) + _mm256_sub_epi64( _mm256_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 9], mh[10] ) ) #define Wb13 \ _mm256_add_epi64( \ - _mm256_add_epi64( \ - _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \ - _mm256_xor_si256( M[ 4], H[ 4] ) ), \ - _mm256_xor_si256( M[ 7], H[ 7] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \ - _mm256_xor_si256( M[11], H[11] ) ) ) + _mm256_add_epi64( _mm256_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \ + _mm256_add_epi64( mh[10], mh[11] ) ) #define Wb14 \ _mm256_sub_epi64( \ - _mm256_add_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \ - _mm256_xor_si256( M[ 5], H[ 5] ) ), \ - _mm256_xor_si256( M[ 8], H[ 8] ) ), \ - _mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \ - _mm256_xor_si256( M[12], H[12] ) ) ) + _mm256_add_epi64( _mm256_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \ + _mm256_add_epi64( mh[11], mh[12] ) ) #define Wb15 \ _mm256_sub_epi64( \ - _mm256_sub_epi64( \ - _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \ - _mm256_xor_si256( M[ 4], H[4] ) ), \ - _mm256_xor_si256( M[ 6], H[ 6] ) ), \ - _mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \ - _mm256_xor_si256( M[13], H[13] ) ) ) + _mm256_sub_epi64( _mm256_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \ + _mm256_sub_epi64( mh[ 9], mh[13] ) ) void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) { __m256i qt[32], xl, xh; + __m256i mh[16]; + int i; + + for ( i = 0; i < 16; i++ ) + mh[i] = _mm256_xor_si256( M[i], H[i] ); qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] ); qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] ); @@ -799,22 +730,60 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) qt[13] = 
_mm256_add_epi64( sb3( Wb13), H[14] ); qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] ); qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] ); - qt[16] = expand1b( qt, M, H, 16 ); - qt[17] = expand1b( qt, M, H, 17 ); - qt[18] = expand2b( qt, M, H, 18 ); - qt[19] = expand2b( qt, M, H, 19 ); - qt[20] = expand2b( qt, M, H, 20 ); - qt[21] = expand2b( qt, M, H, 21 ); - qt[22] = expand2b( qt, M, H, 22 ); - qt[23] = expand2b( qt, M, H, 23 ); - qt[24] = expand2b( qt, M, H, 24 ); - qt[25] = expand2b( qt, M, H, 25 ); - qt[26] = expand2b( qt, M, H, 26 ); - qt[27] = expand2b( qt, M, H, 27 ); - qt[28] = expand2b( qt, M, H, 28 ); - qt[29] = expand2b( qt, M, H, 29 ); - qt[30] = expand2b( qt, M, H, 30 ); - qt[31] = expand2b( qt, M, H, 31 ); + + __m256i mj[16]; + for ( i = 0; i < 16; i++ ) + mj[i] = rol_off_64( M, i ); + + qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], + (const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) ); + qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], + (const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) ); + qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], + (const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) ); + qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], + (const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) ); + qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], + (const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) ); + qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], + (const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) ); + qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], + (const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) ); + qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], + (const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) ); + qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], + (const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) ); + qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], + (const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) ); + qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], + (const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) ); + qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], + (const __m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) ); + qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], + (const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) ); + qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], + (const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) ); + qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], + (const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) ); + qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], + (const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) ); + + qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) ); + qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) ); + qt[18] = _mm256_add_epi64( qt[18], expand2_b( qt, 18 ) ); + qt[19] = _mm256_add_epi64( qt[19], expand2_b( qt, 19 ) ); + qt[20] = _mm256_add_epi64( qt[20], expand2_b( qt, 20 ) ); + qt[21] = _mm256_add_epi64( qt[21], expand2_b( qt, 21 ) ); + qt[22] = _mm256_add_epi64( qt[22], expand2_b( qt, 22 ) ); + qt[23] = _mm256_add_epi64( qt[23], expand2_b( qt, 23 ) ); + qt[24] = _mm256_add_epi64( qt[24], expand2_b( qt, 24 ) ); + qt[25] = _mm256_add_epi64( qt[25], expand2_b( qt, 25 ) ); + qt[26] = _mm256_add_epi64( qt[26], expand2_b( qt, 26 ) ); + qt[27] = _mm256_add_epi64( qt[27], expand2_b( qt, 27 ) ); + qt[28] = _mm256_add_epi64( qt[28], expand2_b( qt, 28 ) ); + qt[29] = 
_mm256_add_epi64( qt[29], expand2_b( qt, 29 ) ); + qt[30] = _mm256_add_epi64( qt[30], expand2_b( qt, 30 ) ); + qt[31] = _mm256_add_epi64( qt[31], expand2_b( qt, 31 ) ); xl = _mm256_xor_si256( mm256_xor4( qt[16], qt[17], qt[18], qt[19] ), @@ -823,7 +792,6 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) mm256_xor4( qt[24], qt[25], qt[26], qt[27] ), mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); - #define DH1L( m, sl, sr, a, b, c ) \ _mm256_add_epi64( \ _mm256_xor_si256( M[m], \ @@ -1066,21 +1034,15 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #define r8b6(x) mm512_rol_64( x, 43 ) #define r8b7(x) mm512_rol_64( x, 53 ) -#define rol8w_off_64( M, j, off ) \ - mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \ - ( ( (j) + (off) ) & 0xF ) + 1 ) +#define rol8w_off_64( M, j ) \ + mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 ) -#define add_elt_b8( M, H, j ) \ - _mm512_xor_si512( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \ - rol8w_off_64( M, j, 3 ) ), \ - rol8w_off_64( M, j, 10 ) ), \ - _mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) +#define add_elt_b8( mj0, mj3, mj10, h, K ) \ + _mm512_xor_si512( h, _mm512_add_epi64( K, \ + _mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) ) -#define expand1b8( qt, M, H, i ) \ - _mm512_add_epi64( mm512_add4_64( \ +#define expand1_b8( qt, i ) \ + mm512_add4_64( \ mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \ s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \ mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \ @@ -1088,11 +1050,10 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \ s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \ mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \ - s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \ - add_elt_b8( M, H, (i)-16 ) ) + s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ) -#define expand2b8( qt, M, H, i) \ - _mm512_add_epi64( mm512_add4_64( \ +#define expand2_b8( qt, i) \ + mm512_add4_64( \ mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \ qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \ mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \ @@ -1100,157 +1061,97 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \ qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \ mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \ - s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \ - add_elt_b8( M, H, (i)-16 ) ) + s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ) #define W8b0 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_xor_si512( M[10], H[10] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 5], mh[ 7] ), mh[10] ), \ + _mm512_add_epi64( mh[13], mh[14] ) ) #define W8b1 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_xor_si512( M[11], H[11] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 6], mh[ 8] ), mh[11] ), \ + _mm512_sub_epi64( mh[14], mh[15] ) ) #define W8b2 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( 
_mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 0], mh[ 7] ), mh[ 9] ), \ + _mm512_sub_epi64( mh[12], mh[15] ) ) #define W8b3 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 1], H[ 1] ) ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 1] ), mh[ 8] ), \ + _mm512_sub_epi64( mh[10], mh[13] ) ) #define W8b4 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 1], mh[ 2] ), mh[ 9] ), \ + _mm512_add_epi64( mh[11], mh[14] ) ) #define W8b5 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_xor_si512( M[10], H[10] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 2] ), mh[10] ), \ + _mm512_sub_epi64( mh[12], mh[15] ) ) #define W8b6 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \ - _mm512_xor_si512( M[ 0], H[ 0] ) ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 4], mh[ 0] ), mh[ 3] ), \ + _mm512_sub_epi64( mh[11], mh[13] ) ) #define W8b7 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 1], mh[ 4] ), mh[ 5] ), \ + _mm512_add_epi64( mh[12], mh[14] ) ) #define W8b8 \ _mm512_add_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 2], mh[ 5] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[13], mh[15] ) ) #define W8b9 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \ - _mm512_xor_si512( M[14], H[14] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 0], mh[ 3] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 7], mh[14] ) ) #define W8b10 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \ - _mm512_xor_si512( M[ 1], H[ 1] ) ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \ - _mm512_xor_si512( M[15], H[15] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 1] ), mh[ 4] ), \ + _mm512_sub_epi64( mh[ 7], mh[15] ) ) #define W8b11 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - 
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \ - _mm512_xor_si512( M[ 0], H[ 0] ) ), \ - _mm512_xor_si512( M[ 2], H[ 2] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \ - _mm512_xor_si512( M[ 9], H[ 9] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[ 8], mh[ 0] ), mh[ 2] ), \ + _mm512_sub_epi64( mh[ 5], mh[ 9] ) ) #define W8b12 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \ - _mm512_xor_si512( M[ 3], H[ 3] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \ - _mm512_xor_si512( M[10], H[10] ) ) ) + _mm512_sub_epi64( _mm512_add_epi64( mh[ 1], mh[ 3] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 9], mh[10] ) ) #define W8b13 \ _mm512_add_epi64( \ - _mm512_add_epi64( \ - _mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \ - _mm512_xor_si512( M[ 4], H[ 4] ) ), \ - _mm512_xor_si512( M[ 7], H[ 7] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \ - _mm512_xor_si512( M[11], H[11] ) ) ) + _mm512_add_epi64( _mm512_add_epi64( mh[ 2], mh[ 4] ), mh[ 7] ), \ + _mm512_add_epi64( mh[10], mh[11] ) ) #define W8b14 \ _mm512_sub_epi64( \ - _mm512_add_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \ - _mm512_xor_si512( M[ 5], H[ 5] ) ), \ - _mm512_xor_si512( M[ 8], H[ 8] ) ), \ - _mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \ - _mm512_xor_si512( M[12], H[12] ) ) ) + _mm512_add_epi64( _mm512_sub_epi64( mh[ 3], mh[ 5] ), mh[ 8] ), \ + _mm512_add_epi64( mh[11], mh[12] ) ) #define W8b15 \ _mm512_sub_epi64( \ - _mm512_sub_epi64( \ - _mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \ - _mm512_xor_si512( M[ 4], H[4] ) ), \ - _mm512_xor_si512( M[ 6], H[ 6] ) ), \ - _mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \ - _mm512_xor_si512( M[13], H[13] ) ) ) + _mm512_sub_epi64( _mm512_sub_epi64( mh[12], mh[ 4] ), mh[ 6] ), \ + _mm512_sub_epi64( mh[ 9], mh[13] ) ) void compress_big_8way( const __m512i *M, const __m512i H[16], __m512i dH[16] ) { __m512i qt[32], xl, xh; + __m512i mh[16]; + int i; + + for ( i = 0; i < 16; i++ ) + mh[i] = _mm512_xor_si512( M[i], H[i] ); qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] ); qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] ); @@ -1268,22 +1169,60 @@ void compress_big_8way( const __m512i *M, const __m512i H[16], qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] ); qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] ); qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] ); - qt[16] = expand1b8( qt, M, H, 16 ); - qt[17] = expand1b8( qt, M, H, 17 ); - qt[18] = expand2b8( qt, M, H, 18 ); - qt[19] = expand2b8( qt, M, H, 19 ); - qt[20] = expand2b8( qt, M, H, 20 ); - qt[21] = expand2b8( qt, M, H, 21 ); - qt[22] = expand2b8( qt, M, H, 22 ); - qt[23] = expand2b8( qt, M, H, 23 ); - qt[24] = expand2b8( qt, M, H, 24 ); - qt[25] = expand2b8( qt, M, H, 25 ); - qt[26] = expand2b8( qt, M, H, 26 ); - qt[27] = expand2b8( qt, M, H, 27 ); - qt[28] = expand2b8( qt, M, H, 28 ); - qt[29] = expand2b8( qt, M, H, 29 ); - qt[30] = expand2b8( qt, M, H, 30 ); - qt[31] = expand2b8( qt, M, H, 31 ); + + __m512i mj[16]; + for ( i = 0; i < 16; i++ ) + mj[i] = rol8w_off_64( M, i ); + + qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], + (const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) ); + qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8], + (const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) ); + qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9], + (const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) ); + qt[19] = add_elt_b8( mj[ 3], mj[ 
6], mj[13], H[10], + (const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) ); + qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11], + (const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) ); + qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12], + (const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) ); + qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13], + (const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) ); + qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14], + (const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) ); + qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15], + (const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) ); + qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0], + (const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) ); + qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1], + (const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) ); + qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2], + (const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) ); + qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3], + (const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) ); + qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4], + (const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) ); + qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5], + (const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) ); + qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6], + (const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) ); + + qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) ); + qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) ); + qt[18] = _mm512_add_epi64( qt[18], expand2_b8( qt, 18 ) ); + qt[19] = _mm512_add_epi64( qt[19], expand2_b8( qt, 19 ) ); + qt[20] = _mm512_add_epi64( qt[20], expand2_b8( qt, 20 ) ); + qt[21] = _mm512_add_epi64( qt[21], expand2_b8( qt, 21 ) ); + qt[22] = _mm512_add_epi64( qt[22], expand2_b8( qt, 22 ) ); + qt[23] = _mm512_add_epi64( qt[23], expand2_b8( qt, 23 ) ); + qt[24] = _mm512_add_epi64( qt[24], expand2_b8( qt, 24 ) ); + qt[25] = _mm512_add_epi64( qt[25], expand2_b8( qt, 25 ) ); + qt[26] = _mm512_add_epi64( qt[26], expand2_b8( qt, 26 ) ); + qt[27] = _mm512_add_epi64( qt[27], expand2_b8( qt, 27 ) ); + qt[28] = _mm512_add_epi64( qt[28], expand2_b8( qt, 28 ) ); + qt[29] = _mm512_add_epi64( qt[29], expand2_b8( qt, 29 ) ); + qt[30] = _mm512_add_epi64( qt[30], expand2_b8( qt, 30 ) ); + qt[31] = _mm512_add_epi64( qt[31], expand2_b8( qt, 31 ) ); xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ), mm512_xor3( qt[19], qt[20], qt[21] ), diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index 1201b8f..06f7e09 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -98,6 +98,138 @@ static void transform_4way( cube_4way_context *sp ) _mm512_store_si512( (__m512i*)sp->h + 7, x7 ); } +// 8 ways, 4 way parallel double buffered +static void transform_4way_2buf( cube_4way_2buf_context *sp ) +{ + int r; + const int rounds = sp->rounds; + + __m512i x0, x1, x2, x3, x4, x5, x6, x7; + __m512i y0, y1, y2, y3, y4, y5, y6, y7; + __m512i tx0, tx1, ty0, ty1; + + x0 = _mm512_load_si512( (__m512i*)sp->h0 ); + x1 = _mm512_load_si512( (__m512i*)sp->h0 + 1 ); + x2 = _mm512_load_si512( (__m512i*)sp->h0 + 2 ); + x3 = _mm512_load_si512( (__m512i*)sp->h0 + 3 ); + x4 = _mm512_load_si512( (__m512i*)sp->h0 + 4 ); + x5 = _mm512_load_si512( (__m512i*)sp->h0 + 5 ); + x6 = _mm512_load_si512( (__m512i*)sp->h0 + 6 ); + x7 = 
_mm512_load_si512( (__m512i*)sp->h0 + 7 ); + + y0 = _mm512_load_si512( (__m512i*)sp->h1 ); + y1 = _mm512_load_si512( (__m512i*)sp->h1 + 1 ); + y2 = _mm512_load_si512( (__m512i*)sp->h1 + 2 ); + y3 = _mm512_load_si512( (__m512i*)sp->h1 + 3 ); + y4 = _mm512_load_si512( (__m512i*)sp->h1 + 4 ); + y5 = _mm512_load_si512( (__m512i*)sp->h1 + 5 ); + y6 = _mm512_load_si512( (__m512i*)sp->h1 + 6 ); + y7 = _mm512_load_si512( (__m512i*)sp->h1 + 7 ); + + + for ( r = 0; r < rounds; ++r ) + { + x4 = _mm512_add_epi32( x0, x4 ); + y4 = _mm512_add_epi32( y0, y4 ); + tx0 = x0; + ty0 = y0; + x5 = _mm512_add_epi32( x1, x5 ); + y5 = _mm512_add_epi32( y1, y5 ); + tx1 = x1; + ty1 = y1; + x0 = mm512_rol_32( x2, 7 ); + y0 = mm512_rol_32( y2, 7 ); + x6 = _mm512_add_epi32( x2, x6 ); + y6 = _mm512_add_epi32( y2, y6 ); + x1 = mm512_rol_32( x3, 7 ); + y1 = mm512_rol_32( y3, 7 ); + x7 = _mm512_add_epi32( x3, x7 ); + y7 = _mm512_add_epi32( y3, y7 ); + + + x2 = mm512_rol_32( tx0, 7 ); + y2 = mm512_rol_32( ty0, 7 ); + x0 = _mm512_xor_si512( x0, x4 ); + y0 = _mm512_xor_si512( y0, y4 ); + x4 = mm512_swap128_64( x4 ); + x3 = mm512_rol_32( tx1, 7 ); + y3 = mm512_rol_32( ty1, 7 ); + y4 = mm512_swap128_64( y4 ); + + x1 = _mm512_xor_si512( x1, x5 ); + y1 = _mm512_xor_si512( y1, y5 ); + x5 = mm512_swap128_64( x5 ); + x2 = _mm512_xor_si512( x2, x6 ); + y2 = _mm512_xor_si512( y2, y6 ); + y5 = mm512_swap128_64( y5 ); + x3 = _mm512_xor_si512( x3, x7 ); + y3 = _mm512_xor_si512( y3, y7 ); + + x6 = mm512_swap128_64( x6 ); + x4 = _mm512_add_epi32( x0, x4 ); + y4 = _mm512_add_epi32( y0, y4 ); + y6 = mm512_swap128_64( y6 ); + x5 = _mm512_add_epi32( x1, x5 ); + y5 = _mm512_add_epi32( y1, y5 ); + x7 = mm512_swap128_64( x7 ); + x6 = _mm512_add_epi32( x2, x6 ); + y6 = _mm512_add_epi32( y2, y6 ); + tx0 = x0; + ty0 = y0; + y7 = mm512_swap128_64( y7 ); + tx1 = x2; + ty1 = y2; + x0 = mm512_rol_32( x1, 11 ); + y0 = mm512_rol_32( y1, 11 ); + + x7 = _mm512_add_epi32( x3, x7 ); + y7 = _mm512_add_epi32( y3, y7 ); + + x1 = mm512_rol_32( tx0, 11 ); + y1 = mm512_rol_32( ty0, 11 ); + x0 = _mm512_xor_si512( x0, x4 ); + x4 = mm512_swap64_32( x4 ); + y0 = _mm512_xor_si512( y0, y4 ); + x2 = mm512_rol_32( x3, 11 ); + y4 = mm512_swap64_32( y4 ); + y2 = mm512_rol_32( y3, 11 ); + x1 = _mm512_xor_si512( x1, x5 ); + x5 = mm512_swap64_32( x5 ); + y1 = _mm512_xor_si512( y1, y5 ); + x3 = mm512_rol_32( tx1, 11 ); + y5 = mm512_swap64_32( y5 ); + y3 = mm512_rol_32( ty1, 11 ); + + x2 = _mm512_xor_si512( x2, x6 ); + x6 = mm512_swap64_32( x6 ); + y2 = _mm512_xor_si512( y2, y6 ); + y6 = mm512_swap64_32( y6 ); + x3 = _mm512_xor_si512( x3, x7 ); + x7 = mm512_swap64_32( x7 ); + y3 = _mm512_xor_si512( y3, y7 ); + + y7 = mm512_swap64_32( y7 ); + } + + _mm512_store_si512( (__m512i*)sp->h0, x0 ); + _mm512_store_si512( (__m512i*)sp->h0 + 1, x1 ); + _mm512_store_si512( (__m512i*)sp->h0 + 2, x2 ); + _mm512_store_si512( (__m512i*)sp->h0 + 3, x3 ); + _mm512_store_si512( (__m512i*)sp->h0 + 4, x4 ); + _mm512_store_si512( (__m512i*)sp->h0 + 5, x5 ); + _mm512_store_si512( (__m512i*)sp->h0 + 6, x6 ); + _mm512_store_si512( (__m512i*)sp->h0 + 7, x7 ); + + _mm512_store_si512( (__m512i*)sp->h1, y0 ); + _mm512_store_si512( (__m512i*)sp->h1 + 1, y1 ); + _mm512_store_si512( (__m512i*)sp->h1 + 2, y2 ); + _mm512_store_si512( (__m512i*)sp->h1 + 3, y3 ); + _mm512_store_si512( (__m512i*)sp->h1 + 4, y4 ); + _mm512_store_si512( (__m512i*)sp->h1 + 5, y5 ); + _mm512_store_si512( (__m512i*)sp->h1 + 6, y6 ); + _mm512_store_si512( (__m512i*)sp->h1 + 7, y7 ); +} + int cube_4way_init( cube_4way_context *sp, int 
hashbitlen, int rounds, int blockbytes ) { @@ -219,6 +351,67 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, return 0; } +int cube_4way_2buf_full( cube_4way_2buf_context *sp, + void *output0, void *output1, int hashbitlen, + const void *data0, const void *data1, size_t size ) +{ + __m512i *h0 = (__m512i*)sp->h0; + __m512i *h1 = (__m512i*)sp->h1; + __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512 + : (__m128i*)IV256 ); + sp->hashlen = hashbitlen/128; + sp->blocksize = 32/16; + sp->rounds = 16; + sp->pos = 0; + + h1[0] = h0[0] = m512_const1_128( iv[0] ); + h1[1] = h0[1] = m512_const1_128( iv[1] ); + h1[2] = h0[2] = m512_const1_128( iv[2] ); + h1[3] = h0[3] = m512_const1_128( iv[3] ); + h1[4] = h0[4] = m512_const1_128( iv[4] ); + h1[5] = h0[5] = m512_const1_128( iv[5] ); + h1[6] = h0[6] = m512_const1_128( iv[6] ); + h1[7] = h0[7] = m512_const1_128( iv[7] ); + + const int len = size >> 4; + const __m512i *in0 = (__m512i*)data0; + const __m512i *in1 = (__m512i*)data1; + __m512i *hash0 = (__m512i*)output0; + __m512i *hash1 = (__m512i*)output1; + int i; + + for ( i = 0; i < len; i++ ) + { + sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], in0[i] ); + sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], in1[i] ); + sp->pos++; + if ( sp->pos == sp->blocksize ) + { + transform_4way_2buf( sp ); + sp->pos = 0; + } + } + + // pos is zero for 64 byte data, 1 for 80 byte data. + __m512i tmp = m512_const2_64( 0, 0x0000000000000080 ); + sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], tmp ); + sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], tmp ); + + transform_4way_2buf( sp ); + + tmp = m512_const2_64( 0x0000000100000000, 0 ); + sp->h0[7] = _mm512_xor_si512( sp->h0[7], tmp ); + sp->h1[7] = _mm512_xor_si512( sp->h1[7], tmp ); + + for ( i = 0; i < 10; ++i ) + transform_4way_2buf( sp ); + + memcpy( hash0, sp->h0, sp->hashlen<<6); + memcpy( hash1, sp->h1, sp->hashlen<<6); + + return 0; +} + int cube_4way_update_close( cube_4way_context *sp, void *output, const void *data, size_t size ) @@ -259,6 +452,21 @@ int cube_4way_update_close( cube_4way_context *sp, void *output, // 2 way 128 +// This isn't expected to be used with AVX512 so HW rotate intruction +// is assumed not avaiable. +// Use double buffering to optimize serial bit rotations. Full double +// buffering isn't practical because it needs twice as many registers +// with AVX2 having only half as many as AVX512. 
+#define ROL2( out0, out1, in0, in1, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi32( in0, c ); \ + __m256i t1 = _mm256_slli_epi32( in1, c ); \ + out0 = _mm256_srli_epi32( in0, 32-(c) ); \ + out1 = _mm256_srli_epi32( in1, 32-(c) ); \ + out0 = _mm256_or_si256( out0, t0 ); \ + out1 = _mm256_or_si256( out1, t1 ); \ +} + static void transform_2way( cube_2way_context *sp ) { int r; @@ -283,35 +491,31 @@ static void transform_2way( cube_2way_context *sp ) x7 = _mm256_add_epi32( x3, x7 ); y0 = x0; y1 = x1; - x0 = mm256_rol_32( x2, 7 ); - x1 = mm256_rol_32( x3, 7 ); - x2 = mm256_rol_32( y0, 7 ); - x3 = mm256_rol_32( y1, 7 ); + ROL2( x0, x1, x2, x3, 7 ); + ROL2( x2, x3, y0, y1, 7 ); x0 = _mm256_xor_si256( x0, x4 ); - x1 = _mm256_xor_si256( x1, x5 ); - x2 = _mm256_xor_si256( x2, x6 ); - x3 = _mm256_xor_si256( x3, x7 ); x4 = mm256_swap128_64( x4 ); - x5 = mm256_swap128_64( x5 ); - x6 = mm256_swap128_64( x6 ); - x7 = mm256_swap128_64( x7 ); - x4 = _mm256_add_epi32( x0, x4 ); - x5 = _mm256_add_epi32( x1, x5 ); - x6 = _mm256_add_epi32( x2, x6 ); - x7 = _mm256_add_epi32( x3, x7 ); - y0 = x0; - y1 = x2; - x0 = mm256_rol_32( x1, 11 ); - x1 = mm256_rol_32( y0, 11 ); - x2 = mm256_rol_32( x3, 11 ); - x3 = mm256_rol_32( y1, 11 ); - x0 = _mm256_xor_si256( x0, x4 ); x1 = _mm256_xor_si256( x1, x5 ); x2 = _mm256_xor_si256( x2, x6 ); + x5 = mm256_swap128_64( x5 ); x3 = _mm256_xor_si256( x3, x7 ); + x4 = _mm256_add_epi32( x0, x4 ); + x6 = mm256_swap128_64( x6 ); + y0 = x0; + x5 = _mm256_add_epi32( x1, x5 ); + x7 = mm256_swap128_64( x7 ); + x6 = _mm256_add_epi32( x2, x6 ); + y1 = x2; + ROL2( x0, x1, x1, y0, 11 ); + x7 = _mm256_add_epi32( x3, x7 ); + ROL2( x2, x3, x3, y1, 11 ); + x0 = _mm256_xor_si256( x0, x4 ); x4 = mm256_swap64_32( x4 ); + x1 = _mm256_xor_si256( x1, x5 ); x5 = mm256_swap64_32( x5 ); + x2 = _mm256_xor_si256( x2, x6 ); x6 = mm256_swap64_32( x6 ); + x3 = _mm256_xor_si256( x3, x7 ); x7 = mm256_swap64_32( x7 ); } diff --git a/algo/cubehash/cube-hash-2way.h b/algo/cubehash/cube-hash-2way.h index 25df10e..a31ffde 100644 --- a/algo/cubehash/cube-hash-2way.h +++ b/algo/cubehash/cube-hash-2way.h @@ -17,41 +17,41 @@ struct _cube_4way_context int pos; } __attribute__ ((aligned (128))); +struct _cube_4way_2buf_context +{ + __m512i h0[8]; + __m512i h1[8]; + int hashlen; + int rounds; + int blocksize; + int pos; +} __attribute__ ((aligned (128))); + + typedef struct _cube_4way_context cube_4way_context; +typedef struct _cube_4way_2buf_context cube_4way_2buf_context; + int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds, - int blockbytes ); + int blockbytes ); + int cube_4way_update( cube_4way_context *sp, const void *data, size_t size ); + int cube_4way_close( cube_4way_context *sp, void *output ); + int cube_4way_update_close( cube_4way_context *sp, void *output, const void *data, size_t size ); + int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, const void *data, size_t size ); -int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen, - const void *data, size_t size ); - -#define cube512_4way_init( sp ) cube_4way_update( sp, 512 ) -#define cube512_4way_update cube_4way_update -#define cube512_4way_update_close cube_4way_update -#define cube512_4way_close cube_4way_update -#define cube512_4way_full( sp, output, data, size ) \ - cube_4way_full( sp, output, 512, data, size ) -#define cube512_4x256_full( sp, output, data, size ) \ - cube_4x256_full( sp, output, 512, data, size ) - -#define cube256_4way_init( sp ) cube_4way_update( sp, 256 ) -#define 
cube256_4way_update cube_4way_update -#define cube256_4way_update_close cube_4way_update -#define cube256_4way_close cube_4way_update -#define cube256_4way_full( sp, output, data, size ) \ - cube_4way_full( sp, output, 256, data, size ) -#define cube256_4x256_full( sp, output, data, size ) \ - cube_4x256_full( sp, output, 256, data, size ) +int cube_4way_2buf_full( cube_4way_2buf_context *sp, + void *output0, void *output1, int hashbitlen, + const void *data0, const void *data1, size_t size ); #endif -// 2x128, 2 way parallel SSE2 +// 2x128, 2 way parallel AVX2 struct _cube_2way_context { diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index c87829d..5ea1b6f 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -31,10 +31,14 @@ static void transform( cubehashParam *sp ) for ( r = 0; r < rounds; ++r ) { x1 = _mm512_add_epi32( x0, x1 ); - x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 ); - x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) ); - x0 = _mm512_xor_si512( mm512_rol_32( - mm512_swap256_128( x0 ), 11 ), x1 ); + x0 = mm512_swap_256( x0 ); + x0 = mm512_rol_32( x0, 7 ); + x0 = _mm512_xor_si512( x0, x1 ); + x1 = mm512_swap128_64( x1 ); + x1 = _mm512_add_epi32( x0, x1 ); + x0 = mm512_swap256_128( x0 ); + x0 = mm512_rol_32( x0, 11 ); + x0 = _mm512_xor_si512( x0, x1 ); x1 = mm512_swap64_32( x1 ); } diff --git a/algo/groestl/aes_ni/hash-groestl.h b/algo/groestl/aes_ni/hash-groestl.h index 595dc3d..b76d809 100644 --- a/algo/groestl/aes_ni/hash-groestl.h +++ b/algo/groestl/aes_ni/hash-groestl.h @@ -43,7 +43,8 @@ #define ROUNDS (ROUNDS1024) //#endif -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +#define ROTL64(a,n) rol64( a, n ) #if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) #define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) diff --git a/algo/groestl/aes_ni/hash-groestl256.h b/algo/groestl/aes_ni/hash-groestl256.h index 9410266..32ce1a5 100644 --- a/algo/groestl/aes_ni/hash-groestl256.h +++ b/algo/groestl/aes_ni/hash-groestl256.h @@ -63,7 +63,8 @@ typedef crypto_uint64 u64; //#define ROUNDS (ROUNDS1024) //#endif -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +//#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) +#define ROTL64(a,n) rol64( a, n ) #if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) #define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) diff --git a/algo/groestl/myr-groestl.c b/algo/groestl/myr-groestl.c index 5a67303..4f17c64 100644 --- a/algo/groestl/myr-groestl.c +++ b/algo/groestl/myr-groestl.c @@ -11,7 +11,7 @@ #else #include "sph_groestl.h" #endif -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" typedef struct { #ifdef __AES__ @@ -19,7 +19,6 @@ typedef struct { #else sph_groestl512_context groestl; #endif - sph_sha256_context sha; } myrgr_ctx_holder; myrgr_ctx_holder myrgr_ctx; @@ -31,7 +30,6 @@ void init_myrgr_ctx() #else sph_groestl512_init( &myrgr_ctx.groestl ); #endif - sph_sha256_init( &myrgr_ctx.sha ); } void myriad_hash(void *output, const void *input) @@ -49,8 +47,7 @@ void myriad_hash(void *output, const void *input) sph_groestl512_close(&ctx.groestl, hash); #endif - sph_sha256( &ctx.sha, hash, 64 ); - sph_sha256_close( &ctx.sha, hash ); + sha256_full( hash, hash, 64 ); memcpy(output, hash, 32); } diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 9944ebe..26e133c 100644 --- 
a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -632,26 +632,25 @@ do { \ } while (0) -#define ROUND_BIG8(rc, alpha) \ +#define ROUND_BIG8( alpha ) \ do { \ __m512i t0, t1, t2, t3; \ - s0 = _mm512_xor_si512( s0, m512_const1_64( \ - ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ - s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ - s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ - s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ - s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ - s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ - s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ - s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ - s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ - s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ - sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ - sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ - sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ - sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ - sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ - sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ + s0 = _mm512_xor_si512( s0, alpha[ 0] ); \ + s1 = _mm512_xor_si512( s1, alpha[ 1] ); \ + s2 = _mm512_xor_si512( s2, alpha[ 2] ); \ + s3 = _mm512_xor_si512( s3, alpha[ 3] ); \ + s4 = _mm512_xor_si512( s4, alpha[ 4] ); \ + s5 = _mm512_xor_si512( s5, alpha[ 5] ); \ + s6 = _mm512_xor_si512( s6, alpha[ 6] ); \ + s7 = _mm512_xor_si512( s7, alpha[ 7] ); \ + s8 = _mm512_xor_si512( s8, alpha[ 8] ); \ + s9 = _mm512_xor_si512( s9, alpha[ 9] ); \ + sA = _mm512_xor_si512( sA, alpha[10] ); \ + sB = _mm512_xor_si512( sB, alpha[11] ); \ + sC = _mm512_xor_si512( sC, alpha[12] ); \ + sD = _mm512_xor_si512( sD, alpha[13] ); \ + sE = _mm512_xor_si512( sE, alpha[14] ); \ + sF = _mm512_xor_si512( sF, alpha[15] ); \ \ SBOX8( s0, s4, s8, sC ); \ SBOX8( s1, s5, s9, sD ); \ @@ -731,28 +730,66 @@ do { \ #define P_BIG8 \ do { \ - ROUND_BIG8(0, alpha_n); \ - ROUND_BIG8(1, alpha_n); \ - ROUND_BIG8(2, alpha_n); \ - ROUND_BIG8(3, alpha_n); \ - ROUND_BIG8(4, alpha_n); \ - ROUND_BIG8(5, alpha_n); \ + __m512i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG8( alpha ); \ } while (0) #define PF_BIG8 \ do { \ - ROUND_BIG8( 0, alpha_f); \ - ROUND_BIG8( 1, alpha_f); \ - ROUND_BIG8( 2, alpha_f); \ - ROUND_BIG8( 3, alpha_f); \ - ROUND_BIG8( 4, alpha_f); \ - ROUND_BIG8( 5, alpha_f); \ - ROUND_BIG8( 6, alpha_f); \ - ROUND_BIG8( 7, alpha_f); \ - ROUND_BIG8( 8, alpha_f); \ - ROUND_BIG8( 9, alpha_f); \ - ROUND_BIG8(10, alpha_f); \ - 
ROUND_BIG8(11, alpha_f); \ + __m512i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ + alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG8( alpha ); \ } while (0) #define T_BIG8 \ @@ -965,26 +1002,25 @@ do { \ #define sF m7 */ -#define ROUND_BIG(rc, alpha) \ +#define ROUND_BIG( alpha ) \ do { \ __m256i t0, t1, t2, t3; \ - s0 = _mm256_xor_si256( s0, m256_const1_64( \ - ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ - s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ - s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ - s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ - s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ - s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ - s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ - s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ - s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ - s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ - sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ - sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ - sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ - sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ - sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ - sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ + s0 = _mm256_xor_si256( s0, alpha[ 0] ); \ + s1 = _mm256_xor_si256( s1, alpha[ 1] ); \ + s2 = _mm256_xor_si256( s2, alpha[ 2] ); \ + s3 = _mm256_xor_si256( s3, alpha[ 3] ); \ + s4 = _mm256_xor_si256( s4, alpha[ 4] ); \ + s5 = _mm256_xor_si256( s5, alpha[ 5] ); \ + s6 = _mm256_xor_si256( s6, alpha[ 6] ); \ + s7 = _mm256_xor_si256( s7, alpha[ 7] ); \ + s8 = _mm256_xor_si256( s8, alpha[ 8] ); \ + s9 = _mm256_xor_si256( s9, alpha[ 9] ); \ + sA = _mm256_xor_si256( sA, alpha[10] ); \ + sB = _mm256_xor_si256( sB, alpha[11] ); \ + sC = _mm256_xor_si256( sC, alpha[12] ); \ + sD = _mm256_xor_si256( sD, alpha[13] ); \ + sE = _mm256_xor_si256( sE, alpha[14] ); \ + sF = 
_mm256_xor_si256( sF, alpha[15] ); \ \ SBOX( s0, s4, s8, sC ); \ SBOX( s1, s5, s9, sD ); \ @@ -1064,28 +1100,66 @@ do { \ #define P_BIG \ do { \ - ROUND_BIG(0, alpha_n); \ - ROUND_BIG(1, alpha_n); \ - ROUND_BIG(2, alpha_n); \ - ROUND_BIG(3, alpha_n); \ - ROUND_BIG(4, alpha_n); \ - ROUND_BIG(5, alpha_n); \ + __m256i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_n )[0] ); \ + ROUND_BIG( alpha ); \ } while (0) #define PF_BIG \ do { \ - ROUND_BIG( 0, alpha_f); \ - ROUND_BIG( 1, alpha_f); \ - ROUND_BIG( 2, alpha_f); \ - ROUND_BIG( 3, alpha_f); \ - ROUND_BIG( 4, alpha_f); \ - ROUND_BIG( 5, alpha_f); \ - ROUND_BIG( 6, alpha_f); \ - ROUND_BIG( 7, alpha_f); \ - ROUND_BIG( 8, alpha_f); \ - ROUND_BIG( 9, alpha_f); \ - ROUND_BIG(10, alpha_f); \ - ROUND_BIG(11, alpha_f); \ + __m256i alpha[16]; \ + for( int i = 0; i < 16; i++ ) \ + alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ + alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \ + ^ ( (uint64_t*)alpha_f )[0] ); \ + ROUND_BIG( alpha ); \ } while (0) #define T_BIG \ diff --git a/algo/hodl/hodl-wolf.c b/algo/hodl/hodl-wolf.c index 6ff6175..7ce79da 100644 --- a/algo/hodl/hodl-wolf.c +++ b/algo/hodl/hodl-wolf.c @@ -7,6 +7,7 @@ #include "hodl-gate.h" #include "hodl-wolf.h" #include "miner.h" +#include "algo/sha/sha256d.h" #if defined(__AES__) diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index 282ae91..c710836 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -1,5 +1,6 @@ #include "keccak-gate.h" #include "sph_keccak.h" +#include "algo/sha/sha256d.h" int hard_coded_eb = 1; diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index e2545b4..af37d6f 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -70,13 +70,13 @@ static const 
uint64_t RC[] = { // Targetted macros, keccak-macros.h is included for each target. -#define DECL64(x) __m512i x -#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) -#define AND64(d, a, b) (d = _mm512_and_si512(a,b)) -#define OR64(d, a, b) (d = _mm512_or_si512(a,b)) -#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) -#define ROL64(d, v, n) (d = mm512_rol_64(v, n)) -#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) +#define DECL64(x) __m512i x +#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b)) +#define AND64(d, a, b) (d = _mm512_and_si512(a,b)) +#define OR64(d, a, b) (d = _mm512_or_si512(a,b)) +#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) +#define ROL64(d, v, n) (d = mm512_rol_64(v, n)) +#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) #define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c)) diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index 833b87e..f15648a 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -16,7 +16,7 @@ typedef struct { blake256_16way_context blake; keccak256_8way_context keccak; - cube_4way_context cube; + cube_4way_2buf_context cube; skein256_8way_context skein; #if defined(__VAES__) groestl256_4way_context groestl; @@ -30,13 +30,7 @@ static __thread allium_16way_ctx_holder allium_16way_ctx; bool init_allium_16way_ctx() { keccak256_8way_init( &allium_16way_ctx.keccak ); - cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 ); skein256_8way_init( &allium_16way_ctx.skein ); -#if defined(__VAES__) - groestl256_4way_init( &allium_16way_ctx.groestl, 32 ); -#else - init_groestl256( &allium_16way_ctx.groestl, 32 ); -#endif return true; } @@ -111,12 +105,11 @@ void allium_16way_hash( void *state, const void *input ) intrlv_2x256( vhash, hash14, hash15, 256 ); LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash14, hash15, vhash, 256 ); - + intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 ); - cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); - cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 ); dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 ); @@ -124,8 +117,7 @@ void allium_16way_hash( void *state, const void *input ) intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 ); intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 ); - cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); - cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 256, vhashA, vhashB, 32 ); dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 ); dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 ); @@ -255,7 +247,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, typedef struct { blake256_8way_context blake; keccak256_4way_context keccak; - cubehashParam cube; + cube_2way_context cube; skein256_4way_context skein; #if defined(__VAES__) groestl256_2way_context groestl; @@ -269,13 +261,7 @@ static __thread allium_8way_ctx_holder allium_8way_ctx; bool init_allium_8way_ctx() { keccak256_4way_init( &allium_8way_ctx.keccak ); - cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 ); skein256_4way_init( &allium_8way_ctx.skein ); -#if defined(__VAES__) - groestl256_2way_init( &allium_8way_ctx.groestl, 32 ); -#else - init_groestl256( &allium_8way_ctx.groestl, 32 ); -#endif return true; } @@ -320,21 +306,20 @@ void allium_8way_hash( void *hash, 
const void *input ) LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 ); LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 ); + + intrlv_2x128( vhashA, hash0, hash1, 256 ); + intrlv_2x128( vhashB, hash2, hash3, 256 ); + cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); + cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + dintrlv_2x128( hash0, hash1, vhashA, 256 ); + dintrlv_2x128( hash2, hash3, vhashB, 256 ); + + intrlv_2x128( vhashA, hash4, hash5, 256 ); + intrlv_2x128( vhashB, hash6, hash7, 256 ); + cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 ); + cube_2way_full( &ctx.cube, vhashB, 256, vhashB, 32 ); + dintrlv_2x128( hash4, hash5, vhashA, 256 ); + dintrlv_2x128( hash6, hash7, vhashB, 256 ); LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index b24b173..1c90444 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -66,13 +66,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ G2W_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_ror256_64( s1); \ + s3 = mm512_shufll256_64( s3 ); \ + s1 = mm512_shuflr256_64( s1); \ s2 = mm512_swap256_128( s2 ); \ - s3 = mm512_rol256_64( s3 ); \ G2W_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_rol256_64( s1 ); \ - s2 = mm512_swap256_128( s2 ); \ - s3 = mm512_ror256_64( s3 ); + s3 = mm512_shuflr256_64( s3 ); \ + s1 = mm512_shufll256_64( s1 ); \ + s2 = mm512_swap256_128( s2 ); #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \ LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ @@ -107,13 +107,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ G_4X64( s0, s1, s2, s3 ); \ - s1 = mm256_ror_1x64( s1); \ + s3 = mm256_shufll_64( s3 ); \ + s1 = mm256_shuflr_64( s1); \ s2 = mm256_swap_128( s2 ); \ - s3 = mm256_rol_1x64( s3 ); \ G_4X64( s0, s1, s2, s3 ); \ - s1 = mm256_rol_1x64( s1 ); \ - s2 = mm256_swap_128( s2 ); \ - s3 = mm256_ror_1x64( s3 ); + s3 = mm256_shuflr_64( s3 ); \ + s1 = mm256_shufll_64( s1 ); \ + s2 = mm256_swap_128( s2 ); #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ @@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_ror256_64( s2, s3 ); \ + mm128_vrol256_64( s6, s7 ); \ + mm128_vror256_64( s2, s3 ); \ mm128_swap256_128( s4, s5 ); \ - mm128_rol256_64( s6, s7 ); \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - 
mm128_rol256_64( s2, s3 ); \ - mm128_swap256_128( s4, s5 ); \ - mm128_ror256_64( s6, s7 ); + mm128_vror256_64( s6, s7 ); \ + mm128_vrol256_64( s2, s3 ); \ + mm128_swap256_128( s4, s5 ); #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ diff --git a/algo/m7m/m7m.c b/algo/m7m/m7m.c index ab13a7e..2bf4a11 100644 --- a/algo/m7m/m7m.c +++ b/algo/m7m/m7m.c @@ -13,6 +13,7 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/ripemd/sph_ripemd.h" #include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #define EPSa DBL_EPSILON #define EPS1 DBL_EPSILON @@ -104,8 +105,8 @@ uint32_t sw2_( int nnounce ) } typedef struct { - sph_sha256_context sha256; - sph_sha512_context sha512; + sha256_context sha256; + sph_sha512_context sha512; sph_keccak512_context keccak; sph_whirlpool_context whirlpool; sph_haval256_5_context haval; @@ -117,7 +118,7 @@ m7m_ctx_holder m7m_ctx; void init_m7m_ctx() { - sph_sha256_init( &m7m_ctx ); + sha256_ctx_init( &m7m_ctx.sha256 ); sph_sha512_init( &m7m_ctx.sha512 ); sph_keccak512_init( &m7m_ctx.keccak ); sph_whirlpool_init( &m7m_ctx.whirlpool ); @@ -153,11 +154,10 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64))); memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) ); - sph_sha256_context ctxf_sha256; memcpy(data, pdata, 80); - sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN ); + sha256_update( &ctx1.sha256, data, M7_MIDSTATE_LEN ); sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN ); sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN ); sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN ); @@ -189,8 +189,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) ); - sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN ); - sph_sha256_close( &ctx2.sha256, bhash[0] ); + sha256_update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN ); + sha256_final( &ctx2.sha256, bhash[0] ); sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN ); sph_sha512_close( &ctx2.sha512, bhash[1] ); @@ -225,9 +225,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, bytes = mpz_sizeinbase(product, 256); mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product); - sph_sha256_init( &ctxf_sha256 ); - sph_sha256( &ctxf_sha256, bdata, bytes ); - sph_sha256_close( &ctxf_sha256, hash ); + sha256_full( hash, bdata, bytes ); digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75); mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16); @@ -260,10 +258,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, mpzscale=bytes; mpz_export(bdata, NULL, -1, 1, 0, 0, product); - sph_sha256_init( &ctxf_sha256 ); - sph_sha256( &ctxf_sha256, bdata, bytes ); - sph_sha256_close( &ctxf_sha256, hash ); - } + sha256_full( hash, bdata, bytes ); + } if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget ) && !opt_benchmark ) ) diff --git a/algo/ripemd/lbry.c b/algo/ripemd/lbry.c index 94f3417..e91b287 100644 --- a/algo/ripemd/lbry.c +++ b/algo/ripemd/lbry.c @@ -7,24 +7,19 @@ #include #include #include "sph_ripemd.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" void lbry_hash(void* output, const void* input) { - sph_sha256_context ctx_sha256 __attribute__ ((aligned (64))); + sha256_context ctx_sha256 __attribute__ ((aligned (64))); sph_sha512_context ctx_sha512 __attribute__ ((aligned (64))); sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64))); uint32_t _ALIGN(64) hashA[16]; uint32_t _ALIGN(64) 
hashB[16]; uint32_t _ALIGN(64) hashC[16]; - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, input, 112 ); - sph_sha256_close( &ctx_sha256, hashA ); - - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hashA, 32 ); - sph_sha256_close( &ctx_sha256, hashA ); + sha256_full( hashA, input, 112 ); + sha256_full( hashA, hashA, 32 ); sph_sha512_init( &ctx_sha512 ); sph_sha512( &ctx_sha512, hashA, 32 ); @@ -38,15 +33,13 @@ void lbry_hash(void* output, const void* input) sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 ); sph_ripemd160_close( &ctx_ripemd, hashC ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hashB, 20 ); - sph_sha256( &ctx_sha256, hashC, 20 ); - sph_sha256_close( &ctx_sha256, hashA ); - - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hashA, 32 ); - sph_sha256_close( &ctx_sha256, hashA ); + sha256_ctx_init( &ctx_sha256 ); + sha256_update( &ctx_sha256, hashB, 20 ); + sha256_update( &ctx_sha256, hashC, 20 ); + sha256_final( &ctx_sha256, hashA ); + sha256_full( hashA, hashA, 32 ); + memcpy( output, hashA, 32 ); } diff --git a/algo/scrypt/neoscrypt.c b/algo/scrypt/neoscrypt.c index 7cb4c82..709b268 100644 --- a/algo/scrypt/neoscrypt.c +++ b/algo/scrypt/neoscrypt.c @@ -69,8 +69,12 @@ typedef unsigned int uint; #define SCRYPT_HASH_BLOCK_SIZE 64U #define SCRYPT_HASH_DIGEST_SIZE 32U -#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) -#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) +//#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) +//#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) + +#define ROTL32(a,b) rol32(a,b) +#define ROTR32(a,b) ror32(a,b) + #define U8TO32_BE(p) \ (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c new file mode 100644 index 0000000..19ff9cd --- /dev/null +++ b/algo/scrypt/scrypt-core-4way.c @@ -0,0 +1,3981 @@ +#include "scrypt-core-4way.h" + +////////////////////////////////////////////////////////////////////////// +// +// Optimized Salsa implementation inspired by Pooler. +// Any similarities are not a coincidence. +// +// Implementations include reference X64, SSE2, AVX2 & AVX512 +// using both serial and parallel vectoring using SIMD instruction. +// +// Generic macros are providedi and invoked with different targets depending +// on level of parallelism and data organization. Targets for any macros +// needed must be defined by the calling function. XOR, ROL32 and ADD32 +// are needed in all cases. Additionally ROL_1X32, SWAP_64 and ROR_1X32 +// shuffles are needed for serial SIMD. +// +// SALSA_8ROUNDS_SIMD uses vectors on serial data rather than traditional +// n-way parallel hashing. +// The SIMD version has different implied arguments {X0:X3}, representing +// an array of 4 vectors of 4 32 bit words, while the version used for +// regular parallel hashing has {x0:xf} representing array of 16 by 32 bit +// words. +// These arguments must be defined by the calling function. +// The calling function must also define targets for all macros used for +// arithmetic, logic and shuffling: XOR, ROL32, ADD32 for all targets and +// ROL_1X32, SWAP_64, ROR_1X32 for serial SIMD targets. +// +// Serial and parallel SIMD will be combined with AVX2 doing 2 way +// parallel over 4 way linear for 8 way throughput, and AVX512 doing +// 4 way parallel over 4 way linear for 16 way thoughput. 
+//
+// The term SIMD128 here refers to vectors that contain multiple contiguous
+// data from a single stream (lane) as opposed to parallel vectors that
+// contain interleaved words of data from multiple streams.
+//
+// The sequencing of techniques in the naming convention is a little
+// mixed up. The logical hierarchy top down is to put Nbuffs at the top
+// where each buffer then performs another technique.
+//
+// Although Nway and SIMD128 are listed in top down order, Nbuffs is
+// always listed last:
+//
+// scrypt_core_simd128_2way means a linear SIMD operation on 2 parallel
+// streams of data, while
+// scrypt_core_2way_simd128 is 2 parallel streams of linear SIMD vectors.
+//
+///////////////////////////////////////////////////////////////////////////
+
+
+// Used by all targets, needs XOR, ROL32 & ADD32 macros defined.
+// Function-like, the result typically overwrites in1.
+//
+#define ARX( in1, in2, in3, n ) \
+   XOR( in1, ROL32( ADD32( in2, in3 ), n ) )
+
+// Multi buffering has 2 main benefits and one drawback.
+// Traditionally double buffering has been used to empty one bucket
+// while another is filling. This requires a second (or 3rd, etc)
+// bucket. The computing analogy is to use 2 registers, 1 to read
+// and 1 to write, and switch back and forth.
+//
+// The second benefit in computing is using multiple registers to
+// provide data independence that improves multiple instruction issue and
+// pipelining in the CPU. The number of buffers is limited by the number
+// of registers available. Three seems to be a sweet spot as a 4 variable
+// data set uses 12 registers triple buffered, leaving 4 of 16 as temps.
+// Many pipelined instructions require 3 clocks to complete and triple
+// buffering keeps the pipeline full. Many execution units are also 3 wide
+// allowing up to 3 similar instructions to be issued per clock.
+// However, execution units are shared by hyperthreading which reduces
+// the effect on a single thread.
+//
+// The drawback is the increased size of the data. Although multi buffering
+// also improves memory throughput this is offset by the amount of
+// memory required and its effect on cache performance and will eventually
+// hit memory bus saturation.
+//
+// For example scryptn2 struggles with more than 4 buffers, multi
+// buffered and parallel SIMD combined, and performance drops. This can
+// be mitigated somewhat by reducing the number of CPU threads but
+// ultimately excessive multi buffering has a negative impact.
+//
+// Unlike parallel SIMD, increasing multi buffering does not require a
+// CPU technology increase, ie SSE2 to AVX2 or AVX2 to AVX512.
+// SSE2 is limited to 4 way SIMD but there is no theoretical limit to multi buffering.
+// Multi buffering also does not suffer the clock penalty of increasing
+// parallelism.
+//
+// Multi buffering implementations here focus on powers of 2
+// to match sha256 without re-interleaving the data.
+//
+// A decision will have to be made at run time, based on the N factor,
+// whether to use multi buffering or serial execution.
+
+// Need TYPE macro defined.
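As a reading aid, a minimal scalar sketch of what ARX and its multi-buffered forms compute, written in plain C. It is not part of the patch and the helper names are illustrative: ARX is one xor/rotate/add step on 32-bit words, and the 2-buffer form runs the same step on two independent streams so the CPU can overlap their dependency chains.

#include <stdint.h>

// Rotate left on 32-bit words; n is expected to be 1..31 here.
static inline uint32_t rol32_ref( uint32_t x, unsigned n )
{
    return ( x << n ) | ( x >> ( 32 - n ) );
}

// ARX( in1, in2, in3, n ) computes in1 ^ ROL32( in2 + in3, n ).
static inline uint32_t arx_ref( uint32_t in1, uint32_t in2, uint32_t in3,
                                unsigned n )
{
    return in1 ^ rol32_ref( in2 + in3, n );
}

// Two-buffer form: streams A and B are independent, so the add/rotate/xor
// chains of both can be issued and pipelined in parallel by the CPU.
static inline void arx_2buf_ref( uint32_t *a1, uint32_t a2, uint32_t a3,
                                 uint32_t *b1, uint32_t b2, uint32_t b3,
                                 unsigned n )
{
    uint32_t ta = rol32_ref( a2 + a3, n );
    uint32_t tb = rol32_ref( b2 + b3, n );
    *a1 ^= ta;
    *b1 ^= tb;
}

The ARX_2BUF/ARX_3BUF/ARX_4BUF macros below are the vector analogues of this pattern, with TYPE standing in for the SIMD register type.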
+#define ARX_2BUF( a1, a2, a3, b1, b2, b3, n ) \ +do{ \ + TYPE ta = ADD32( a2, a3 ); \ + TYPE tb = ADD32( b2, b3 ); \ + ta = ROL32( ta, n ); \ + tb = ROL32( tb, n ); \ + a1 = XOR( a1, ta ); \ + b1 = XOR( b1, tb ); \ +} while (0); + +#define ARX_3BUF( a1, a2, a3, b1, b2, b3, c1, c2, c3, n ) \ +do{ \ + TYPE ta = ADD32( a2, a3 ); \ + TYPE tb = ADD32( b2, b3 ); \ + TYPE tc = ADD32( c2, c3 ); \ + ta = ROL32( ta, n ); \ + tb = ROL32( tb, n ); \ + tc = ROL32( tc, n ); \ + a1 = XOR( a1, ta ); \ + b1 = XOR( b1, tb ); \ + c1 = XOR( c1, tc ); \ +} while (0); + +// use 16 regs AVX, AVX2, 8 buf for AVX512? +#define ARX_4BUF( a1, a2, a3, b1, b2, b3, c1, c2, c3, d1, d2, d3, n ) \ +do{ \ + TYPE ta = ADD32( a2, a3 ); \ + TYPE tb = ADD32( b2, b3 ); \ + TYPE tc = ADD32( c2, c3 ); \ + TYPE td = ADD32( d2, d3 ); \ + ta = ROL32( ta, n ); \ + tb = ROL32( tb, n ); \ + tc = ROL32( tc, n ); \ + td = ROL32( td, n ); \ + a1 = XOR( a1, ta ); \ + b1 = XOR( b1, tb ); \ + c1 = XOR( c1, tc ); \ + d1 = XOR( d1, td ); \ +} while (0); + + +// Used by SIMD128 and hybrid targets, needs also ROL_1X32, SWAP_64 & +// ROR_1X32 defined. +// +// Implied arguments ( X0 = { x3, x2, x1, x0 }, +// X1 = { x7, x6, x5, x4 }, +// X3 = { xb, xa, x9, x8 }, +// X3 = { xf, xe, xd, xc } ) +// +#define SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ) \ + /* Operate on columns */ \ + X1 = ARX( X1, X0, X3, 7 ); /* ( x4, x0, xc, 7 ) */ \ + X2 = ARX( X2, X1, X0, 9 ); /* ( x8, x4, x0, 9 ) */ \ + X3 = ARX( X3, X2, X1, 13 ); /* ( xc, x8, x4, 13 ) */ \ + X0 = ARX( X0, X3, X2, 18 ); /* ( x0, xc, x8, 18 ) */ \ + /* Rearrange data */ \ + X1 = ROL_1X32( X1 ); \ + X3 = ROR_1X32( X3 ); \ + X2 = SWAP_64( X2 ); \ + /* Operate on rows */ \ + X3 = ARX( X3, X0, X1, 7 ); \ + X2 = ARX( X2, X3, X0, 9 ); \ + X1 = ARX( X1, X2, X3, 13 ); \ + X0 = ARX( X0, X1, X2, 18 ); \ + /* Rearrange data */ \ + X3 = ROL_1X32( X3 ); \ + X1 = ROR_1X32( X1 ); \ + X2 = SWAP_64( X2 ); \ + +// Final round optimization, don't rearange data back to original order on exit +// Used only on pre-AVX2 CPUs where blend instruction is not avaiable. +// It saves a few redundant shuffles. 
+#define SALSA_2ROUNDS_FINAL_SIMD128( X0, X1, X2, X3 ) \ + /* Operate on columns */ \ + X1 = ARX( X1, X0, X3, 7 ); /* ( x4, x0, xc, 7 ) */ \ + X2 = ARX( X2, X1, X0, 9 ); /* ( x8, x4, x0, 9 ) */ \ + X3 = ARX( X3, X2, X1, 13 ); /* ( xc, x8, x4, 13 ) */ \ + X0 = ARX( X0, X3, X2, 18 ); /* ( x0, xc, x8, 18 ) */ \ + /* Rearrange data */ \ + X1 = ROL_1X32( X1 ); \ + X3 = ROR_1X32( X3 ); \ + X2 = SWAP_64( X2 ); \ + /* Operate on rows */ \ + X3 = ARX( X3, X0, X1, 7 ); \ + X2 = ARX( X2, X3, X0, 9 ); \ + X1 = ARX( X1, X2, X3, 13 ); \ + X0 = ARX( X0, X1, X2, 18 ); \ + /* Final round, don't rearrange data + X1 = ROR_1X32( X1 ); \ + X2 = SWAP_64( X2 ); \ + X3 = ROL_1X32( X3 ); */ + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3 ) +#define SALSA_2ROUNDS_SIMD128_2BUF \ + ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ + ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ + ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ + ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ + ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ + ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ + ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); + +#define SALSA_2ROUNDS_FINAL_SIMD128_2BUF \ + ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ + ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ + ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ + ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ + ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ + ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ + ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, +// XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3 ) +#define SALSA_2ROUNDS_SIMD128_4BUF \ + ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ + ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ + ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ + ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XD1 = ROL_1X32( XD1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XD3 = ROR_1X32( XD3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XD2 = SWAP_64( XD2 ); \ + ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ + ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ + ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ + ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, XD0, XD1, XD2, 18 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XC3 = ROL_1X32( XC3 ); \ + XD3 = ROL_1X32( XD3 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC1 = ROR_1X32( XC1 ); \ + XD1 = ROR_1X32( XD1 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XD2 = SWAP_64( XD2 ); + +#define 
SALSA_2ROUNDS_FINAL_SIMD128_4BUF \ + ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ + ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ + ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ + ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XD1 = ROL_1X32( XD1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XD3 = ROR_1X32( XD3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XD2 = SWAP_64( XD2 ); \ + ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ + ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ + ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ + ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, XD0, XD1, XD2, 18 ); + +// Inlined ARX +#define SALSA_2ROUNDS_SIMD128_3BUF \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE TC = ADD32( XC0, XC3 ); \ + TA = ROL32( TA, 7 ); \ + TB = ROL32( TB, 7 ); \ + TC = ROL32( TC, 7 ); \ + XA1 = XOR( XA1, TA ); \ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + TC = ADD32( XC1, XC0 ); \ + TA = ROL32( TA, 9 ); \ + TB = ROL32( TB, 9 ); \ + TC = ROL32( TC, 9 ); \ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + TC = ADD32( XC2, XC1 ); \ + TA = ROL32( TA, 13 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = XOR( XA3, TA ); \ + TB = ROL32( TB, 13 ); \ + XB3 = XOR( XB3, TB ); \ + TC = ROL32( TC, 13 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + TC = ADD32( XC3, XC2 ); \ + TA = ROL32( TA, 18 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + XA0 = XOR( XA0, TA ); \ + TB = ROL32( TB, 18 ); \ + XB0 = XOR( XB0, TB ); \ + TC = ROL32( TC, 18 ); \ + XC0 = XOR( XC0, TC ); \ +\ + TA = ADD32( XA0, XA1 ); \ + TB = ADD32( XB0, XB1 ); \ + TC = ADD32( XC0, XC1 ); \ + TA = ROL32( TA, 7 ); \ + XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, TA ); \ + TB = ROL32( TB, 7 ); \ + XB3 = ROR_1X32( XB3 ); \ + XB3 = XOR( XB3, TB ); \ + TC = ROL32( TC, 7 ); \ + XC3 = ROR_1X32( XC3 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + TC = ADD32( XC3, XC0 ); \ + TA = ROL32( TA, 9 ); \ + TB = ROL32( TB, 9 ); \ + TC = ROL32( TC, 9 ); \ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + TA = ROL32( TA, 13 ); \ + TC = ADD32( XC2, XC3 ); \ + XA3 = ROL_1X32( XA3 ); \ + TB = ROL32( TB, 13 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = XOR( XA1, TA ); \ + TC = ROL32( TC, 13 ); \ + XC3 = ROL_1X32( XC3 ); \ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + TA = ROL32( TA, 18); \ + TC = ADD32( XC1, XC2 ); \ + XA2 = SWAP_64( XA2 ); \ + TB = ROL32( TB, 18); \ + XA0 = XOR( XA0, TA ); \ + XB2 = SWAP_64( XB2 ); \ + TC = ROL32( TC, 18); \ + XB0 = XOR( XB0, TB ); \ + XC2 = SWAP_64( XC2 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC0 = XOR( XC0, TC ); \ + XC1 = ROR_1X32( XC1 ); \ +} while (0); + + +// slow rol, an attempt to 
optimze non-avx512 bit rotations +#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROL \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE TC = ADD32( XC0, XC3 ); \ + TYPE T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ + XA1 = XOR( XA1, T ); \ + T = _mm_slli_epi32( TB, 7 );\ + XA1 = XOR( XA1, TA ); \ + TB = _mm_srli_epi32( TB, 25 ); \ + XB1 = XOR( XB1, T ); \ + T = _mm_slli_epi32( TC, 7 );\ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, T ); \ + TC = _mm_srli_epi32( TC, 25 );\ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + TC = ADD32( XC1, XC0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + T = _mm_slli_epi32( TB, 9 );\ + TB = _mm_srli_epi32( TB, 23 );\ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, T ); \ + T = _mm_slli_epi32( TC, 9 );\ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, T ); \ + TC = _mm_srli_epi32( TC, 23 );\ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + TC = ADD32( XC2, XC1 ); \ + T = _mm_slli_epi32( TA, 13); \ + TA = _mm_srli_epi32( TA, 19 ); \ + XA1 = ROL_1X32( XA1 ); \ + XA3 = XOR( XA3, T ); \ + XB1 = ROL_1X32( XB1 ); \ + T = _mm_slli_epi32( TB, 13); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XA3 = XOR( XA3, TA ); \ + XB3 = XOR( XB3, T ); \ + T = _mm_slli_epi32( TC, 13); \ + TC = _mm_srli_epi32( TC, 19 ); \ + XB3 = XOR( XB3, TB ); \ + XC3 = XOR( XC3, T ); \ + XC1 = ROL_1X32( XC1 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + TC = ADD32( XC3, XC2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ + XA2 = SWAP_64( XA2 ); \ + XA0 = XOR( XA0, T ); \ + T = _mm_slli_epi32( TB, 18 ); \ + XB2 = SWAP_64( XB2 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XB0 = XOR( XB0, T ); \ + T = _mm_slli_epi32( TC, 18 ); \ + XA0 = XOR( XA0, TA ); \ + TC = _mm_srli_epi32( TC, 14 ); \ + XC0 = XOR( XC0, T ); \ + XB0 = XOR( XB0, TB ); \ + XC2 = SWAP_64( XC2 ); \ + XC0 = XOR( XC0, TC ); \ +\ + TA = ADD32( XA0, XA1 ); \ + TB = ADD32( XB0, XB1 ); \ + TC = ADD32( XC0, XC1 ); \ + TA = ROL32( TA, 7 ); \ + XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, TA ); \ + TB = ROL32( TB, 7 ); \ + XB3 = ROR_1X32( XB3 ); \ + XB3 = XOR( XB3, TB ); \ + TC = ROL32( TC, 7 ); \ + XC3 = ROR_1X32( XC3 ); \ + XC3 = XOR( XC3, TC ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + TC = ADD32( XC3, XC0 ); \ + TA = ROL32( TA, 9 ); \ + TB = ROL32( TB, 9 ); \ + TC = ROL32( TC, 9 ); \ + XA2 = XOR( XA2, TA ); \ + XB2 = XOR( XB2, TB ); \ + XC2 = XOR( XC2, TC ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + TA = ROL32( TA, 13 ); \ + TC = ADD32( XC2, XC3 ); \ + XA3 = ROL_1X32( XA3 ); \ + TB = ROL32( TB, 13 ); \ + XB3 = ROL_1X32( XB3 ); \ + XA1 = XOR( XA1, TA ); \ + TC = ROL32( TC, 13 ); \ + XC3 = ROL_1X32( XC3 ); \ + XB1 = XOR( XB1, TB ); \ + XC1 = XOR( XC1, TC ); \ +\ + TA = ADD32( XA1, XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + TA = ROL32( TA, 18); \ + TC = ADD32( XC1, XC2 ); \ + XA2 = SWAP_64( XA2 ); \ + TB = ROL32( TB, 18); \ + XA0 = XOR( XA0, TA ); \ + XB2 = SWAP_64( XB2 ); \ + TC = ROL32( TC, 18); \ + XB0 = XOR( XB0, TB ); \ + XC2 = SWAP_64( XC2 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC0 = XOR( XC0, TC ); \ + XC1 = ROR_1X32( XC1 ); \ +} while (0); + + +/* +// Standard version using ARX +#define SALSA_2ROUNDS_SIMD128_3BUF \ + ARX_3BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, 7 ); \ + ARX_3BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, 9 ); \ + 
ARX_3BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, 13 ); \ + ARX_3BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + ARX_3BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, 7 ); \ + ARX_3BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, 9 ); \ + ARX_3BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, 13 ); \ + ARX_3BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, 18 ); \ + XA3 = ROL_1X32( XA3 ); \ + XB3 = ROL_1X32( XB3 ); \ + XC3 = ROL_1X32( XC3 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB1 = ROR_1X32( XB1 ); \ + XC1 = ROR_1X32( XC1 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); +*/ + +#define SALSA_2ROUNDS_FINAL_SIMD128_3BUF \ + ARX_3BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ + XC1, XC0, XC3, 7 ); \ + ARX_3BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ + XC2, XC1, XC0, 9 ); \ + ARX_3BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ + XC3, XC2, XC1, 13 ); \ + ARX_3BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ + XC0, XC3, XC2, 18 ); \ + XA1 = ROL_1X32( XA1 ); \ + XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = ROR_1X32( XA3 ); \ + XB3 = ROR_1X32( XB3 ); \ + XC3 = ROR_1X32( XC3 ); \ + XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ + ARX_3BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ + XC3, XC0, XC1, 7 ); \ + ARX_3BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ + XC2, XC3, XC0, 9 ); \ + ARX_3BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ + XC1, XC2, XC3, 13 ); \ + ARX_3BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ + XC0, XC1, XC2, 18 ); + + +#define SALSA_8ROUNDS_SIMD128 \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); + +#define SALSA_8ROUNDS_FINAL_SIMD128 \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \ + SALSA_2ROUNDS_FINAL_SIMD128( X0, X1, X2, X3 ); + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3 ) +#define SALSA_8ROUNDS_SIMD128_2BUF \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; + +#define SALSA_8ROUNDS_FINAL_SIMD128_2BUF \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_SIMD128_2BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_2BUF; + +#define SALSA_8ROUNDS_SIMD128_3BUF \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; + +#define SALSA_8ROUNDS_FINAL_SIMD128_3BUF \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_SIMD128_3BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_3BUF; + +// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, +// XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3, ) +#define SALSA_8ROUNDS_SIMD128_4BUF \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; + +#define SALSA_8ROUNDS_FINAL_SIMD128_4BUF \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_SIMD128_4BUF; \ + SALSA_2ROUNDS_FINAL_SIMD128_4BUF; + +// Used by reference code and pure parallel implementations +// +// Implied arguments ( x0, x1, x2, x3, x4, x5, x6, x7, +// x8, x9, xa, xb, xc, xd, xe, xf ) +// +#define SALSA_COLUMN \ + x4 
= ARX( x4, x0, xc, 7 ); \ + x9 = ARX( x9, x5, x1, 7 ); \ + xe = ARX( xe, xa, x6, 7 ); \ + x3 = ARX( x3, xf, xb, 7 ); \ + x8 = ARX( x8, x4, x0, 9 ); \ + xd = ARX( xd, x9, x5, 9 ); \ + x2 = ARX( x2, xe, xa, 9 ); \ + x7 = ARX( x7, x3, xf, 9 ); \ + xc = ARX( xc, x8, x4, 13 ); \ + x1 = ARX( x1, xd, x9, 13 ); \ + x6 = ARX( x6, x2, xe, 13 ); \ + xb = ARX( xb, x7, x3, 13 ); \ + x0 = ARX( x0, xc, x8, 18 ); \ + x5 = ARX( x5, x1, xd, 18 ); \ + xa = ARX( xa, x6, x2, 18 ); \ + xf = ARX( xf, xb, x7, 18 ) + +#define SALSA_ROW \ + x1 = ARX( x1, x0, x3, 7 ); \ + x6 = ARX( x6, x5, x4, 7 ); \ + xb = ARX( xb, xa, x9, 7 ); \ + xc = ARX( xc, xf, xe, 7 ); \ + x2 = ARX( x2, x1, x0, 9 ); \ + x7 = ARX( x7, x6, x5, 9 ); \ + x8 = ARX( x8, xb, xa, 9 ); \ + xd = ARX( xd, xc, xf, 9 ); \ + x3 = ARX( x3, x2, x1, 13 ); \ + x4 = ARX( x4, x7, x6, 13 ); \ + x9 = ARX( x9, x8, xb, 13 ); \ + xe = ARX( xe, xd, xc, 13 ); \ + x0 = ARX( x0, x3, x2, 18 ); \ + x5 = ARX( x5, x4, x7, 18 ); \ + xa = ARX( xa, x9, x8, 18 ); \ + xf = ARX( xf, xe, xd, 18 ); + +#define SALSA_2ROUNDS SALSA_COLUMN; SALSA_ROW; + +#define SALSA_8ROUNDS \ + SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// Tested OK but very slow +// 16 way parallel, requires 16x32 interleaving +static void xor_salsa8_16way( __m512i * const B, const __m512i * const C) +{ + __m512i x0 = B[ 0] = _mm512_xor_si512( B[ 0], C[ 0] ); + __m512i x1 = B[ 1] = _mm512_xor_si512( B[ 1], C[ 1] ); + __m512i x2 = B[ 2] = _mm512_xor_si512( B[ 2], C[ 2] ); + __m512i x3 = B[ 3] = _mm512_xor_si512( B[ 3], C[ 3] ); + __m512i x4 = B[ 4] = _mm512_xor_si512( B[ 4], C[ 4] ); + __m512i x5 = B[ 5] = _mm512_xor_si512( B[ 5], C[ 5] ); + __m512i x6 = B[ 6] = _mm512_xor_si512( B[ 6], C[ 6] ); + __m512i x7 = B[ 7] = _mm512_xor_si512( B[ 7], C[ 7] ); + __m512i x8 = B[ 8] = _mm512_xor_si512( B[ 8], C[ 8] ); + __m512i x9 = B[ 9] = _mm512_xor_si512( B[ 9], C[ 9] ); + __m512i xa = B[10] = _mm512_xor_si512( B[10], C[10] ); + __m512i xb = B[11] = _mm512_xor_si512( B[11], C[11] ); + __m512i xc = B[12] = _mm512_xor_si512( B[12], C[12] ); + __m512i xd = B[13] = _mm512_xor_si512( B[13], C[13] ); + __m512i xe = B[14] = _mm512_xor_si512( B[14], C[14] ); + __m512i xf = B[15] = _mm512_xor_si512( B[15], C[15] ); + + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] = _mm512_add_epi32( B[ 0], x0 ); + B[ 1] = _mm512_add_epi32( B[ 1], x1 ); + B[ 2] = _mm512_add_epi32( B[ 2], x2 ); + B[ 3] = _mm512_add_epi32( B[ 3], x3 ); + B[ 4] = _mm512_add_epi32( B[ 4], x4 ); + B[ 5] = _mm512_add_epi32( B[ 5], x5 ); + B[ 6] = _mm512_add_epi32( B[ 6], x6 ); + B[ 7] = _mm512_add_epi32( B[ 7], x7 ); + B[ 8] = _mm512_add_epi32( B[ 8], x8 ); + B[ 9] = _mm512_add_epi32( B[ 9], x9 ); + B[10] = _mm512_add_epi32( B[10], xa ); + B[11] = _mm512_add_epi32( B[11], xb ); + B[12] = _mm512_add_epi32( B[12], xc ); + B[13] = _mm512_add_epi32( B[13], xd ); + B[14] = _mm512_add_epi32( B[14], xe ); + B[15] = _mm512_add_epi32( B[15], xf ); +} + +void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 128*16 ); + xor_salsa8_16way( &X[ 0], &X[16] ); + xor_salsa8_16way( &X[16], &X[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + m512_ovly *vptr[16]; // pointer to V offset for each lane + m512_ovly *x16 = (m512_ovly*)(&X[16]); + + // create pointers to V for 
each lane using data from each lane of X[16] + // as index. + for ( int l = 0; l < 16; l++ ) + { + uint32_t xl = (*x16).u32[l]; + vptr[l] = (m512_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); + } + + for ( int k = 0; k < 32; k++ ) + { + m512_ovly v; // V value assembled from different indexes + for ( int l = 0; l < 8; l++ ) + v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; + X[ k ] = _mm512_xor_si512( X[ k ], v.m512 ); + } + + xor_salsa8_16way( &X[ 0], &X[16] ); + xor_salsa8_16way( &X[16], &X[ 0] ); + } +} + +// Working, not up to date, needs stream optimization. +// 4x32 interleaving +static void salsa8_simd128_4way( __m128i *b, const __m128i *c ) +{ + __m512i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + __m512i *B = (__m512i*)b; + const __m512i *C = (const __m512i*)c; + + // mix C into B then shuffle B into X + B[0] = _mm512_xor_si512( B[0], C[0] ); + B[1] = _mm512_xor_si512( B[1], C[1] ); + B[2] = _mm512_xor_si512( B[2], C[2] ); + B[3] = _mm512_xor_si512( B[3], C[3] ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[1], B[0] ); + X0 = _mm512_mask_blend_epi64( 0x30, B[3], B[2] ); + X0 = _mm512_mask_blend_epi64( 0x0f, X0, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[2], B[1] ); + X1 = _mm512_mask_blend_epi64( 0x30, B[0], B[3] ); + X1 = _mm512_mask_blend_epi64( 0x0f, X1, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[3], B[2] ); + X2 = _mm512_mask_blend_epi64( 0x30, B[1], B[0] ); + X2 = _mm512_mask_blend_epi64( 0x0f, X2, Y0 ); + + Y0 = _mm512_mask_blend_epi64( 0x03, B[0], B[3] ); + X3 = _mm512_mask_blend_epi64( 0x30, B[2], B[1] ); + X3 = _mm512_mask_blend_epi64( 0x0f, X3, Y0 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm512_shufll_128 + #define ROR_1X32 mm512_shuflr_128 + #define SWAP_64 mm512_swap_256 + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + Y0 = _mm512_mask_blend_epi64( 0xc0, X0, X1 ); + Y1 = _mm512_mask_blend_epi64( 0x03, X0, X1 ); + Y2 = _mm512_mask_blend_epi64( 0x0c, X0, X1 ); + Y3 = _mm512_mask_blend_epi64( 0x30, X0, X1 ); + + Y0 = _mm512_mask_blend_epi64( 0x30, Y0, X2 ); + Y1 = _mm512_mask_blend_epi64( 0xc0, Y1, X2 ); + Y2 = _mm512_mask_blend_epi64( 0x03, Y2, X2 ); + Y3 = _mm512_mask_blend_epi64( 0x0c, Y3, X2 ); + + Y0 = _mm512_mask_blend_epi64( 0x0c, Y0, X3 ); + Y1 = _mm512_mask_blend_epi64( 0x30, Y1, X3 ); + Y2 = _mm512_mask_blend_epi64( 0xc0, Y2, X3 ); + Y3 = _mm512_mask_blend_epi64( 0x03, Y3, X3 ); + + B[0] = _mm512_add_epi32( B[0], Y0 ); + B[1] = _mm512_add_epi32( B[1], Y1 ); + B[2] = _mm512_add_epi32( B[2], Y2 ); + B[3] = _mm512_add_epi32( B[3], Y3 ); +} + +// data format for 512 bits: 4 * ( 4 way 32 ) +// { l3d3, l2d3, l1d3, l0d3, l3d2, l2d2, l1d2, l0d2, +// l3d1, l2d1, l1d1, l0d1, l3d0, l2d0, l1d0, l0d0 } + +void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 4*128 ); + salsa8_simd128_4way( &X[ 0], &X[16] ); + salsa8_simd128_4way( &X[16], &X[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + uint32_t x16[4]; // index into V for each lane + memcpy( x16, &X[16], 16 ); + x16[0] = 32 * ( x16[0] & ( N-1) ); + x16[1] = 32 * ( x16[1] & ( N-1) ); + x16[2] = 32 * ( x16[2] & ( N-1) ); + x16[3] = 32 * ( x16[3] & ( N-1) ); + m128_ovly *v = (m128_ovly*)V; + + for( int k = 0; k < 32; k++ ) + { + X[k] = _mm_xor_si128( X[k], _mm_set_epi32( v[ x16[3] + k ].u32[3], + v[ x16[2] + k ].u32[2], + v[ x16[1] + k 
].u32[1], + v[ x16[0] + k ].u32[0] ) ); + } + + salsa8_simd128_4way( &X[ 0], &X[16] ); + salsa8_simd128_4way( &X[16], &X[ 0] ); + } +} + +// not working, occasional accepted shares, not up to date. +// 4x128 interleaving +static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) +{ + __m512i X0, X1, X2, X3; + uint32_t *b = (uint32_t*)B; + m512_ovly y[4], z[4]; + + // mix C into B then shuffle B into X + B[0] = _mm512_xor_si512( B[0], C[0] ); + B[1] = _mm512_xor_si512( B[1], C[1] ); + B[2] = _mm512_xor_si512( B[2], C[2] ); + B[3] = _mm512_xor_si512( B[3], C[3] ); + + // { l3u15, l3u10, l3u5, l3u0, l2u15, l2u10, l2u5, l2u0, + // l1u15, l1u10, l1u5, l1u0, l0u15, l0u10, l0u5, l0u0 } + + // b index = row index + lane index + unit index + // = ( 8 * (u/4) ) + ( 4*l ) + ( u%4 ) + + X0 = _mm512_set_epi32( b[63], b[46], b[29], b[12], // lane 3[3:0] + b[59], b[42], b[25], b[ 8], // lane 2[3:0] + b[55], b[38], b[21], b[ 4], // lane 1[3:0] + b[51], b[34], b[17], b[ 0] ); // lane 0[3:0] + + X1 = _mm512_set_epi32( b[15], b[62], b[45], b[28], + b[11], b[58], b[41], b[24], + b[ 7], b[54], b[37], b[20], + b[ 3], b[50], b[33], b[16] ); // lane 0[7:4] + + X2 = _mm512_set_epi32( b[31], b[14], b[61], b[44], + b[27], b[10], b[57], b[40], + b[23], b[ 6], b[53], b[36], + b[19], b[ 2], b[49], b[32] ); + + X3 = _mm512_set_epi32( b[47], b[30], b[13], b[60], + b[43], b[26], b[ 9], b[56], + b[39], b[22], b[ 5], b[52], + b[35], b[18], b[ 1], b[48] ); + + + + // define targets for macros used in round function template + #define ROL_1X32 mm512_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm512_shuflr128_32 + #define SWAP_64 mm512_swap128_64 + #define ROL32 _mm512_rol_epi32 + #define ADD32 _mm512_add_epi32 + #define XOR _mm512_xor_si512 + + SALSA_8ROUNDS_FINAL_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + y[0].m512 = X0; + y[1].m512 = X1; + y[2].m512 = X2; + y[3].m512 = X3; + + // lane 0 + z[0].u32[ 0 ] = y[0].u32[ 0]; + z[0].u32[ 3 ] = y[1].u32[ 0]; + z[0].u32[ 2 ] = y[2].u32[ 0]; + z[0].u32[ 1 ] = y[3].u32[ 0]; + + // lane 1 + z[0].u32[ 0+ 4 ] = y[0].u32[ 4]; + z[0].u32[ 3+ 4 ] = y[1].u32[ 4]; + z[0].u32[ 2+ 4 ] = y[2].u32[ 4]; + z[0].u32[ 1+ 4 ] = y[3].u32[ 4]; + + // lane 2 + z[0].u32[ 0+ 8 ] = y[0].u32[ 8]; + z[0].u32[ 3+ 8 ] = y[1].u32[ 8]; + z[0].u32[ 2+ 8 ] = y[2].u32[ 8]; + z[0].u32[ 1+ 8 ] = y[3].u32[ 8]; + + // lane 3 + z[0].u32[ 0+12 ] = y[0].u32[12]; + z[0].u32[ 3+12 ] = y[1].u32[12]; + z[0].u32[ 2+12 ] = y[2].u32[12]; + z[0].u32[ 1+12 ] = y[3].u32[12]; + + // lane 0 + z[1].u32[ 1 ] = y[0].u32[ 1]; + z[1].u32[ 0 ] = y[1].u32[ 1]; + z[1].u32[ 3 ] = y[2].u32[ 1]; + z[1].u32[ 2 ] = y[3].u32[ 1]; + + //lane 1 + z[1].u32[ 1+ 4 ] = y[0].u32[ 5]; + z[1].u32[ 0+ 4 ] = y[1].u32[ 5]; + z[1].u32[ 3+ 4 ] = y[2].u32[ 5]; + z[1].u32[ 2+ 4 ] = y[3].u32[ 5]; + + // lane 2 + z[1].u32[ 1+ 8 ] = y[0].u32[ 9]; + z[1].u32[ 0+ 8 ] = y[1].u32[ 9]; + z[1].u32[ 3+ 8 ] = y[2].u32[ 9]; + z[1].u32[ 2+ 8 ] = y[3].u32[ 9]; + + // lane 3 + z[1].u32[ 1+12 ] = y[0].u32[13]; + z[1].u32[ 0+12 ] = y[1].u32[13]; + z[1].u32[ 3+12 ] = y[2].u32[13]; + z[1].u32[ 2+12 ] = y[3].u32[13]; + + // lane 0 + z[2].u32[ 2 ] = y[0].u32[2]; + z[2].u32[ 1 ] = y[1].u32[2]; + z[2].u32[ 0 ] = y[2].u32[2]; + z[2].u32[ 3 ] = y[3].u32[2]; + + // lane 1 + z[2].u32[ 2+ 4 ] = y[0].u32[6]; + z[2].u32[ 1+ 4 ] = y[1].u32[6]; + z[2].u32[ 0+ 4 ] = y[2].u32[6]; + z[2].u32[ 3+ 4 ] = y[3].u32[6]; + + // lane 2 + z[2].u32[ 2+ 8 ] = y[0].u32[10]; + z[2].u32[ 1+ 8 ] = 
y[1].u32[10]; + z[2].u32[ 0+ 8 ] = y[2].u32[10]; + z[2].u32[ 3+ 8 ] = y[3].u32[10]; + + // lane 3 + z[2].u32[ 2+12 ] = y[0].u32[14]; + z[2].u32[ 1+12 ] = y[1].u32[14]; + z[2].u32[ 0+12 ] = y[2].u32[14]; + z[2].u32[ 3+12 ] = y[3].u32[14]; + + // lane 0 + z[3].u32[ 3 ] = y[0].u32[ 3]; + z[3].u32[ 2 ] = y[1].u32[ 3]; + z[3].u32[ 1 ] = y[2].u32[ 3]; + z[3].u32[ 0 ] = y[3].u32[ 3]; + + // lane 1 + z[3].u32[ 3+ 4 ] = y[0].u32[ 7]; + z[3].u32[ 2+ 4 ] = y[1].u32[ 7]; + z[3].u32[ 1+ 4 ] = y[2].u32[ 7]; + z[3].u32[ 0+ 4 ] = y[3].u32[ 7]; + + // lane 2 + z[3].u32[ 3+ 8 ] = y[0].u32[11]; + z[3].u32[ 2+ 8 ] = y[1].u32[11]; + z[3].u32[ 1+ 8 ] = y[2].u32[11]; + z[3].u32[ 0+ 8 ] = y[3].u32[11]; + + // lane 1 + z[3].u32[ 3+12 ] = y[0].u32[15]; + z[3].u32[ 2+12 ] = y[1].u32[15]; + z[3].u32[ 1+12 ] = y[2].u32[15]; + z[3].u32[ 0+12 ] = y[3].u32[15]; + + B[0] = _mm512_add_epi32( B[0], z[0].m512 ); + B[1] = _mm512_add_epi32( B[1], z[1].m512 ); + B[2] = _mm512_add_epi32( B[2], z[2].m512 ); + B[3] = _mm512_add_epi32( B[3], z[3].m512 ); +} + +void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 8], X, 128*4 ); + salsa8_4way_simd128( &X[0], &X[4] ); + salsa8_4way_simd128( &X[4], &X[0] ); + } + + for ( int i = 0; i < N; i++ ) + { + m512_ovly x16; + x16 = ( (m512_ovly*)X )[4]; + uint32_t j0 = 8 * ( x16.u32[ 0] & ( N-1 ) ); + uint32_t j1 = 8 * ( x16.u32[ 4] & ( N-1 ) ); + uint32_t j2 = 8 * ( x16.u32[ 8] & ( N-1 ) ); + uint32_t j3 = 8 * ( x16.u32[12] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + X[k] = _mm512_xor_si512( X[k], m512_const_128( + ( (m512_ovly*)V )[ j3+k ].m128[3], + ( (m512_ovly*)V )[ j2+k ].m128[2], + ( (m512_ovly*)V )[ j1+k ].m128[1], + ( (m512_ovly*)V )[ j0+k ].m128[0] ) ); + +/* + for ( int k = 0; k < 8; k++ ) + X[k] = _mm512_xor_si512( X[k], m512_diagonal128_32( + V[ j3+k ], V[ j2+k ], V[ j1+k ], V[ j0+k ] ) ); +*/ + salsa8_4way_simd128( &X[0], &X[4] ); + salsa8_4way_simd128( &X[4], &X[0] ); + } +} + + + +#endif // AVX512 + +#if defined(__AVX2__) + +// 8x memory usage +// Tested OK but slow scrypt, very slow scryptn2, 2x4way is faster +// Crashes with large N & many threads, OOM? 
Use only for scrypt +// 8x32 interleaving +static void salsa8_8way( __m256i * const B, const __m256i * const C ) +{ + __m256i x0 = B[ 0] = _mm256_xor_si256( B[ 0], C[ 0] ); + __m256i x1 = B[ 1] = _mm256_xor_si256( B[ 1], C[ 1] ); + __m256i x2 = B[ 2] = _mm256_xor_si256( B[ 2], C[ 2] ); + __m256i x3 = B[ 3] = _mm256_xor_si256( B[ 3], C[ 3] ); + __m256i x4 = B[ 4] = _mm256_xor_si256( B[ 4], C[ 4] ); + __m256i x5 = B[ 5] = _mm256_xor_si256( B[ 5], C[ 5] ); + __m256i x6 = B[ 6] = _mm256_xor_si256( B[ 6], C[ 6] ); + __m256i x7 = B[ 7] = _mm256_xor_si256( B[ 7], C[ 7] ); + __m256i x8 = B[ 8] = _mm256_xor_si256( B[ 8], C[ 8] ); + __m256i x9 = B[ 9] = _mm256_xor_si256( B[ 9], C[ 9] ); + __m256i xa = B[10] = _mm256_xor_si256( B[10], C[10] ); + __m256i xb = B[11] = _mm256_xor_si256( B[11], C[11] ); + __m256i xc = B[12] = _mm256_xor_si256( B[12], C[12] ); + __m256i xd = B[13] = _mm256_xor_si256( B[13], C[13] ); + __m256i xe = B[14] = _mm256_xor_si256( B[14], C[14] ); + __m256i xf = B[15] = _mm256_xor_si256( B[15], C[15] ); + + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] = _mm256_add_epi32( B[ 0], x0 ); + B[ 1] = _mm256_add_epi32( B[ 1], x1 ); + B[ 2] = _mm256_add_epi32( B[ 2], x2 ); + B[ 3] = _mm256_add_epi32( B[ 3], x3 ); + B[ 4] = _mm256_add_epi32( B[ 4], x4 ); + B[ 5] = _mm256_add_epi32( B[ 5], x5 ); + B[ 6] = _mm256_add_epi32( B[ 6], x6 ); + B[ 7] = _mm256_add_epi32( B[ 7], x7 ); + B[ 8] = _mm256_add_epi32( B[ 8], x8 ); + B[ 9] = _mm256_add_epi32( B[ 9], x9 ); + B[10] = _mm256_add_epi32( B[10], xa ); + B[11] = _mm256_add_epi32( B[11], xb ); + B[12] = _mm256_add_epi32( B[12], xc ); + B[13] = _mm256_add_epi32( B[13], xd ); + B[14] = _mm256_add_epi32( B[14], xe ); + B[15] = _mm256_add_epi32( B[15], xf ); +} + +void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 128*8 ); + salsa8_8way( &X[ 0], &X[16] ); + salsa8_8way( &X[16], &X[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + m256_ovly *vptr[8]; // pointer to V offset for each lane + m256_ovly *x16 = (m256_ovly*)(&X[16]); + + // create pointers to V for each lane using data from each lane of X[16] + // as index. + for ( int l = 0; l < 8; l++ ) + { + uint32_t xl = (*x16).u32[l]; + vptr[l] = (m256_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); + } + + for ( int k = 0; k < 32; k++ ) + { + m256_ovly v; // V value assembled from different indexes + for ( int l = 0; l < 8; l++ ) + v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; + X[ k ] = _mm256_xor_si256( X[ k ], v.m256 ); + } + + salsa8_8way( &X[ 0], &X[16] ); + salsa8_8way( &X[16], &X[ 0] ); + } +} + +// 2x memory usage +// Working, not up to date, needs stream optimization. 
+// Essentially Pooler 6way +// 2x128 interleaved simd128 +// ------- lane 1 ------- ------- lane 0 ------- +// { l1x3, l1x2, l1x1, l1x0, l0x3, l0x2, l0x1, l0x0 } b[3] B[ 7: 0] +// { l1x7, l1x6, l1x5, l1x4, l0x7, l0x6, l0x5, l0x4 } b[2] B[15: 8] +// { l1xb, l1xa, l1c9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16] +// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24] + +static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) +{ + __m256i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + + // mix C into B then shuffle B into X + B[0] = _mm256_xor_si256( B[0], C[0] ); + B[1] = _mm256_xor_si256( B[1], C[1] ); + B[2] = _mm256_xor_si256( B[2], C[2] ); + B[3] = _mm256_xor_si256( B[3], C[3] ); + + Y0 = _mm256_blend_epi32( B[1], B[0], 0x11 ); + X0 = _mm256_blend_epi32( B[3], B[2], 0x44 ); + X0 = _mm256_blend_epi32( X0, Y0, 0x33); + + Y1 = _mm256_blend_epi32( B[2], B[1], 0x11 ); + X1 = _mm256_blend_epi32( B[0], B[3], 0x44 ); + X1 = _mm256_blend_epi32( X1, Y1, 0x33 ); + + Y2 = _mm256_blend_epi32( B[3], B[2], 0x11 ); + X2 = _mm256_blend_epi32( B[1], B[0], 0x44 ); + X2 = _mm256_blend_epi32( X2, Y2, 0x33 ); + + Y3 = _mm256_blend_epi32( B[0], B[3], 0x11 ); + X3 = _mm256_blend_epi32( B[2], B[1], 0x44 ); + X3 = _mm256_blend_epi32( X3, Y3, 0x33 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + // init with X0 then blend in the other elements + + Y0 = _mm256_blend_epi32( X0, X1, 0x88 ); + Y1 = _mm256_blend_epi32( X0, X1, 0x11 ); + Y2 = _mm256_blend_epi32( X0, X1, 0x22 ); + Y3 = _mm256_blend_epi32( X0, X1, 0x44 ); + + Y0 = _mm256_blend_epi32( Y0, X2, 0x44 ); + Y1 = _mm256_blend_epi32( Y1, X2, 0x88 ); + Y2 = _mm256_blend_epi32( Y2, X2, 0x11 ); + Y3 = _mm256_blend_epi32( Y3, X2, 0x22 ); + + Y0 = _mm256_blend_epi32( Y0, X3, 0x22 ); + Y1 = _mm256_blend_epi32( Y1, X3, 0x44 ); + Y2 = _mm256_blend_epi32( Y2, X3, 0x88 ); + Y3 = _mm256_blend_epi32( Y3, X3, 0x11 ); + + B[0] = _mm256_add_epi32( B[0], Y0 ); + B[1] = _mm256_add_epi32( B[1], Y1 ); + B[2] = _mm256_add_epi32( B[2], Y2 ); + B[3] = _mm256_add_epi32( B[3], Y3 ); +} + +void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 8], X, 128*2 ); + salsa8_2way_simd128( &X[0], &X[4] ); + salsa8_2way_simd128( &X[4], &X[0] ); + } + + for ( int i = 0; i < N; i++ ) + { + m256_ovly x16; + x16 = ( (m256_ovly*)X )[4]; + uint32_t j0 = 8 * ( x16.u32[0] & ( N-1 ) ); + uint32_t j1 = 8 * ( x16.u32[4] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + X[k] = _mm256_xor_si256( X[k], _mm256_blend_epi32( V[ j1+k ], + V[ j0+k ], 0x0f ) ); + + salsa8_2way_simd128( &X[0], &X[4] ); + salsa8_2way_simd128( &X[4], &X[0] ); + } +} + +// Working +// 2x128 interleaving +static void salsa8_2way_simd128_2buf( __m256i * const BA, __m256i * const BB, + const __m256i * const CA, const __m256i * const CB ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], CB[1] ); + BA[2] = 
_mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x11 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x11 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x44 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x44 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x33); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x33); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x11 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x11 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x44 ); + XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x44 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x33 ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x11 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x11 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x44 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x44 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x33 ); + XB2 = _mm256_blend_epi32( XB2, YB0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x11 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x11 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x44 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x44 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x33 ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x33 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_2BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0x88 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0x88 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x11 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x11 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x22 ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x22 ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x44 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x44 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 0x44 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x44 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0x88 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0x88 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x11 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x11 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x22 ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x22 ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x22 ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x22 ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x44 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x44 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0x88 ); + YB2 = _mm256_blend_epi32( YB2, XB3, 0x88 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x11 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x11 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + +} + +void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ) +{ + __m256i *X0 = X; + __m256i *X1 = X + 8; + __m256i *V0 = V; + __m256i *V1 = V + 8*N; + + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + { + _mm256_stream_si256( V0 + i*8 + k, X0[k] ); + 
_mm256_stream_si256( V1 + i*8 + k, X1[k] ); + } + salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); + salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] ); + } + for ( int i = 0; i < N; i++ ) + { + const m256_ovly x16a = ( (m256_ovly*)X0 )[4]; + const m256_ovly x16b = ( (m256_ovly*)X1 )[4]; + + const uint32_t j0a = 8 * ( x16a.u32[0] & ( N-1 ) ); + const uint32_t j0b = 8 * ( x16b.u32[0] & ( N-1 ) ); + const uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) ); + const uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + { + const __m256i V0j0a = _mm256_stream_load_si256( V0 + j0a + k ); + const __m256i V0j1a = _mm256_stream_load_si256( V0 + j1a + k ); + const __m256i V1j0b = _mm256_stream_load_si256( V1 + j0b + k ); + const __m256i V1j1b = _mm256_stream_load_si256( V1 + j1b + k ); + X0[k] = _mm256_xor_si256( X0[k], + _mm256_blend_epi32( V0j1a, V0j0a, 0x0f ) ); + X1[k] = _mm256_xor_si256( X1[k], + _mm256_blend_epi32( V1j1b, V1j0b, 0x0f ) ); + + +/* + X0[k] = _mm256_xor_si256( X0[k], + _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); + X1[k] = _mm256_xor_si256( X1[k], + _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); +*/ + + } + + salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); + salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] ); + } +} + +// Triple buffered, not up to date, needs stream optimization +// 2x128 interleaving +static void salsa8_2way_simd128_3buf( __m256i * const BA, __m256i * const BB, + __m256i * const BC, const __m256i * const CA, const __m256i * const CB, + const __m256i * const CC ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, XC0, XC1, XC2, XC3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BC[0] = _mm256_xor_si256( BC[0], CC[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], CB[1] ); + BC[1] = _mm256_xor_si256( BC[1], CC[1] ); + BA[2] = _mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BC[2] = _mm256_xor_si256( BC[2], CC[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + BC[3] = _mm256_xor_si256( BC[3], CC[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x11 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x11 ); + YC0 = _mm256_blend_epi32( BC[1], BC[0], 0x11 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x44 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x44 ); + XC0 = _mm256_blend_epi32( BC[3], BC[2], 0x44 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x33); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x33); + XC0 = _mm256_blend_epi32( XC0, YC0, 0x33); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x11 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x11 ); + YC0 = _mm256_blend_epi32( BC[2], BC[1], 0x11 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x44 ); + XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x44 ); + XC1 = _mm256_blend_epi32( BC[0], BC[3], 0x44 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x33 ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x33 ); + XC1 = _mm256_blend_epi32( XC1, YC0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x11 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x11 ); + YC0 = _mm256_blend_epi32( BC[3], BC[2], 0x11 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x44 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x44 ); + XC2 = _mm256_blend_epi32( BC[1], BC[0], 0x44 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x33 ); + XB2 = 
_mm256_blend_epi32( XB2, YB0, 0x33 ); + XC2 = _mm256_blend_epi32( XC2, YC0, 0x33 ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x11 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x11 ); + YC0 = _mm256_blend_epi32( BC[0], BC[3], 0x11 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x44 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x44 ); + XC3 = _mm256_blend_epi32( BC[2], BC[1], 0x44 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x33 ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x33 ); + XC3 = _mm256_blend_epi32( XC3, YC0, 0x33 ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes + #define ROR_1X32 mm256_shuflr128_32 + #define SWAP_64 mm256_swap128_64 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_3BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0x88 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0x88 ); + YC0 = _mm256_blend_epi32( XC0, XC1, 0x88 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x11 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x11 ); + YC1 = _mm256_blend_epi32( XC0, XC1, 0x11 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x22 ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x22 ); + YC2 = _mm256_blend_epi32( XC0, XC1, 0x22 ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x44 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x44 ); + YC3 = _mm256_blend_epi32( XC0, XC1, 0x44 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 0x44 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x44 ); + YC0 = _mm256_blend_epi32( YC0, XC2, 0x44 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0x88 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0x88 ); + YC1 = _mm256_blend_epi32( YC1, XC2, 0x88 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x11 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x11 ); + YC2 = _mm256_blend_epi32( YC2, XC2, 0x11 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x22 ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x22 ); + YC3 = _mm256_blend_epi32( YC3, XC2, 0x22 ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x22 ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x22 ); + YC0 = _mm256_blend_epi32( YC0, XC3, 0x22 ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x44 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x44 ); + YC1 = _mm256_blend_epi32( YC1, XC3, 0x44 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0x88 ); + YB2 = _mm256_blend_epi32( YB2, XB3, 0x88 ); + YC2 = _mm256_blend_epi32( YC2, XC3, 0x88 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x11 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x11 ); + YC3 = _mm256_blend_epi32( YC3, XC3, 0x11 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BC[0] = _mm256_add_epi32( BC[0], YC0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BC[1] = _mm256_add_epi32( BC[1], YC1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + BC[2] = _mm256_add_epi32( BC[2], YC2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + BC[3] = _mm256_add_epi32( BC[3], YC3 ); + +} + +void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ) +{ + __m256i *X0 = X; + __m256i *X1 = X+8; + __m256i *X2 = X+16; + __m256i *V0 = V; + __m256i *V1 = V + 8*N; + __m256i *V2 = V + 16*N; + + for ( int i = 0; i < N; i++ ) + { + memcpy( &V0[i * 8], X0, 128*2 ); + memcpy( &V1[i * 8], X1, 128*2 ); + memcpy( &V2[i * 8], X2, 128*2 ); + salsa8_2way_simd128_3buf( 
&X0[0], &X1[0], &X2[0], + &X0[4], &X1[4], &X2[4] ); + salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4], + &X0[0], &X1[0], &X2[0] ); + } + for ( int i = 0; i < N; i++ ) + { + m256_ovly x16a, x16b, x16c; + x16a = ( (m256_ovly*)X0 )[4]; + x16b = ( (m256_ovly*)X1 )[4]; + x16c = ( (m256_ovly*)X2 )[4]; + + uint32_t j0a = 8 * ( x16a.u32[0] & ( N-1 ) ); + uint32_t j0b = 8 * ( x16b.u32[0] & ( N-1 ) ); + uint32_t j0c = 8 * ( x16c.u32[0] & ( N-1 ) ); + uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) ); + uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); + uint32_t j1c = 8 * ( x16c.u32[4] & ( N-1 ) ); + + for ( int k = 0; k < 8; k++ ) + { + X0[k] = _mm256_xor_si256( X0[k], + _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); + X1[k] = _mm256_xor_si256( X1[k], + _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); + X2[k] = _mm256_xor_si256( X2[k], + _mm256_blend_epi32( V2[ j1c+k ], V2[ j0c+k ], 0x0f ) ); + } + + salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0], + &X0[4], &X1[4], &X2[4] ); + salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4], + &X0[0], &X1[0], &X2[0] ); + } +} + + +// 2x memory usage + +// Tested OK, good speed +// +// Serial SIMD over 2 way parallel + +// Uses uint64_t as a poorman's vector then applying linear SIMD to the +// pairs of data. +// +// Interleaving is standard 2 way. +// Use 64 bit shuffles but 32 bit arithmetic. + +// B = { lane1, lane0 } +// b[i] = { B[4*i+3], B[4*i+2], B[4*i+1], B[4*i] } + +// 2x32 interleaving +static void salsa8_simd128_2way( uint64_t *b, const uint64_t *c ) +{ + __m256i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + __m256i *B = (__m256i*)b; + const __m256i *C = (const __m256i*)c; + + // mix C into B then shuffle B into X + B[0] = _mm256_xor_si256( B[0], C[0] ); + B[1] = _mm256_xor_si256( B[1], C[1] ); + B[2] = _mm256_xor_si256( B[2], C[2] ); + B[3] = _mm256_xor_si256( B[3], C[3] ); + + Y0 = _mm256_blend_epi32( B[1], B[0], 0x03 ); + X0 = _mm256_blend_epi32( B[3], B[2], 0x30 ); + X0 = _mm256_blend_epi32( X0, Y0, 0x0f); + + Y0 = _mm256_blend_epi32( B[2], B[1], 0x03 ); + X1 = _mm256_blend_epi32( B[0], B[3], 0x30 ); + X1 = _mm256_blend_epi32( X1, Y0, 0x0f ); + + Y0 = _mm256_blend_epi32( B[3], B[2], 0x03 ); + X2 = _mm256_blend_epi32( B[1], B[0], 0x30 ); + X2 = _mm256_blend_epi32( X2, Y0, 0x0f ); + + Y0 = _mm256_blend_epi32( B[0], B[3], 0x03 ); + X3 = _mm256_blend_epi32( B[2], B[1], 0x30 ); + X3 = _mm256_blend_epi32( X3, Y0, 0x0f ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + + SALSA_8ROUNDS_SIMD128; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + + Y0 = _mm256_blend_epi32( X0, X1, 0xc0 ); + Y1 = _mm256_blend_epi32( X0, X1, 0x03 ); + Y2 = _mm256_blend_epi32( X0, X1, 0x0c ); + Y3 = _mm256_blend_epi32( X0, X1, 0x30 ); + + Y0 = _mm256_blend_epi32( Y0, X2, 0x30 ); + Y1 = _mm256_blend_epi32( Y1, X2, 0xc0 ); + Y2 = _mm256_blend_epi32( Y2, X2, 0x03 ); + Y3 = _mm256_blend_epi32( Y3, X2, 0x0c ); + + Y0 = _mm256_blend_epi32( Y0, X3, 0x0c ); + Y1 = _mm256_blend_epi32( Y1, X3, 0x30 ); + Y2 = _mm256_blend_epi32( Y2, X3, 0xc0 ); + Y3 = _mm256_blend_epi32( Y3, X3, 0x03 ); + + B[0] = _mm256_add_epi32( B[0], Y0 ); + B[1] = _mm256_add_epi32( B[1], Y1 ); + B[2] = _mm256_add_epi32( B[2], Y2 ); + B[3] = _mm256_add_epi32( B[3], Y3 ); + +} + +// data format for 256 bits: 4 * ( 2 way 32 ) +// { l1d3, l0d3, l1d2, l0d2, l1d1, 
l0d1, l1d0, l0d0 } + +void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + _mm256_stream_si256( (__m256i*)V + i*8 + k, casti_m256i( X, k ) ); + salsa8_simd128_2way( &X[ 0], &X[16] ); + salsa8_simd128_2way( &X[16], &X[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + // need 2 J's + const uint32_t j0 = 32 * ( (uint32_t)( X[16] ) & ( N-1 ) ); + const uint32_t j1 = 32 * ( (uint32_t)( X[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + X[k] ^= ( ( V[ j1 + k ] & 0xffffffff00000000 ) + | ( V[ j0 + k ] & 0x00000000ffffffff ) ); + + salsa8_simd128_2way( &X[ 0], &X[16] ); + salsa8_simd128_2way( &X[16], &X[ 0] ); + } +} + +// Double buffered, 4x memory usage +// 2x32 interleaving +static void salsa8_simd128_2way_2buf( uint64_t *ba, uint64_t *bb, + const uint64_t *ca, const uint64_t *cb ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + __m256i *BA = (__m256i*)ba; + __m256i *BB = (__m256i*)bb; + const __m256i *CA = (const __m256i*)ca; + const __m256i *CB = (const __m256i*)cb; + + // mix C into B then shuffle B into X + BA[0] = _mm256_xor_si256( BA[0], CA[0] ); + BB[0] = _mm256_xor_si256( BB[0], CB[0] ); + BA[1] = _mm256_xor_si256( BA[1], CA[1] ); + BB[1] = _mm256_xor_si256( BB[1], CB[1] ); + BA[2] = _mm256_xor_si256( BA[2], CA[2] ); + BB[2] = _mm256_xor_si256( BB[2], CB[2] ); + BA[3] = _mm256_xor_si256( BA[3], CA[3] ); + BB[3] = _mm256_xor_si256( BB[3], CB[3] ); + + YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x03 ); + YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x03 ); + XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x30 ); + XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x30 ); + XA0 = _mm256_blend_epi32( XA0, YA0, 0x0f); + XB0 = _mm256_blend_epi32( XB0, YB0, 0x0f); + + YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x03 ); + YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x03 ); + XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x30 ); + XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x30 ); + XA1 = _mm256_blend_epi32( XA1, YA0, 0x0f ); + XB1 = _mm256_blend_epi32( XB1, YB0, 0x0f ); + + YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x03 ); + YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x03 ); + XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x30 ); + XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x30 ); + XA2 = _mm256_blend_epi32( XA2, YA0, 0x0f ); + XB2 = _mm256_blend_epi32( XB2, YB0, 0x0f ); + + YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x03 ); + YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x03 ); + XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x30 ); + XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x30 ); + XA3 = _mm256_blend_epi32( XA3, YA0, 0x0f ); + XB3 = _mm256_blend_epi32( XB3, YB0, 0x0f ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_SIMD128_2BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + YA0 = _mm256_blend_epi32( XA0, XA1, 0xc0 ); + YB0 = _mm256_blend_epi32( XB0, XB1, 0xc0 ); + YA1 = _mm256_blend_epi32( XA0, XA1, 0x03 ); + YB1 = _mm256_blend_epi32( XB0, XB1, 0x03 ); + YA2 = _mm256_blend_epi32( XA0, XA1, 0x0c ); + YB2 = _mm256_blend_epi32( XB0, XB1, 0x0c ); + YA3 = _mm256_blend_epi32( XA0, XA1, 0x30 ); + YB3 = _mm256_blend_epi32( XB0, XB1, 0x30 ); + + YA0 = _mm256_blend_epi32( YA0, XA2, 
0x30 ); + YB0 = _mm256_blend_epi32( YB0, XB2, 0x30 ); + YA1 = _mm256_blend_epi32( YA1, XA2, 0xc0 ); + YB1 = _mm256_blend_epi32( YB1, XB2, 0xc0 ); + YA2 = _mm256_blend_epi32( YA2, XA2, 0x03 ); + YB2 = _mm256_blend_epi32( YB2, XB2, 0x03 ); + YA3 = _mm256_blend_epi32( YA3, XA2, 0x0c ); + YB3 = _mm256_blend_epi32( YB3, XB2, 0x0c ); + + YA0 = _mm256_blend_epi32( YA0, XA3, 0x0c ); + YB0 = _mm256_blend_epi32( YB0, XB3, 0x0c ); + YA1 = _mm256_blend_epi32( YA1, XA3, 0x30 ); + YB1 = _mm256_blend_epi32( YB1, XB3, 0x30 ); + YA2 = _mm256_blend_epi32( YA2, XA3, 0xc0 ); + YB2 = _mm256_blend_epi32( YB2, XB3, 0xc0 ); + YA3 = _mm256_blend_epi32( YA3, XA3, 0x03 ); + YB3 = _mm256_blend_epi32( YB3, XB3, 0x03 ); + + BA[0] = _mm256_add_epi32( BA[0], YA0 ); + BB[0] = _mm256_add_epi32( BB[0], YB0 ); + BA[1] = _mm256_add_epi32( BA[1], YA1 ); + BB[1] = _mm256_add_epi32( BB[1], YB1 ); + BA[2] = _mm256_add_epi32( BA[2], YA2 ); + BB[2] = _mm256_add_epi32( BB[2], YB2 ); + BA[3] = _mm256_add_epi32( BA[3], YA3 ); + BB[3] = _mm256_add_epi32( BB[3], YB3 ); + +} + +void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ) + +{ + uint64_t *X0 = X; + uint64_t *X1 = X+32; + uint64_t *V0 = V; + uint64_t *V1 = V + 32*N; + + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + { + _mm256_stream_si256( (__m256i*)V0 + i*8 + k, casti_m256i( X0, k ) ); + _mm256_stream_si256( (__m256i*)V1 + i*8 + k, casti_m256i( X1, k ) ); + } + salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + // need 4 J's + const uint32_t j0l = 32 * ( (const uint32_t)( X0[16] ) & ( N-1 ) ); + const uint32_t j0h = 32 * ( (const uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); + const uint32_t j1l = 32 * ( (const uint32_t)( X1[16] ) & ( N-1 ) ); + const uint32_t j1h = 32 * ( (const uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) + | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); + X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) + | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + } + salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } +} + +// Working, deprecated, not up to date +// Triple buffered 2 way, 6x memory usage +// 2x32 interleaving +static void salsa8_simd128_2way_3buf( uint64_t *BA, uint64_t *BB, + uint64_t *BC, const uint64_t *CA, const uint64_t *CB, + const uint64_t *CC ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3; + __m256i *ba = (__m256i*)BA; + __m256i *bb = (__m256i*)BB; + __m256i *bc = (__m256i*)BC; + const __m256i *ca = (const __m256i*)CA; + const __m256i *cb = (const __m256i*)CB; + const __m256i *cc = (const __m256i*)CC; + m256_ovly ya[4], yb[4], yc[4], + za[4], zb[4], zc[4]; + + // mix C into B then shuffle B into X + ba[0] = _mm256_xor_si256( ba[0], ca[0] ); + bb[0] = _mm256_xor_si256( bb[0], cb[0] ); + bc[0] = _mm256_xor_si256( bc[0], cc[0] ); + ba[1] = _mm256_xor_si256( ba[1], ca[1] ); + bb[1] = _mm256_xor_si256( bb[1], cb[1] ); + bc[1] = _mm256_xor_si256( bc[1], cc[1] ); + ba[2] = _mm256_xor_si256( ba[2], ca[2] ); + bb[2] = _mm256_xor_si256( bb[2], cb[2] ); + bc[2] = _mm256_xor_si256( bc[2], cc[2] ); + ba[3] = _mm256_xor_si256( ba[3], ca[3] ); + bb[3] = _mm256_xor_si256( bb[3], cb[3] ); + bc[3] = _mm256_xor_si256( bc[3], cc[3] ); + + XA0 = _mm256_set_epi64x( BA[15], BA[10], BA[ 5], BA[ 0] ); + XB0 = 
_mm256_set_epi64x( BB[15], BB[10], BB[ 5], BB[ 0] ); + XC0 = _mm256_set_epi64x( BC[15], BC[10], BC[ 5], BC[ 0] ); + XA1 = _mm256_set_epi64x( BA[ 3], BA[14], BA[ 9], BA[ 4] ); + XB1 = _mm256_set_epi64x( BB[ 3], BB[14], BB[ 9], BB[ 4] ); + XC1 = _mm256_set_epi64x( BC[ 3], BC[14], BC[ 9], BC[ 4] ); + XA2 = _mm256_set_epi64x( BA[ 7], BA[ 2], BA[13], BA[ 8] ); + XB2 = _mm256_set_epi64x( BB[ 7], BB[ 2], BB[13], BB[ 8] ); + XC2 = _mm256_set_epi64x( BC[ 7], BC[ 2], BC[13], BC[ 8] ); + XA3 = _mm256_set_epi64x( BA[11], BA[ 6], BA[ 1], BA[12] ); + XB3 = _mm256_set_epi64x( BB[11], BB[ 6], BB[ 1], BB[12] ); + XC3 = _mm256_set_epi64x( BC[11], BC[ 6], BC[ 1], BC[12] ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_FINAL_SIMD128_3BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + ya[0].m256 = XA0; yb[0].m256 = XB0; + yc[0].m256 = XC0; + ya[1].m256 = XA1; yb[1].m256 = XB1; + yc[1].m256 = XC1; + ya[2].m256 = XA2; yb[2].m256 = XB2; + yc[2].m256 = XC2; + ya[3].m256 = XA3; yb[3].m256 = XB3; + yc[3].m256 = XC3; + + za[0].u64[0] = ya[0].u64[0]; + zb[0].u64[0] = yb[0].u64[0]; + zc[0].u64[0] = yc[0].u64[0]; + za[0].u64[3] = ya[1].u64[0]; + zb[0].u64[3] = yb[1].u64[0]; + zc[0].u64[3] = yc[1].u64[0]; + za[0].u64[2] = ya[2].u64[0]; + zb[0].u64[2] = yb[2].u64[0]; + zc[0].u64[2] = yc[2].u64[0]; + za[0].u64[1] = ya[3].u64[0]; + zb[0].u64[1] = yb[3].u64[0]; + zc[0].u64[1] = yc[3].u64[0]; + + za[1].u64[1] = ya[0].u64[1]; + zb[1].u64[1] = yb[0].u64[1]; + zc[1].u64[1] = yc[0].u64[1]; + za[1].u64[0] = ya[1].u64[1]; + zb[1].u64[0] = yb[1].u64[1]; + zc[1].u64[0] = yc[1].u64[1]; + za[1].u64[3] = ya[2].u64[1]; + zb[1].u64[3] = yb[2].u64[1]; + zc[1].u64[3] = yc[2].u64[1]; + za[1].u64[2] = ya[3].u64[1]; + zb[1].u64[2] = yb[3].u64[1]; + zc[1].u64[2] = yc[3].u64[1]; + + za[2].u64[2] = ya[0].u64[2]; + zb[2].u64[2] = yb[0].u64[2]; + zc[2].u64[2] = yc[0].u64[2]; + za[2].u64[1] = ya[1].u64[2]; + zb[2].u64[1] = yb[1].u64[2]; + zc[2].u64[1] = yc[1].u64[2]; + za[2].u64[0] = ya[2].u64[2]; + zb[2].u64[0] = yb[2].u64[2]; + zc[2].u64[0] = yc[2].u64[2]; + za[2].u64[3] = ya[3].u64[2]; + zb[2].u64[3] = yb[3].u64[2]; + zc[2].u64[3] = yc[3].u64[2]; + + za[3].u64[3] = ya[0].u64[3]; + zb[3].u64[3] = yb[0].u64[3]; + zc[3].u64[3] = yc[0].u64[3]; + za[3].u64[2] = ya[1].u64[3]; + zb[3].u64[2] = yb[1].u64[3]; + zc[3].u64[2] = yc[1].u64[3]; + za[3].u64[1] = ya[2].u64[3]; + zb[3].u64[1] = yb[2].u64[3]; + zc[3].u64[1] = yc[2].u64[3]; + za[3].u64[0] = ya[3].u64[3]; + zb[3].u64[0] = yb[3].u64[3]; + zc[3].u64[0] = yc[3].u64[3]; + + ba[0] = _mm256_add_epi32( ba[0], za[0].m256 ); + bb[0] = _mm256_add_epi32( bb[0], zb[0].m256 ); + bc[0] = _mm256_add_epi32( bc[0], zc[0].m256 ); + ba[1] = _mm256_add_epi32( ba[1], za[1].m256 ); + bb[1] = _mm256_add_epi32( bb[1], zb[1].m256 ); + bc[1] = _mm256_add_epi32( bc[1], zc[1].m256 ); + ba[2] = _mm256_add_epi32( ba[2], za[2].m256 ); + bb[2] = _mm256_add_epi32( bb[2], zb[2].m256 ); + bc[2] = _mm256_add_epi32( bc[2], zc[2].m256 ); + ba[3] = _mm256_add_epi32( ba[3], za[3].m256 ); + bb[3] = _mm256_add_epi32( bb[3], zb[3].m256 ); + bc[3] = _mm256_add_epi32( bc[3], zc[3].m256 ); +} + +void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, + const uint32_t N ) +{ + uint64_t *X0 = X; + uint64_t *X1 = X+32; + 
uint64_t *X2 = X+64; + uint64_t *V0 = V; + uint64_t *V1 = V + 32*N; + uint64_t *V2 = V + 64*N; + + for ( int i = 0; i < N; i++ ) + { + memcpy( &V0[i * 32], X0, 2*128 ); + memcpy( &V1[i * 32], X1, 2*128 ); + memcpy( &V2[i * 32], X2, 2*128 ); + salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); + uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); + uint32_t j1l = 32 * ( (uint32_t)( X1[16] ) & ( N-1 ) ); + uint32_t j1h = 32 * ( (uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); + uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); + uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) + | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); + X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) + | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) + | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); + } + salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } +} + +// Working, deprecated +// 8x memory usage +// 2x32 interleaving +static void salsa8_simd128_2way_4buf( uint64_t *BA, uint64_t *BB, + uint64_t *BC, uint64_t *BD, const uint64_t *CA, const uint64_t *CB, + const uint64_t *CC, const uint64_t *CD ) +{ + __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; + __m256i *ba = (__m256i*)BA; + __m256i *bb = (__m256i*)BB; + __m256i *bc = (__m256i*)BC; + __m256i *bd = (__m256i*)BD; + const __m256i *ca = (const __m256i*)CA; + const __m256i *cb = (const __m256i*)CB; + const __m256i *cc = (const __m256i*)CC; + const __m256i *cd = (const __m256i*)CD; + m256_ovly ya[4], yb[4], yc[4], yd[4], + za[4], zb[4], zc[4], zd[4]; + + // mix C into B then shuffle B into X + ba[0] = _mm256_xor_si256( ba[0], ca[0] ); + bb[0] = _mm256_xor_si256( bb[0], cb[0] ); + bc[0] = _mm256_xor_si256( bc[0], cc[0] ); + bd[0] = _mm256_xor_si256( bd[0], cd[0] ); + ba[1] = _mm256_xor_si256( ba[1], ca[1] ); + bb[1] = _mm256_xor_si256( bb[1], cb[1] ); + bc[1] = _mm256_xor_si256( bc[1], cc[1] ); + bd[1] = _mm256_xor_si256( bd[1], cd[1] ); + ba[2] = _mm256_xor_si256( ba[2], ca[2] ); + bb[2] = _mm256_xor_si256( bb[2], cb[2] ); + bc[2] = _mm256_xor_si256( bc[2], cc[2] ); + bd[2] = _mm256_xor_si256( bd[2], cd[2] ); + ba[3] = _mm256_xor_si256( ba[3], ca[3] ); + bb[3] = _mm256_xor_si256( bb[3], cb[3] ); + bc[3] = _mm256_xor_si256( bc[3], cc[3] ); + bd[3] = _mm256_xor_si256( bd[3], cd[3] ); + + XA0 = _mm256_set_epi64x( BA[15], BA[10], BA[ 5], BA[ 0] ); + XB0 = _mm256_set_epi64x( BB[15], BB[10], BB[ 5], BB[ 0] ); + XC0 = _mm256_set_epi64x( BC[15], BC[10], BC[ 5], BC[ 0] ); + XD0 = _mm256_set_epi64x( BD[15], BD[10], BD[ 5], BD[ 0] ); + XA1 = _mm256_set_epi64x( BA[ 3], BA[14], BA[ 9], BA[ 4] ); + XB1 = _mm256_set_epi64x( BB[ 3], BB[14], BB[ 9], BB[ 4] ); + XC1 = _mm256_set_epi64x( BC[ 3], BC[14], BC[ 9], BC[ 4] ); + XD1 = _mm256_set_epi64x( BD[ 3], BD[14], BD[ 9], BD[ 4] ); + XA2 = _mm256_set_epi64x( BA[ 7], BA[ 2], BA[13], BA[ 8] ); + XB2 = _mm256_set_epi64x( BB[ 7], BB[ 2], BB[13], BB[ 8] ); + XC2 = _mm256_set_epi64x( BC[ 7], BC[ 2], BC[13], BC[ 8] ); + XD2 = _mm256_set_epi64x( BD[ 7], BD[ 2], BD[13], BD[ 8] ); + XA3 = _mm256_set_epi64x( BA[11], 
BA[ 6], BA[ 1], BA[12] ); + XB3 = _mm256_set_epi64x( BB[11], BB[ 6], BB[ 1], BB[12] ); + XC3 = _mm256_set_epi64x( BC[11], BC[ 6], BC[ 1], BC[12] ); + XD3 = _mm256_set_epi64x( BD[11], BD[ 6], BD[ 1], BD[12] ); + + // define targets for macros used in round function template + #define ROL_1X32 mm256_shufll_64 + #define ROR_1X32 mm256_shuflr_64 + #define SWAP_64 mm256_swap_128 + #define ROL32 mm256_rol_32 + #define ADD32 _mm256_add_epi32 + #define XOR _mm256_xor_si256 + #define TYPE __m256i + + SALSA_8ROUNDS_FINAL_SIMD128_4BUF; + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE + + ya[0].m256 = XA0; yb[0].m256 = XB0; + yc[0].m256 = XC0; yd[0].m256 = XD0; + ya[1].m256 = XA1; yb[1].m256 = XB1; + yc[1].m256 = XC1; yd[1].m256 = XD1; + ya[2].m256 = XA2; yb[2].m256 = XB2; + yc[2].m256 = XC2; yd[2].m256 = XD2; + ya[3].m256 = XA3; yb[3].m256 = XB3; + yc[3].m256 = XC3; yd[3].m256 = XD3; + + za[0].u64[0] = ya[0].u64[0]; + zb[0].u64[0] = yb[0].u64[0]; + zc[0].u64[0] = yc[0].u64[0]; + zd[0].u64[0] = yd[0].u64[0]; + za[0].u64[3] = ya[1].u64[0]; + zb[0].u64[3] = yb[1].u64[0]; + zc[0].u64[3] = yc[1].u64[0]; + zd[0].u64[3] = yd[1].u64[0]; + za[0].u64[2] = ya[2].u64[0]; + zb[0].u64[2] = yb[2].u64[0]; + zc[0].u64[2] = yc[2].u64[0]; + zd[0].u64[2] = yd[2].u64[0]; + za[0].u64[1] = ya[3].u64[0]; + zb[0].u64[1] = yb[3].u64[0]; + zc[0].u64[1] = yc[3].u64[0]; + zd[0].u64[1] = yd[3].u64[0]; + + za[1].u64[1] = ya[0].u64[1]; + zb[1].u64[1] = yb[0].u64[1]; + zc[1].u64[1] = yc[0].u64[1]; + zd[1].u64[1] = yd[0].u64[1]; + za[1].u64[0] = ya[1].u64[1]; + zb[1].u64[0] = yb[1].u64[1]; + zc[1].u64[0] = yc[1].u64[1]; + zd[1].u64[0] = yd[1].u64[1]; + za[1].u64[3] = ya[2].u64[1]; + zb[1].u64[3] = yb[2].u64[1]; + zc[1].u64[3] = yc[2].u64[1]; + zd[1].u64[3] = yd[2].u64[1]; + za[1].u64[2] = ya[3].u64[1]; + zb[1].u64[2] = yb[3].u64[1]; + zc[1].u64[2] = yc[3].u64[1]; + zd[1].u64[2] = yd[3].u64[1]; + + za[2].u64[2] = ya[0].u64[2]; + zb[2].u64[2] = yb[0].u64[2]; + zc[2].u64[2] = yc[0].u64[2]; + zd[2].u64[2] = yd[0].u64[2]; + za[2].u64[1] = ya[1].u64[2]; + zb[2].u64[1] = yb[1].u64[2]; + zc[2].u64[1] = yc[1].u64[2]; + zd[2].u64[1] = yd[1].u64[2]; + za[2].u64[0] = ya[2].u64[2]; + zb[2].u64[0] = yb[2].u64[2]; + zc[2].u64[0] = yc[2].u64[2]; + zd[2].u64[0] = yd[2].u64[2]; + za[2].u64[3] = ya[3].u64[2]; + zb[2].u64[3] = yb[3].u64[2]; + zc[2].u64[3] = yc[3].u64[2]; + zd[2].u64[3] = yd[3].u64[2]; + + za[3].u64[3] = ya[0].u64[3]; + zb[3].u64[3] = yb[0].u64[3]; + zc[3].u64[3] = yc[0].u64[3]; + zd[3].u64[3] = yd[0].u64[3]; + za[3].u64[2] = ya[1].u64[3]; + zb[3].u64[2] = yb[1].u64[3]; + zc[3].u64[2] = yc[1].u64[3]; + zd[3].u64[2] = yd[1].u64[3]; + za[3].u64[1] = ya[2].u64[3]; + zb[3].u64[1] = yb[2].u64[3]; + zc[3].u64[1] = yc[2].u64[3]; + zd[3].u64[1] = yd[2].u64[3]; + za[3].u64[0] = ya[3].u64[3]; + zb[3].u64[0] = yb[3].u64[3]; + zc[3].u64[0] = yc[3].u64[3]; + zd[3].u64[0] = yd[3].u64[3]; + + ba[0] = _mm256_add_epi32( ba[0], za[0].m256 ); + bb[0] = _mm256_add_epi32( bb[0], zb[0].m256 ); + bc[0] = _mm256_add_epi32( bc[0], zc[0].m256 ); + bd[0] = _mm256_add_epi32( bd[0], zd[0].m256 ); + ba[1] = _mm256_add_epi32( ba[1], za[1].m256 ); + bb[1] = _mm256_add_epi32( bb[1], zb[1].m256 ); + bc[1] = _mm256_add_epi32( bc[1], zc[1].m256 ); + bd[1] = _mm256_add_epi32( bd[1], zd[1].m256 ); + ba[2] = _mm256_add_epi32( ba[2], za[2].m256 ); + bb[2] = _mm256_add_epi32( bb[2], zb[2].m256 ); + bc[2] = _mm256_add_epi32( bc[2], zc[2].m256 ); + bd[2] = _mm256_add_epi32( bd[2], zd[2].m256 ); + ba[3] = 
_mm256_add_epi32( ba[3], za[3].m256 ); + bb[3] = _mm256_add_epi32( bb[3], zb[3].m256 ); + bc[3] = _mm256_add_epi32( bc[3], zc[3].m256 ); + bd[3] = _mm256_add_epi32( bd[3], zd[3].m256 ); +} + +void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ) + +{ + uint64_t *X0 = X; + uint64_t *X1 = X+32; + uint64_t *X2 = X+64; + uint64_t *X3 = X+96; + uint64_t *V0 = V; + uint64_t *V1 = V + 32*N; + uint64_t *V2 = V + 64*N; + uint64_t *V3 = V + 96*N; + + for ( int i = 0; i < N; i++ ) + { + memcpy( &V0[i * 32], X0, 2*128 ); + memcpy( &V1[i * 32], X1, 2*128 ); + memcpy( &V2[i * 32], X2, 2*128 ); + memcpy( &V3[i * 32], X3, 2*128 ); + salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + // need 4 J's + uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); + uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); + uint32_t j1l = 32 * ( (uint32_t)( X1[16] ) & ( N-1 ) ); + uint32_t j1h = 32 * ( (uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); + uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); + uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); + uint32_t j3l = 32 * ( (uint32_t)( X3[16] ) & ( N-1 ) ); + uint32_t j3h = 32 * ( (uint32_t)( X3[16] >> 32 ) & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) + | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); + X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) + | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) + | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); + X3[k] ^= ( ( V3[ j3h + k ] & 0xffffffff00000000 ) + | ( V3[ j3l + k ] & 0x00000000ffffffff ) ); + } + salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } +} + + +#endif // AVX2 + +#if defined(__SSE2__) // required and assumed + +// Simple 4 way parallel. 
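+// Layout sketch (inferred from the u32[l] accesses in scrypt_core_4way
+// below): element l of each __m128i holds the current word of lane l, so
+//
+//    m128_ovly w = ( (m128_ovly*)X )[i];   // word i of all 4 lanes
+//    uint32_t lane2_word_i = w.u32[2];     // illustrative only
+//
+// and a single _mm_xor_si128 / _mm_add_epi32 advances 4 independent hashes.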
+// Tested OK
+// Scryptn2 a little slower than pooler
+// Scrypt 2x faster than pooler
+// 4x memory usage
+// 4x32 interleaving
+static void xor_salsa8_4way( __m128i * const B, const __m128i * const C )
+{
+   __m128i x0 = B[ 0] = _mm_xor_si128( B[ 0], C[ 0] );
+   __m128i x1 = B[ 1] = _mm_xor_si128( B[ 1], C[ 1] );
+   __m128i x2 = B[ 2] = _mm_xor_si128( B[ 2], C[ 2] );
+   __m128i x3 = B[ 3] = _mm_xor_si128( B[ 3], C[ 3] );
+   __m128i x4 = B[ 4] = _mm_xor_si128( B[ 4], C[ 4] );
+   __m128i x5 = B[ 5] = _mm_xor_si128( B[ 5], C[ 5] );
+   __m128i x6 = B[ 6] = _mm_xor_si128( B[ 6], C[ 6] );
+   __m128i x7 = B[ 7] = _mm_xor_si128( B[ 7], C[ 7] );
+   __m128i x8 = B[ 8] = _mm_xor_si128( B[ 8], C[ 8] );
+   __m128i x9 = B[ 9] = _mm_xor_si128( B[ 9], C[ 9] );
+   __m128i xa = B[10] = _mm_xor_si128( B[10], C[10] );
+   __m128i xb = B[11] = _mm_xor_si128( B[11], C[11] );
+   __m128i xc = B[12] = _mm_xor_si128( B[12], C[12] );
+   __m128i xd = B[13] = _mm_xor_si128( B[13], C[13] );
+   __m128i xe = B[14] = _mm_xor_si128( B[14], C[14] );
+   __m128i xf = B[15] = _mm_xor_si128( B[15], C[15] );
+
+   #define ROL32 mm128_rol_32
+   #define ADD32 _mm_add_epi32
+   #define XOR   _mm_xor_si128
+
+   SALSA_8ROUNDS;
+
+   #undef ROL32
+   #undef ADD32
+   #undef XOR
+
+   B[ 0] = _mm_add_epi32( B[ 0], x0 );
+   B[ 1] = _mm_add_epi32( B[ 1], x1 );
+   B[ 2] = _mm_add_epi32( B[ 2], x2 );
+   B[ 3] = _mm_add_epi32( B[ 3], x3 );
+   B[ 4] = _mm_add_epi32( B[ 4], x4 );
+   B[ 5] = _mm_add_epi32( B[ 5], x5 );
+   B[ 6] = _mm_add_epi32( B[ 6], x6 );
+   B[ 7] = _mm_add_epi32( B[ 7], x7 );
+   B[ 8] = _mm_add_epi32( B[ 8], x8 );
+   B[ 9] = _mm_add_epi32( B[ 9], x9 );
+   B[10] = _mm_add_epi32( B[10], xa );
+   B[11] = _mm_add_epi32( B[11], xb );
+   B[12] = _mm_add_epi32( B[12], xc );
+   B[13] = _mm_add_epi32( B[13], xd );
+   B[14] = _mm_add_epi32( B[14], xe );
+   B[15] = _mm_add_epi32( B[15], xf );
+}
+
+void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N )
+{
+   for ( int i = 0; i < N; i++ )
+   {
+      memcpy( &V[i * 32], X, 128*4 );
+      xor_salsa8_4way( &X[ 0], &X[16] );
+      xor_salsa8_4way( &X[16], &X[ 0] );
+   }
+   for ( int i = 0; i < N; i++ )
+   {
+      m128_ovly *vptr[4];
+      m128_ovly *x16 = (m128_ovly*)(&X[16]);
+
+      for ( int l = 0; l < 4; l++ )
+      {
+         uint32_t xl = (*x16).u32[l];
+         vptr[l] = (m128_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] );
+      }
+
+      for ( int k = 0; k < 32; k++ )
+      {
+         m128_ovly v;
+         for ( int l = 0; l < 4; l++ )
+            v.u32[l] = ( *(vptr[ l ] + k ) ).u32[l];
+         X[ k ] = _mm_xor_si128( X[ k ], v.m128 );
+      }
+
+      xor_salsa8_4way( &X[ 0], &X[16] );
+      xor_salsa8_4way( &X[16], &X[ 0] );
+   }
+}
+
+
+// Linear SIMD single thread. No memory increase but some shuffling overhead
+// required.
+
+// 4 way 32 bit interleaving within a single 32 bit thread: interleave while
+// loading, deinterleave while storing, with the 2 way 128 & 4 way 128
+// parallel variants layered on top.
+//
+// SALSA_2ROUNDS( {x0,x5,xa,xf}, {x4,x9,xe,x3}, {x8,xd,x2,x7}, {xc,x1,x6,xb})
+
+// Tested OK.
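+// For reference, the round macros evaluate the usual scalar Salsa20 quarter
+// rounds, four at a time (one per 32 bit lane). The first column quarter
+// round, as a sketch only, with ROTL32 standing in for the vector ROL32
+// defined below:
+//
+//    x4 ^= ROTL32( x0 + xc,  7 );
+//    x8 ^= ROTL32( x4 + x0,  9 );
+//    xc ^= ROTL32( x8 + x4, 13 );
+//    x0 ^= ROTL32( xc + x8, 18 );
+//
+// Each SALSA_2ROUNDS pass is one such column round followed by a row round
+// over the shuffled word groups listed above.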
+// No interleaving +static void salsa8_simd128( uint32_t *b, const uint32_t * const c) +{ + __m128i X0, X1, X2, X3; + __m128i *B = (__m128i*)b; + const __m128i *C = (const __m128i*)c; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + + // mix C into B then shuffle B into X + B[0] = _mm_xor_si128( B[0], C[0] ); + B[1] = _mm_xor_si128( B[1], C[1] ); + B[2] = _mm_xor_si128( B[2], C[2] ); + B[3] = _mm_xor_si128( B[3], C[3] ); + +#if defined(__SSE4_1__) + + __m128i Y0, Y1, Y2, Y3; + +#if defined(__AVX2__) + + Y0 = _mm_blend_epi32( B[1], B[0], 0x1 ); + X0 = _mm_blend_epi32( B[3], B[2], 0x4 ); + Y1 = _mm_blend_epi32( B[2], B[1], 0x1 ); + X1 = _mm_blend_epi32( B[0], B[3], 0x4 ); + Y2 = _mm_blend_epi32( B[3], B[2], 0x1 ); + X2 = _mm_blend_epi32( B[1], B[0], 0x4 ); + Y3 = _mm_blend_epi32( B[0], B[3], 0x1 ); + X3 = _mm_blend_epi32( B[2], B[1], 0x4 ); + X0 = _mm_blend_epi32( X0, Y0, 0x3); + X1 = _mm_blend_epi32( X1, Y1, 0x3 ); + X2 = _mm_blend_epi32( X2, Y2, 0x3 ); + X3 = _mm_blend_epi32( X3, Y3, 0x3 ); + +#else // SSE4_1 + + Y0 = _mm_blend_epi16( B[1], B[0], 0x03 ); + X0 = _mm_blend_epi16( B[3], B[2], 0x30 ); + Y1 = _mm_blend_epi16( B[2], B[1], 0x03 ); + X1 = _mm_blend_epi16( B[0], B[3], 0x30 ); + Y2 = _mm_blend_epi16( B[3], B[2], 0x03 ); + X2 = _mm_blend_epi16( B[1], B[0], 0x30 ); + Y3 = _mm_blend_epi16( B[0], B[3], 0x03 ); + X3 = _mm_blend_epi16( B[2], B[1], 0x30 ); + + X0 = _mm_blend_epi16( X0, Y0, 0x0f ); + X1 = _mm_blend_epi16( X1, Y1, 0x0f ); + X2 = _mm_blend_epi16( X2, Y2, 0x0f ); + X3 = _mm_blend_epi16( X3, Y3, 0x0f ); + +#endif // AVX2 else SSE4_1 + + SALSA_8ROUNDS_SIMD128; + +#if defined(__AVX2__) + + Y0 = _mm_blend_epi32( X0, X1, 0x8 ); + Y1 = _mm_blend_epi32( X0, X1, 0x1 ); + Y2 = _mm_blend_epi32( X0, X1, 0x2 ); + Y3 = _mm_blend_epi32( X0, X1, 0x4 ); + + Y0 = _mm_blend_epi32( Y0, X2, 0x4 ); + Y1 = _mm_blend_epi32( Y1, X2, 0x8 ); + Y2 = _mm_blend_epi32( Y2, X2, 0x1 ); + Y3 = _mm_blend_epi32( Y3, X2, 0x2 ); + + Y0 = _mm_blend_epi32( Y0, X3, 0x2 ); + Y1 = _mm_blend_epi32( Y1, X3, 0x4 ); + Y2 = _mm_blend_epi32( Y2, X3, 0x8 ); + Y3 = _mm_blend_epi32( Y3, X3, 0x1 ); + +#else // SSE4_1 + + Y0 = _mm_blend_epi16( X0, X1, 0xc0 ); + Y1 = _mm_blend_epi16( X0, X1, 0x03 ); + Y2 = _mm_blend_epi16( X0, X1, 0x0c ); + Y3 = _mm_blend_epi16( X0, X1, 0x30 ); + + Y0 = _mm_blend_epi16( Y0, X2, 0x30 ); + Y1 = _mm_blend_epi16( Y1, X2, 0xc0 ); + Y2 = _mm_blend_epi16( Y2, X2, 0x03 ); + Y3 = _mm_blend_epi16( Y3, X2, 0x0c ); + + Y0 = _mm_blend_epi16( Y0, X3, 0x0c ); + Y1 = _mm_blend_epi16( Y1, X3, 0x30 ); + Y2 = _mm_blend_epi16( Y2, X3, 0xc0 ); + Y3 = _mm_blend_epi16( Y3, X3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + B[0] = _mm_add_epi32( B[0], Y0 ); + B[1] = _mm_add_epi32( B[1], Y1 ); + B[2] = _mm_add_epi32( B[2], Y2 ); + B[3] = _mm_add_epi32( B[3], Y3 ); + +#else // SSE2 + + m128_ovly y[4], z[4]; + + X0 = _mm_set_epi32( b[15], b[10], b[ 5], b[ 0] ); + X1 = _mm_set_epi32( b[ 3], b[14], b[ 9], b[ 4] ); + X2 = _mm_set_epi32( b[ 7], b[ 2], b[13], b[ 8] ); + X3 = _mm_set_epi32( b[11], b[ 6], b[ 1], b[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128; + + // Final round doesn't shuffle data back to original input order, + // process it as is. 
+ // X0 is unchanged { xf, xa, x5, x0 } + // X1 is shuffled left 1 (rol_1x32) { xe, x9, x4, x3 } + // X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 } + // X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 } + + y[0].m128 = X0; + y[1].m128 = X1; + y[2].m128 = X2; + y[3].m128 = X3; + + z[0].u32[0] = y[0].u32[0]; + z[0].u32[3] = y[1].u32[0]; + z[0].u32[2] = y[2].u32[0]; + z[0].u32[1] = y[3].u32[0]; + + z[1].u32[1] = y[0].u32[1]; + z[1].u32[0] = y[1].u32[1]; + z[1].u32[3] = y[2].u32[1]; + z[1].u32[2] = y[3].u32[1]; + + z[2].u32[2] = y[0].u32[2]; + z[2].u32[1] = y[1].u32[2]; + z[2].u32[0] = y[2].u32[2]; + z[2].u32[3] = y[3].u32[2]; + + z[3].u32[3] = y[0].u32[3]; + z[3].u32[2] = y[1].u32[3]; + z[3].u32[1] = y[2].u32[3]; + z[3].u32[0] = y[3].u32[3]; + + B[0] = _mm_add_epi32( B[0], z[0].m128 ); + B[1] = _mm_add_epi32( B[1], z[1].m128 ); + B[2] = _mm_add_epi32( B[2], z[2].m128 ); + B[3] = _mm_add_epi32( B[3], z[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + +} + +void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + _mm_stream_si128( (__m128i*)V + i*8 + k, casti_m128i( X, k ) ); + + salsa8_simd128( &X[ 0], &X[16] ); + salsa8_simd128( &X[16], &X[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + const int j = 32 * ( X[16] & ( N - 1 ) ); + for ( int k = 0; k < 32; k++ ) + X[k] ^= V[j + k]; + salsa8_simd128( &X[ 0], &X[16] ); + salsa8_simd128( &X[16], &X[ 0] ); + } +} + +// Double buffered, 2x memory usage +// No interleaving +static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, + const uint32_t * const ca, const uint32_t * const cb ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + // mix C into B then shuffle B into X + BA[0] = _mm_xor_si128( BA[0], CA[0] ); + BB[0] = _mm_xor_si128( BB[0], CB[0] ); + BA[1] = _mm_xor_si128( BA[1], CA[1] ); + BB[1] = _mm_xor_si128( BB[1], CB[1] ); + BA[2] = _mm_xor_si128( BA[2], CA[2] ); + BB[2] = _mm_xor_si128( BB[2], CB[2] ); + BA[3] = _mm_xor_si128( BA[3], CA[3] ); + BB[3] = _mm_xor_si128( BB[3], CB[3] ); + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); + YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); + XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); + XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); + + YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); + YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); + XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); + XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); + + YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); + YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); + XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); + XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); + + YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); + YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); + XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); + XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); + + XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); + XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); + + XA1 = _mm_blend_epi32( XA1, YA1, 0x3 
); + XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + + XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); + XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + + XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); + XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); + YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); + XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); + XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); + + YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); + YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); + XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); + XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + + YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); + YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); + XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); + XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + + YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); + YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); + XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); + XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + + XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); + XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + + XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); + XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + + XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); + XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + + XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); + XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + +#endif // AVX2 else SSE4_1 + + SALSA_8ROUNDS_SIMD128_2BUF; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); + YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); + YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); + YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); + YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); + YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); + YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); + YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); + + YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); + YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); + YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); + YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); + YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); + YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); + YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); + YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); + YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); + YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); + YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); + YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); + YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); + YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); + YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); + + YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); + YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); + YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); + YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); + YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); + YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); + YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); + YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + BA[0] = 
_mm_add_epi32( BA[0], YA0 ); + BB[0] = _mm_add_epi32( BB[0], YB0 ); + BA[1] = _mm_add_epi32( BA[1], YA1 ); + BB[1] = _mm_add_epi32( BB[1], YB1 ); + BA[2] = _mm_add_epi32( BA[2], YA2 ); + BB[2] = _mm_add_epi32( BB[2], YB2 ); + BA[3] = _mm_add_epi32( BA[3], YA3 ); + BB[3] = _mm_add_epi32( BB[3], YB3 ); + +#else // SSE2 + + m128_ovly ya[4], za[4], yb[4], zb[4]; + + XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); + XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); + XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); + XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); + XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); + XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); + XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); + XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128_2BUF; + + // Final round doesn't shuffle data back to original input order, + // process it as is. + + ya[0].m128 = XA0; + yb[0].m128 = XB0; + ya[1].m128 = XA1; + yb[1].m128 = XB1; + ya[2].m128 = XA2; + yb[2].m128 = XB2; + ya[3].m128 = XA3; + yb[3].m128 = XB3; + + za[0].u32[0] = ya[0].u32[0]; + zb[0].u32[0] = yb[0].u32[0]; + za[0].u32[3] = ya[1].u32[0]; + zb[0].u32[3] = yb[1].u32[0]; + za[0].u32[2] = ya[2].u32[0]; + zb[0].u32[2] = yb[2].u32[0]; + za[0].u32[1] = ya[3].u32[0]; + zb[0].u32[1] = yb[3].u32[0]; + + za[1].u32[1] = ya[0].u32[1]; + zb[1].u32[1] = yb[0].u32[1]; + za[1].u32[0] = ya[1].u32[1]; + zb[1].u32[0] = yb[1].u32[1]; + za[1].u32[3] = ya[2].u32[1]; + zb[1].u32[3] = yb[2].u32[1]; + za[1].u32[2] = ya[3].u32[1]; + zb[1].u32[2] = yb[3].u32[1]; + + za[2].u32[2] = ya[0].u32[2]; + zb[2].u32[2] = yb[0].u32[2]; + za[2].u32[1] = ya[1].u32[2]; + zb[2].u32[1] = yb[1].u32[2]; + za[2].u32[0] = ya[2].u32[2]; + zb[2].u32[0] = yb[2].u32[2]; + za[2].u32[3] = ya[3].u32[2]; + zb[2].u32[3] = yb[3].u32[2]; + + za[3].u32[3] = ya[0].u32[3]; + zb[3].u32[3] = yb[0].u32[3]; + za[3].u32[2] = ya[1].u32[3]; + zb[3].u32[2] = yb[1].u32[3]; + za[3].u32[1] = ya[2].u32[3]; + zb[3].u32[1] = yb[2].u32[3]; + za[3].u32[0] = ya[3].u32[3]; + zb[3].u32[0] = yb[3].u32[3]; + + BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); + BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); + BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); + BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); + BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); + BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); + BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); + BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE +} + + +// X: 2 sequential buffers +// V: 2 sequential buffers interleaved by the size of N +// interleaved buffers { v00, v01, v10, v11, v20... 
} +// +void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + for ( int k = 0; k < 4; k++ ) + { + _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); + _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); + } + + #else + + memcpy( &V0[ i*32 ], X0, 128 ); + memcpy( &V1[ i*32 ], X1, 128 ); + + #endif + + salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N-1 ) ); + const int j1 = 4 * ( X1[16] & ( N-1 ) ); + for ( int k = 0; k < 4; k++ ) + { + const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); + const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); +// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); +// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); + casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); + casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); + } + + #else + + const int j0 = 8 * ( X0[16] & ( N-1 ) ); + const int j1 = 8 * ( X1[16] & ( N-1 ) ); + for ( int k = 0; k < 8; k++ ) + { + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); + casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); + casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + } + + #endif + +/* + const int j0 = 16 * ( X0[16] & ( N - 1 ) ); + const int j1 = 16 * ( X1[16] & ( N - 1 ) ); + + for ( int k = 0; k < 16; k++ ) + { + const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; + const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; + ( (uint64_t*)X0 )[k] ^= v0; + ( (uint64_t*)X1 )[k] ^= v1; + } +*/ + +/* + const int j0 = 32 * ( X0[16] & ( N-1 ) ); + const int j1 = 32 * ( X1[16] & ( N-1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + const uint32_t v0 = V0[ j0+k ]; + const uint32_t v1 = V1[ j1+k ]; + X0[k] ^= v0; + X1[k] ^= v1; + } +*/ + + salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); + salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + } +} + + +// Triple buffered, 3x memory usage +// No interleaving +static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, + const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + __m128i *BC = (__m128i*)bc; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + const __m128i *CC = (const __m128i*)cc; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + // mix C into B then shuffle B into X + BA[0] = _mm_xor_si128( BA[0], CA[0] ); + BB[0] = _mm_xor_si128( BB[0], CB[0] ); + BC[0] = _mm_xor_si128( BC[0], CC[0] ); + BA[1] = _mm_xor_si128( BA[1], CA[1] ); + BB[1] = _mm_xor_si128( BB[1], CB[1] ); + BC[1] = _mm_xor_si128( BC[1], CC[1] ); + BA[2] = _mm_xor_si128( BA[2], CA[2] ); + BB[2] = _mm_xor_si128( BB[2], CB[2] ); + BC[2] = _mm_xor_si128( BC[2], CC[2] ); + BA[3] = _mm_xor_si128( BA[3], 
CA[3] ); + BB[3] = _mm_xor_si128( BB[3], CB[3] ); + BC[3] = _mm_xor_si128( BC[3], CC[3] ); + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); + YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); + YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); + XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); + XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); + XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); + + YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); + YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); + YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); + XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); + XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); + XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); + + YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); + YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); + YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); + XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); + XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); + XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); + + YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); + YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); + YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); + XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); + XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); + XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); + + XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); + XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); + XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); + + XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); + XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); + + XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); + XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); + + XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); + XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); + YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); + YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); + XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); + XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); + XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); + XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); + XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); + + YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); + YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); + YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); + XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); + XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); + XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); + XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); + + YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); + YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); + YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); + XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); + XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); + XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); + XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); + + YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); + YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); + YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); + XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); + XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); + XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); + XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); + +#endif // AVX2 else SSE3_1 + + 
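+   // Run the 8 Salsa rounds with the 3 independent buffers interleaved at
+   // instruction level, presumably to hide latency by giving the CPU more
+   // independent work per iteration.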
SALSA_8ROUNDS_SIMD128_3BUF; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); + YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); + YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); + YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); + YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); + YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); + YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); + YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); + YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); + YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); + YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); + YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); + YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); + YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); + YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); + YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); + + YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); + YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); + YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); + YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); + YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); + YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); + YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); + YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); + YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); + YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); + YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); + YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); + YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); + YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); + YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); + YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); + YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); + YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); + YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); + YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); + YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); + YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); + YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); + YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); + YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); + YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); + + YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); + YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); + YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); + YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); + YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); + YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); + YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); + YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); + YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); + YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); + YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + BA[0] = _mm_add_epi32( BA[0], YA0 ); + BB[0] = _mm_add_epi32( BB[0], YB0 ); + BC[0] = _mm_add_epi32( BC[0], YC0 ); + BA[1] = _mm_add_epi32( BA[1], YA1 ); + BB[1] = _mm_add_epi32( BB[1], YB1 ); + BC[1] = _mm_add_epi32( BC[1], YC1 ); + BA[2] = _mm_add_epi32( BA[2], YA2 ); + BB[2] = _mm_add_epi32( BB[2], YB2 ); + BC[2] = _mm_add_epi32( BC[2], YC2 ); + BA[3] = _mm_add_epi32( 
BA[3], YA3 ); + BB[3] = _mm_add_epi32( BB[3], YB3 ); + BC[3] = _mm_add_epi32( BC[3], YC3 ); + +#else // SSE2 + + m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4]; + + XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); + XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); + XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); + XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); + XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); + XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); + XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); + XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); + XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); + XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); + XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); + XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128_3BUF; + + // Final round doesn't shuffle data back to original input order, + // process it as is. + + ya[0].m128 = XA0; + yb[0].m128 = XB0; + yc[0].m128 = XC0; + ya[1].m128 = XA1; + yb[1].m128 = XB1; + yc[1].m128 = XC1; + ya[2].m128 = XA2; + yb[2].m128 = XB2; + yc[2].m128 = XC2; + ya[3].m128 = XA3; + yb[3].m128 = XB3; + yc[3].m128 = XC3; + + za[0].u32[0] = ya[0].u32[0]; + zb[0].u32[0] = yb[0].u32[0]; + zc[0].u32[0] = yc[0].u32[0]; + za[0].u32[3] = ya[1].u32[0]; + zb[0].u32[3] = yb[1].u32[0]; + zc[0].u32[3] = yc[1].u32[0]; + za[0].u32[2] = ya[2].u32[0]; + zb[0].u32[2] = yb[2].u32[0]; + zc[0].u32[2] = yc[2].u32[0]; + za[0].u32[1] = ya[3].u32[0]; + zb[0].u32[1] = yb[3].u32[0]; + zc[0].u32[1] = yc[3].u32[0]; + + za[1].u32[1] = ya[0].u32[1]; + zb[1].u32[1] = yb[0].u32[1]; + zc[1].u32[1] = yc[0].u32[1]; + za[1].u32[0] = ya[1].u32[1]; + zb[1].u32[0] = yb[1].u32[1]; + zc[1].u32[0] = yc[1].u32[1]; + za[1].u32[3] = ya[2].u32[1]; + zb[1].u32[3] = yb[2].u32[1]; + zc[1].u32[3] = yc[2].u32[1]; + za[1].u32[2] = ya[3].u32[1]; + zb[1].u32[2] = yb[3].u32[1]; + zc[1].u32[2] = yc[3].u32[1]; + + za[2].u32[2] = ya[0].u32[2]; + zb[2].u32[2] = yb[0].u32[2]; + zc[2].u32[2] = yc[0].u32[2]; + za[2].u32[1] = ya[1].u32[2]; + zb[2].u32[1] = yb[1].u32[2]; + zc[2].u32[1] = yc[1].u32[2]; + za[2].u32[0] = ya[2].u32[2]; + zb[2].u32[0] = yb[2].u32[2]; + zc[2].u32[0] = yc[2].u32[2]; + za[2].u32[3] = ya[3].u32[2]; + zb[2].u32[3] = yb[3].u32[2]; + zc[2].u32[3] = yc[3].u32[2]; + + za[3].u32[3] = ya[0].u32[3]; + zb[3].u32[3] = yb[0].u32[3]; + zc[3].u32[3] = yc[0].u32[3]; + za[3].u32[2] = ya[1].u32[3]; + zb[3].u32[2] = yb[1].u32[3]; + zc[3].u32[2] = yc[1].u32[3]; + za[3].u32[1] = ya[2].u32[3]; + zb[3].u32[1] = yb[2].u32[3]; + zc[3].u32[1] = yc[2].u32[3]; + za[3].u32[0] = ya[3].u32[3]; + zb[3].u32[0] = yb[3].u32[3]; + zc[3].u32[0] = yc[3].u32[3]; + + BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); + BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); + BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); + BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); + BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); + BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); + BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); + BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); + BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); + BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); + BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE +} + +void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *X2 = X+64; + 
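+   // X is 3 consecutive 32-word (128 byte) states; V is carved into 3
+   // contiguous scratchpads of 32*N words, one per buffer.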
uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + uint32_t *V2 = V + 64*N; + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + for ( int k = 0; k < 4; k++ ) + { + _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); + _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); + _mm256_stream_si256( (__m256i*)V2 + i*4 + k, casti_m256i( X2, k ) ); + } + + #else + + memcpy( &V0[ i*32 ], X0, 128 ); + memcpy( &V1[ i*32 ], X1, 128 ); + memcpy( &V2[ i*32 ], X2, 128 ); + + #endif + + salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } + + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N - 1 ) ); + const int j1 = 4 * ( X1[16] & ( N - 1 ) ); + const int j2 = 4 * ( X2[16] & ( N - 1 ) ); + + for ( int k = 0; k < 4; k++ ) + { + const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); + const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); + const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); +// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); +// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); +// const __m256i v2 = _mm256_load_si256( ( (__m256i*)V2 ) +j2+k ); + casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); + casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); + casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); + } + + #else + + const int j0 = 8 * ( X0[16] & ( N - 1 ) ); + const int j1 = 8 * ( X1[16] & ( N - 1 ) ); + const int j2 = 8 * ( X2[16] & ( N - 1 ) ); + for ( int k = 0; k < 8; k++ ) + { + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); + const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); + casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); + casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); + } + + #endif + +/* + const int j0 = 16 * ( X0[16] & ( N - 1 ) ); + const int j1 = 16 * ( X1[16] & ( N - 1 ) ); + const int j2 = 16 * ( X2[16] & ( N - 1 ) ); + + for ( int k = 0; k < 16; k++ ) + { + const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; + const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; + const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; + ( (uint64_t*)X0 )[k] ^= v0; + ( (uint64_t*)X1 )[k] ^= v1; + ( (uint64_t*)X2 )[k] ^= v2; + } +*/ + +/* + const int j0 = 32 * ( X0[16] & ( N - 1 ) ); + const int j1 = 32 * ( X1[16] & ( N - 1 ) ); + const int j2 = 32 * ( X2[16] & ( N - 1 ) ); + + for ( int k = 0; k < 32; k++ ) + { + const uint32_t v0 = V0[ j0+k ]; + const uint32_t v1 = V1[ j1+k ]; + const uint32_t v2 = V2[ j2+k ]; + X0[k] ^= v0; + X1[k] ^= v1; + X2[k] ^= v2; + } +*/ + + salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], + &X0[16], &X1[16], &X2[16] ); + salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], + &X0[ 0], &X1[ 0], &X2[ 0] ); + } +} + +// Working. 
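+// The quadruple buffered pair below has the same structure as the double and
+// triple buffered variants above: four independent 128 byte lanes go through
+// the double salsa8 block mix per call, and scrypt_core_simd128_4buf splits V
+// into four private 32*N word scratchpads, hence the 4x memory usage.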
+// Quadruple buffered, 4x memory usage +// No interleaving +static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, + uint32_t *bd, const uint32_t *ca, const uint32_t *cb, + const uint32_t *cc, const uint32_t *cd ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + __m128i *BC = (__m128i*)bc; + __m128i *BD = (__m128i*)bd; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + const __m128i *CC = (const __m128i*)cc; + const __m128i *CD = (const __m128i*)cd; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + // mix C into B then shuffle B into X + BA[0] = _mm_xor_si128( BA[0], CA[0] ); + BB[0] = _mm_xor_si128( BB[0], CB[0] ); + BC[0] = _mm_xor_si128( BC[0], CC[0] ); + BD[0] = _mm_xor_si128( BD[0], CD[0] ); + BA[1] = _mm_xor_si128( BA[1], CA[1] ); + BB[1] = _mm_xor_si128( BB[1], CB[1] ); + BC[1] = _mm_xor_si128( BC[1], CC[1] ); + BD[1] = _mm_xor_si128( BD[1], CD[1] ); + BA[2] = _mm_xor_si128( BA[2], CA[2] ); + BB[2] = _mm_xor_si128( BB[2], CB[2] ); + BC[2] = _mm_xor_si128( BC[2], CC[2] ); + BD[2] = _mm_xor_si128( BD[2], CD[2] ); + BA[3] = _mm_xor_si128( BA[3], CA[3] ); + BB[3] = _mm_xor_si128( BB[3], CB[3] ); + BC[3] = _mm_xor_si128( BC[3], CC[3] ); + BD[3] = _mm_xor_si128( BD[3], CD[3] ); + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, + YC0, YC1, YC2, YC3, YD0, YD1, YD2, YD3; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); + YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); + YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); + YD0 = _mm_blend_epi32( BD[1], BD[0], 0x1 ); + XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); + XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); + XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); + XD0 = _mm_blend_epi32( BD[3], BD[2], 0x4 ); + + YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); + YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); + YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); + YD1 = _mm_blend_epi32( BD[2], BD[1], 0x1 ); + XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); + XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); + XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); + XD1 = _mm_blend_epi32( BD[0], BD[3], 0x4 ); + + YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); + YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); + YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); + YD2 = _mm_blend_epi32( BD[3], BD[2], 0x1 ); + XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); + XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); + XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); + XD2 = _mm_blend_epi32( BD[1], BD[0], 0x4 ); + + YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); + YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); + YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); + YD3 = _mm_blend_epi32( BD[0], BD[3], 0x1 ); + XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); + XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); + XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); + XD3 = _mm_blend_epi32( BD[2], BD[1], 0x4 ); + + XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); + XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); + XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); + XD0 = _mm_blend_epi32( XD0, YD0, 0x3 ); + + XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); + XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); + XD1 = _mm_blend_epi32( XD1, YD1, 
0x3 ); + + XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); + XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); + XD2 = _mm_blend_epi32( XD2, YD2, 0x3 ); + + XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); + XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); + XD3 = _mm_blend_epi32( XD3, YD3, 0x3 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); + YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); + YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); + YD0 = _mm_blend_epi16( BD[1], BD[0], 0x03 ); + XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); + XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); + XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); + XD0 = _mm_blend_epi16( BD[3], BD[2], 0x30 ); + XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); + XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); + XD0 = _mm_blend_epi16( XD0, YD0, 0x0f ); + + YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); + YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); + YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); + YD1 = _mm_blend_epi16( BD[2], BD[1], 0x03 ); + XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); + XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); + XD1 = _mm_blend_epi16( BD[0], BD[3], 0x30 ); + XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); + XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); + XD1 = _mm_blend_epi16( XD1, YD1, 0x0f ); + + YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); + YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); + YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); + YD2 = _mm_blend_epi16( BD[3], BD[2], 0x03 ); + XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); + XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); + XD2 = _mm_blend_epi16( BD[1], BD[0], 0x30 ); + XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); + XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); + XD2 = _mm_blend_epi16( XD2, YD2, 0x0f ); + + YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); + YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); + YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); + YD3 = _mm_blend_epi16( BD[0], BD[3], 0x03 ); + XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); + XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); + XD3 = _mm_blend_epi16( BD[2], BD[1], 0x30 ); + XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); + XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); + XD3 = _mm_blend_epi16( XD3, YD3, 0x0f ); + +#endif // AVX2 else SSE3_1 + + SALSA_8ROUNDS_SIMD128_4BUF; + +#if defined(__AVX2__) + + YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); + YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); + YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); + YD0 = _mm_blend_epi32( XD0, XD1, 0x8 ); + YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); + YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); + YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); + YD1 = _mm_blend_epi32( XD0, XD1, 0x1 ); + YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); + YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); + YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); + YD2 = _mm_blend_epi32( XD0, XD1, 0x2 ); + YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); + YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); + YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); + YD3 = _mm_blend_epi32( XD0, XD1, 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); + YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); + YD0 = _mm_blend_epi32( YD0, XD2, 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); + YB1 = 
_mm_blend_epi32( YB1, XB2, 0x8 ); + YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); + YD1 = _mm_blend_epi32( YD1, XD2, 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); + YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); + YD2 = _mm_blend_epi32( YD2, XD2, 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); + YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); + YD3 = _mm_blend_epi32( YD3, XD2, 0x2 ); + + YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); + YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); + YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); + YD0 = _mm_blend_epi32( YD0, XD3, 0x2 ); + YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); + YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); + YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); + YD1 = _mm_blend_epi32( YD1, XD3, 0x4 ); + YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); + YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); + YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); + YD2 = _mm_blend_epi32( YD2, XD3, 0x8 ); + YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); + YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); + YD3 = _mm_blend_epi32( YD3, XD3, 0x1 ); + +#else // SSE4_1 + + YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); + YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); + YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); + YD0 = _mm_blend_epi16( XD0, XD1, 0xc0 ); + YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); + YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); + YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); + YD1 = _mm_blend_epi16( XD0, XD1, 0x03 ); + YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); + YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); + YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); + YD2 = _mm_blend_epi16( XD0, XD1, 0x0c ); + YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); + YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); + YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); + YD3 = _mm_blend_epi16( XD0, XD1, 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); + YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); + YD0 = _mm_blend_epi16( YD0, XD2, 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); + YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); + YD1 = _mm_blend_epi16( YD1, XD2, 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); + YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); + YD2 = _mm_blend_epi16( YD2, XD2, 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); + YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); + YD3 = _mm_blend_epi16( YD3, XD2, 0x0c ); + + YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); + YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); + YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); + YD0 = _mm_blend_epi16( YD0, XD3, 0x0c ); + YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); + YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); + YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); + YD1 = _mm_blend_epi16( YD1, XD3, 0x30 ); + YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); + YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); + YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); + YD2 = _mm_blend_epi16( YD2, XD3, 0xc0 ); + YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); + YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); + YD3 = _mm_blend_epi16( YD3, XD3, 0x03 ); + +#endif // AVX2 else SSE4_1 + + BA[0] = _mm_add_epi32( BA[0], YA0 ); + BB[0] = _mm_add_epi32( BB[0], YB0 ); + BC[0] = _mm_add_epi32( BC[0], YC0 ); + BD[0] = _mm_add_epi32( BD[0], YD0 ); + BA[1] = _mm_add_epi32( BA[1], YA1 ); + BB[1] = _mm_add_epi32( BB[1], YB1 ); + BC[1] = _mm_add_epi32( BC[1], YC1 ); + BD[1] = _mm_add_epi32( BD[1], 
YD1 ); + BA[2] = _mm_add_epi32( BA[2], YA2 ); + BB[2] = _mm_add_epi32( BB[2], YB2 ); + BC[2] = _mm_add_epi32( BC[2], YC2 ); + BD[2] = _mm_add_epi32( BD[2], YD2 ); + BA[3] = _mm_add_epi32( BA[3], YA3 ); + BB[3] = _mm_add_epi32( BB[3], YB3 ); + BC[3] = _mm_add_epi32( BC[3], YC3 ); + BD[3] = _mm_add_epi32( BD[3], YD3 ); + +#else // SSE2 + + m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4], yd[4], zd[4]; + + XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); + XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); + XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); + XD0 = _mm_set_epi32( bd[15], bd[10], bd[ 5], bd[ 0] ); + XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); + XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); + XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); + XD1 = _mm_set_epi32( bd[ 3], bd[14], bd[ 9], bd[ 4] ); + XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); + XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); + XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); + XD2 = _mm_set_epi32( bd[ 7], bd[ 2], bd[13], bd[ 8] ); + XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); + XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); + XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); + XD3 = _mm_set_epi32( bd[11], bd[ 6], bd[ 1], bd[12] ); + + SALSA_8ROUNDS_FINAL_SIMD128_4BUF; + + ya[0].m128 = XA0; + yb[0].m128 = XB0; + yc[0].m128 = XC0; + yd[0].m128 = XD0; + ya[1].m128 = XA1; + yb[1].m128 = XB1; + yc[1].m128 = XC1; + yd[1].m128 = XD1; + ya[2].m128 = XA2; + yb[2].m128 = XB2; + yc[2].m128 = XC2; + yd[2].m128 = XD2; + ya[3].m128 = XA3; + yb[3].m128 = XB3; + yc[3].m128 = XC3; + yd[3].m128 = XD3; + + za[0].u32[0] = ya[0].u32[0]; + zb[0].u32[0] = yb[0].u32[0]; + zc[0].u32[0] = yc[0].u32[0]; + zd[0].u32[0] = yd[0].u32[0]; + za[0].u32[3] = ya[1].u32[0]; + zb[0].u32[3] = yb[1].u32[0]; + zc[0].u32[3] = yc[1].u32[0]; + zd[0].u32[3] = yd[1].u32[0]; + za[0].u32[2] = ya[2].u32[0]; + zb[0].u32[2] = yb[2].u32[0]; + zc[0].u32[2] = yc[2].u32[0]; + zd[0].u32[2] = yd[2].u32[0]; + za[0].u32[1] = ya[3].u32[0]; + zb[0].u32[1] = yb[3].u32[0]; + zc[0].u32[1] = yc[3].u32[0]; + zd[0].u32[1] = yd[3].u32[0]; + + za[1].u32[1] = ya[0].u32[1]; + zb[1].u32[1] = yb[0].u32[1]; + zc[1].u32[1] = yc[0].u32[1]; + zd[1].u32[1] = yd[0].u32[1]; + za[1].u32[0] = ya[1].u32[1]; + zb[1].u32[0] = yb[1].u32[1]; + zc[1].u32[0] = yc[1].u32[1]; + zd[1].u32[0] = yd[1].u32[1]; + za[1].u32[3] = ya[2].u32[1]; + zb[1].u32[3] = yb[2].u32[1]; + zc[1].u32[3] = yc[2].u32[1]; + zd[1].u32[3] = yd[2].u32[1]; + za[1].u32[2] = ya[3].u32[1]; + zb[1].u32[2] = yb[3].u32[1]; + zc[1].u32[2] = yc[3].u32[1]; + zd[1].u32[2] = yd[3].u32[1]; + + za[2].u32[2] = ya[0].u32[2]; + zb[2].u32[2] = yb[0].u32[2]; + zc[2].u32[2] = yc[0].u32[2]; + zd[2].u32[2] = yd[0].u32[2]; + za[2].u32[1] = ya[1].u32[2]; + zb[2].u32[1] = yb[1].u32[2]; + zc[2].u32[1] = yc[1].u32[2]; + zd[2].u32[1] = yd[1].u32[2]; + za[2].u32[0] = ya[2].u32[2]; + zb[2].u32[0] = yb[2].u32[2]; + zc[2].u32[0] = yc[2].u32[2]; + zd[2].u32[0] = yd[2].u32[2]; + za[2].u32[3] = ya[3].u32[2]; + zb[2].u32[3] = yb[3].u32[2]; + zc[2].u32[3] = yc[3].u32[2]; + zd[2].u32[3] = yd[3].u32[2]; + + za[3].u32[3] = ya[0].u32[3]; + zb[3].u32[3] = yb[0].u32[3]; + zc[3].u32[3] = yc[0].u32[3]; + zd[3].u32[3] = yd[0].u32[3]; + za[3].u32[2] = ya[1].u32[3]; + zb[3].u32[2] = yb[1].u32[3]; + zc[3].u32[2] = yc[1].u32[3]; + zd[3].u32[2] = yd[1].u32[3]; + za[3].u32[1] = ya[2].u32[3]; + zb[3].u32[1] = yb[2].u32[3]; + zc[3].u32[1] = yc[2].u32[3]; + zd[3].u32[1] = yd[2].u32[3]; + 
za[3].u32[0] = ya[3].u32[3]; + zb[3].u32[0] = yb[3].u32[3]; + zc[3].u32[0] = yc[3].u32[3]; + zd[3].u32[0] = yd[3].u32[3]; + + BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); + BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); + BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); + BD[0] = _mm_add_epi32( BD[0], zd[0].m128 ); + BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); + BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); + BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); + BD[1] = _mm_add_epi32( BD[1], zd[1].m128 ); + BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); + BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); + BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); + BD[2] = _mm_add_epi32( BD[2], zd[2].m128 ); + BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); + BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); + BD[3] = _mm_add_epi32( BD[3], zd[3].m128 ); + +#endif + + #undef ROL_1X32 + #undef ROR_1X32 + #undef SWAP_64 + #undef ROL32 + #undef ADD32 + #undef XOR + #undef TYPE +} + +void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *X2 = X+64; + uint32_t *X3 = X+96; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + uint32_t *V2 = V + 64*N; + uint32_t *V3 = V + 96*N; + + for ( int i = 0; i < N; i++ ) + { + for ( int k = 0; k < 8; k++ ) + { + _mm_stream_si128( (__m128i*)V0 + i*8 + k, casti_m128i( X0, k ) ); + _mm_stream_si128( (__m128i*)V1 + i*8 + k, casti_m128i( X1, k ) ); + _mm_stream_si128( (__m128i*)V2 + i*8 + k, casti_m128i( X2, k ) ); + _mm_stream_si128( (__m128i*)V3 + i*8 + k, casti_m128i( X3, k ) ); + } + + salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N - 1 ) ); + const int j1 = 4 * ( X1[16] & ( N - 1 ) ); + const int j2 = 4 * ( X2[16] & ( N - 1 ) ); + const int j3 = 4 * ( X3[16] & ( N - 1 ) ); + + for ( int k = 0; k < 4; k++ ) + { + const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); + const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); + const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); + const __m256i v3 = _mm256_stream_load_si256( ( (__m256i*)V3 ) +j3+k ); + casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); + casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); + casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); + casti_m256i( X3, k ) = _mm256_xor_si256( casti_m256i( X3, k ), v3 ); + } + + #else + + const int j0 = 8 * ( X0[16] & ( N - 1 ) ); + const int j1 = 8 * ( X1[16] & ( N - 1 ) ); + const int j2 = 8 * ( X2[16] & ( N - 1 ) ); + const int j3 = 8 * ( X3[16] & ( N - 1 ) ); + + for ( int k = 0; k < 8; k++ ) + { + #if defined(__SSE4_1__) + const __m128i v0 = _mm_stream_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_stream_load_si128( ( (__m128i*)V1 ) +j1+k ); + const __m128i v2 = _mm_stream_load_si128( ( (__m128i*)V2 ) +j2+k ); + const __m128i v3 = _mm_stream_load_si128( ( (__m128i*)V3 ) +j3+k ); + #else + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); + const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); + const __m128i v3 = _mm_load_si128( ( (__m128i*)V3 ) +j3+k ); + #endif + casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); + 
casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); + casti_m128i( X3, k ) = _mm_xor_si128( casti_m128i( X3, k ), v3 ); + } + + #endif + +/* + const int j0 = 16 * ( X0[16] & ( N - 1 ) ); + const int j1 = 16 * ( X1[16] & ( N - 1 ) ); + const int j2 = 16 * ( X2[16] & ( N - 1 ) ); + const int j3 = 16 * ( X3[16] & ( N - 1 ) ); + + for ( int k = 0; k < 16; k++ ) + { + const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; + const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; + const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; + const uint64_t v3 = ( (uint64_t*)V3 )[ j3+k ]; + ( (uint64_t*)X0 )[k] ^= v0; + ( (uint64_t*)X1 )[k] ^= v1; + ( (uint64_t*)X2 )[k] ^= v2; + ( (uint64_t*)X3 )[k] ^= v3; + } +*/ + + salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], + &X0[16], &X1[16], &X2[16], &X3[16] ); + salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], + &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + } +} + + +#endif // SSE2 + + +// Reference, used only for testing. +// Tested OK. + +static void xor_salsa8(uint32_t * const B, const uint32_t * const C) +{ + uint32_t x0 = (B[ 0] ^= C[ 0]), + x1 = (B[ 1] ^= C[ 1]), + x2 = (B[ 2] ^= C[ 2]), + x3 = (B[ 3] ^= C[ 3]); + uint32_t x4 = (B[ 4] ^= C[ 4]), + x5 = (B[ 5] ^= C[ 5]), + x6 = (B[ 6] ^= C[ 6]), + x7 = (B[ 7] ^= C[ 7]); + uint32_t x8 = (B[ 8] ^= C[ 8]), + x9 = (B[ 9] ^= C[ 9]), + xa = (B[10] ^= C[10]), + xb = (B[11] ^= C[11]); + uint32_t xc = (B[12] ^= C[12]), + xd = (B[13] ^= C[13]), + xe = (B[14] ^= C[14]), + xf = (B[15] ^= C[15]); + + + #define ROL32( a, c ) ror32( a, c ) + #define ADD32( a, b ) ( (a)+(b) ) + #define XOR( a, b ) ( (a)^(b) ) + + SALSA_8ROUNDS; + + #undef ROL32 + #undef ADD32 + #undef XOR + + B[ 0] += x0; + B[ 1] += x1; + B[ 2] += x2; + B[ 3] += x3; + B[ 4] += x4; + B[ 5] += x5; + B[ 6] += x6; + B[ 7] += x7; + B[ 8] += x8; + B[ 9] += x9; + B[10] += xa; + B[11] += xb; + B[12] += xc; + B[13] += xd; + B[14] += xe; + B[15] += xf; +} + +/** + * @param X input/ouput + * @param V scratch buffer + * @param N factor (def. 
1024) + */ + + +void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ) +{ + for ( int i = 0; i < N; i++ ) + { + memcpy( &V[i * 32], X, 128 ); + xor_salsa8( &X[ 0], &X[16] ); + xor_salsa8( &X[16], &X[ 0] ); + } + for ( int i = 0; i < N; i++ ) + { + int j = 32 * ( X[16] & ( N - 1 ) ); + for ( int k = 0; k < 32; k++ ) + X[k] ^= V[j + k]; + xor_salsa8( &X[ 0], &X[16] ); + xor_salsa8( &X[16], &X[ 0] ); + } +} + + + diff --git a/algo/scrypt/scrypt-core-4way.h b/algo/scrypt/scrypt-core-4way.h new file mode 100644 index 0000000..6567733 --- /dev/null +++ b/algo/scrypt/scrypt-core-4way.h @@ -0,0 +1,70 @@ +#ifndef SCRYPT_CORE_4WAY_H__ +#define SCRYPT_CORE_4WAY_H__ + +#include "simd-utils.h" +#include +#include + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ); + +// Serial SIMD over 4 way parallel +void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ); + +// 4 way parallel over serial SIMD +void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ); + +#endif + +#if defined(__AVX2__) + +void scrypt_core_8way( __m256i *X, __m256i *V, uint32_t N ); + +// 2 way parallel over SIMD128 +void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ); + +// Double buffered 2 way parallel over SIMD128 +void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ); + +// Triplee buffered 2 way parallel over SIMD128 +void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ); + +// Serial SIMD128 over 2 way parallel +void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Double buffered simd over parallel +void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Triple buffered 2 way +void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +// Quadruple buffered +void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ); + +#endif + +#if defined(__SSE2__) + +// Parallel 4 way, 4x memory +void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ); + +// Linear SIMD 1 way, 1x memory, lowest +void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Double buffered, 2x memory +void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Triple buffered +void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +// Quadruple buffered, 4x memory +void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ); + +#endif + +// For reference only +void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ); + +#endif + diff --git a/algo/scrypt/scrypt-core-ref.c b/algo/scrypt/scrypt-core-ref.c new file mode 100644 index 0000000..ec564ed --- /dev/null +++ b/algo/scrypt/scrypt-core-ref.c @@ -0,0 +1,206 @@ +#include "scrypt-core-ref.h" + +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + +static void xor_salsa8(uint32_t * const B, const uint32_t * const C) +{ + uint32_t x0 = (B[ 0] ^= C[ 0]), + x1 = (B[ 1] ^= C[ 1]), + x2 = (B[ 2] ^= C[ 2]), + x3 = (B[ 3] ^= C[ 3]); + uint32_t x4 = (B[ 4] ^= C[ 4]), + x5 = (B[ 5] ^= C[ 5]), + x6 = (B[ 6] ^= C[ 6]), + x7 = (B[ 7] ^= C[ 7]); + uint32_t x8 = (B[ 8] ^= C[ 8]), + x9 = (B[ 9] ^= C[ 9]), + xa = (B[10] ^= C[10]), + xb = (B[11] ^= C[11]); + uint32_t xc = (B[12] ^= C[12]), + xd = (B[13] ^= C[13]), + xe = (B[14] ^= C[14]), + xf = (B[15] ^= C[15]); + + /* Operate 
on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); + x9 ^= ROTL(x5 + x1, 7); + xe ^= ROTL(xa + x6, 7); + x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); + xd ^= ROTL(x9 + x5, 9); + x2 ^= ROTL(xe + xa, 9); + x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); + x1 ^= ROTL(xd + x9, 13); + x6 ^= ROTL(x2 + xe, 13); + xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); + x5 ^= ROTL(x1 + xd, 18); + xa ^= ROTL(x6 + x2, 18); + xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. 
*/ + x1 ^= ROTL(x0 + x3, 7); + x6 ^= ROTL(x5 + x4, 7); + xb ^= ROTL(xa + x9, 7); + xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); + x7 ^= ROTL(x6 + x5, 9); + x8 ^= ROTL(xb + xa, 9); + xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); + x4 ^= ROTL(x7 + x6, 13); + x9 ^= ROTL(x8 + xb, 13); + xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); + x5 ^= ROTL(x4 + x7, 18); + xa ^= ROTL(x9 + x8, 18); + xf ^= ROTL(xe + xd, 18); + + B[ 0] += x0; + B[ 1] += x1; + B[ 2] += x2; + B[ 3] += x3; + B[ 4] += x4; + B[ 5] += x5; + B[ 6] += x6; + B[ 7] += x7; + B[ 8] += x8; + B[ 9] += x9; + B[10] += xa; + B[11] += xb; + B[12] += xc; + B[13] += xd; + B[14] += xe; + B[15] += xf; +} + +/** + * @param X input/ouput + * @param V scratch buffer + * @param N factor (def. 1024) + */ +void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N) +{ + for (uint32_t i = 0; i < N; i++) { + memcpy(&V[i * 32], X, 128); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } + for (uint32_t i = 0; i < N; i++) { + uint32_t j = 32 * (X[16] & (N - 1)); + for (uint8_t k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } +} + diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index e35adbf..a15b5cb 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -32,6 +32,9 @@ #include #include #include +#include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sha256-hash.h" +#include static const uint32_t keypad[12] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 @@ -46,81 +49,103 @@ static const uint32_t finalblk[16] = { 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 }; -static __thread char *scratchbuf; -int scratchbuf_size = 0; +static const uint32_t sha256_initial_state[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static int scrypt_throughput = 0; + +static int scratchbuf_size = 0; + +static __thread char *scratchbuf = NULL; + +// change this to a constant to be used directly as input state arg +// vectors still need an init function. 
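+// The scalar and vector SHA256 transforms used below take the input state as
+// an explicit third argument instead of updating it in place:
+// sha256_transform_le/_be( state_out, data, state_in ) replaces the old
+// sha256_transform( state, data, swap ), with _le/_be selecting little or big
+// endian interpretation of the 64 byte data block (the old swap flag).
+// This allows a constant such as sha256_initial_state to be passed directly
+// as the input state, avoiding a separate init and copy.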
+static inline void sha256_init_state( uint32_t *state ) +{ + state[ 0 ] = 0x6A09E667; + state[ 1 ] = 0xBB67AE85; + state[ 2 ] = 0x3C6EF372; + state[ 3 ] = 0xA54FF53A; + state[ 4 ] = 0x510E527F; + state[ 5 ] = 0x9B05688C; + state[ 6 ] = 0x1F83D9AB; + state[ 7 ] = 0x5BE0CD19; +} static inline void HMAC_SHA256_80_init(const uint32_t *key, uint32_t *tstate, uint32_t *ostate) { - uint32_t ihash[8]; - uint32_t pad[16]; - int i; + uint32_t ihash[8]; + uint32_t pad[16]; + int i; - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 16, 16); - memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 0); - memcpy(ihash, tstate, 32); + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 16, 16); + memcpy(pad + 4, keypad, 48); - sha256_init(ostate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform(ostate, pad, 0); + sha256_transform_le( tstate, pad, tstate ); - sha256_init(tstate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - pad[i] = 0x36363636; - sha256_transform(tstate, pad, 0); + memcpy( ihash, tstate, 32 ); + + for ( i = 0; i < 8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_transform_le( ostate, pad, sha256_initial_state ); + + for ( i = 0; i < 8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 16; i++ ) pad[i] = 0x36363636; + + sha256_transform_le( tstate, pad, sha256_initial_state ); } static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t istate[8], ostate2[8]; - uint32_t ibuf[16], obuf[16]; - int i, j; + uint32_t istate[8], ostate2[8]; + uint32_t ibuf[16], obuf[16]; + int i, j; - memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 0); - - memcpy(ibuf, salt + 16, 16); - memcpy(ibuf + 5, innerpad, 44); - memcpy(obuf + 8, outerpad, 32); + sha256_transform_le( istate, salt, tstate ); - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 32); - ibuf[4] = i + 1; - sha256_transform(obuf, ibuf, 0); + memcpy(ibuf, salt + 16, 16); + memcpy(ibuf + 5, innerpad, 44); + memcpy(obuf + 8, outerpad, 32); - memcpy(ostate2, ostate, 32); - sha256_transform(ostate2, obuf, 0); - for (j = 0; j < 8; j++) - output[8 * i + j] = swab32(ostate2[j]); - } + for (i = 0; i < 4; i++) + { + memcpy(obuf, istate, 32); + ibuf[4] = i + 1; + + sha256_transform_le( obuf, ibuf, obuf ); + sha256_transform_le( ostate2, obuf, ostate ); + + for (j = 0; j < 8; j++) + output[8 * i + j] = bswap_32( ostate2[j] ); + } } static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t buf[16]; - int i; - - sha256_transform(tstate, salt, 1); - sha256_transform(tstate, salt + 16, 1); - sha256_transform(tstate, finalblk, 0); - memcpy(buf, tstate, 32); - memcpy(buf + 8, outerpad, 32); + uint32_t buf[16]; + int i; - sha256_transform(ostate, buf, 0); - for (i = 0; i < 8; i++) - output[i] = swab32(ostate[i]); + sha256_transform_be( tstate, salt, tstate ); + sha256_transform_be( tstate, salt+16, tstate ); + sha256_transform_le( tstate, finalblk, tstate ); + + memcpy(buf, tstate, 32); + memcpy(buf + 8, outerpad, 32); + + sha256_transform_le( ostate, buf, ostate ); + + for (i = 0; i < 8; i++) + output[i] = bswap_32( ostate[i] ); } - #ifdef HAVE_SHA256_4WAY static const uint32_t keypad_4way[4 * 12] = { @@ -160,6 +185,8 @@ static const uint32_t outerpad_4way[4 * 8] = { 0x00000000, 
0x00000000, 0x00000000, 0x00000000, 0x00000300, 0x00000300, 0x00000300, 0x00000300 }; + +/* static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x80000000, 0x80000000, 0x80000000, 0x80000000, @@ -178,37 +205,51 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000620, 0x00000620, 0x00000620, 0x00000620 }; +*/ -static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) +static inline void sha256_4way_init_state( void *state ) +{ + casti_m128i( state, 0 ) = _mm_set1_epi32( 0x6A09E667 ); + casti_m128i( state, 1 ) = _mm_set1_epi32( 0xBB67AE85 ); + casti_m128i( state, 2 ) = _mm_set1_epi32( 0x3C6EF372 ); + casti_m128i( state, 3 ) = _mm_set1_epi32( 0xA54FF53A ); + casti_m128i( state, 4 ) = _mm_set1_epi32( 0x510E527F ); + casti_m128i( state, 5 ) = _mm_set1_epi32( 0x9B05688C ); + casti_m128i( state, 6 ) = _mm_set1_epi32( 0x1F83D9AB ); + casti_m128i( state, 7 ) = _mm_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_4way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) { uint32_t _ALIGN(16) ihash[4 * 8]; uint32_t _ALIGN(16) pad[4 * 16]; int i; /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 4 * 16, 4 * 16); - memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 0); - memcpy(ihash, tstate, 4 * 32); + memcpy( pad, key + 4*16, 4*16 ); + memcpy( pad + 4*4, keypad_4way, 4*48 ); - sha256_init_4way(ostate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 4 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_4way(ostate, pad, 0); + sha256_4way_transform_le( (__m128i*)ihash, (__m128i*)pad, + (const __m128i*)tstate ); - sha256_init_4way(tstate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 4 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_4way(tstate, pad, 0); + sha256_4way_init_state( tstate ); + + for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)pad, + (const __m128i*)tstate ); + + for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 4*16; i++ ) pad[i] = 0x36363636; + + sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)pad, + (const __m128i*)tstate ); } -static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { uint32_t _ALIGN(16) istate[4 * 8]; uint32_t _ALIGN(16) ostate2[4 * 8]; @@ -216,43 +257,62 @@ static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, uint32_t _ALIGN(16) obuf[4 * 16]; int i, j; - memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 0); + sha256_4way_transform_le( (__m128i*)istate, (__m128i*)salt, + (const __m128i*)tstate ); memcpy(ibuf, salt + 4 * 16, 4 * 16); memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 4 * 32); + for ( i = 0; i < 4; i++ ) + { ibuf[4 * 4 + 0] = i + 1; ibuf[4 * 4 + 1] = i + 1; ibuf[4 * 4 + 2] = i + 1; ibuf[4 * 4 + 3] = i + 1; - sha256_transform_4way(obuf, ibuf, 0); - memcpy(ostate2, ostate, 4 * 32); - sha256_transform_4way(ostate2, obuf, 0); - for (j = 0; j < 4 * 8; j++) - output[4 * 
8 * i + j] = swab32(ostate2[j]); + sha256_4way_transform_le( (__m128i*)obuf, (__m128i*)ibuf, + (const __m128i*)istate ); + + sha256_4way_transform_le( (__m128i*)ostate2, (__m128i*)obuf, + (const __m128i*)ostate ); + + for ( j = 0; j < 4 * 8; j++ ) + output[4 * 8 * i + j] = bswap_32( ostate2[j] ); } } -static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { - uint32_t _ALIGN(16) buf[4 * 16]; + __m128i _ALIGN(64) final[ 8*16 ]; + uint32_t _ALIGN(64) buf[4 * 16]; int i; - sha256_transform_4way(tstate, salt, 1); - sha256_transform_4way(tstate, salt + 4 * 16, 1); - sha256_transform_4way(tstate, finalblk_4way, 0); - memcpy(buf, tstate, 4 * 32); + sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)salt, + (const __m128i*)tstate ); + sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)( salt + 4*16), + (const __m128i*)tstate ); + + final[ 0] = _mm_set1_epi32( 0x00000001 ); + final[ 1] = _mm_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm_setzero_si128(); + final[15] = _mm_set1_epi32 ( 0x00000620 ); + + sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)final, + (const __m128i*)tstate ); + + memcpy(buf, tstate, 4 * 32); memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - sha256_transform_4way(ostate, buf, 0); - for (i = 0; i < 4 * 8; i++) - output[i] = swab32(ostate[i]); + sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)buf, + (const __m128i*)ostate ); + + for ( i = 0; i < 4 * 8; i++ ) + output[i] = bswap_32( ostate[i] ); } #endif /* HAVE_SHA256_4WAY */ @@ -260,6 +320,7 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, #ifdef HAVE_SHA256_8WAY +/* static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = { 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, @@ -278,41 +339,52 @@ static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 }; +*/ -static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) +static inline void sha256_8way_init_state( void *state ) +{ + casti_m256i( state, 0 ) = _mm256_set1_epi32( 0x6A09E667 ); + casti_m256i( state, 1 ) = _mm256_set1_epi32( 0xBB67AE85 ); + casti_m256i( state, 2 ) = _mm256_set1_epi32( 0x3C6EF372 ); + casti_m256i( state, 3 ) = _mm256_set1_epi32( 0xA54FF53A ); + casti_m256i( state, 4 ) = _mm256_set1_epi32( 0x510E527F ); + casti_m256i( state, 5 ) = _mm256_set1_epi32( 0x9B05688C ); + casti_m256i( state, 6 ) = _mm256_set1_epi32( 0x1F83D9AB ); + casti_m256i( state, 7 ) = _mm256_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_8way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) { uint32_t _ALIGN(32) ihash[8 * 8]; uint32_t _ALIGN(32) pad[8 * 16]; int i; - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - pad[8 * 4 + i] = 0x80000000; - memset(pad + 8 * 5, 0x00, 8 * 40); - for (i = 0; i < 8; i++) - pad[8 * 15 + i] = 0x00000280; - 
sha256_transform_8way(tstate, pad, 0); - memcpy(ihash, tstate, 8 * 32); - - sha256_init_8way(ostate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 8 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_8way(ostate, pad, 0); - - sha256_init_8way(tstate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 8 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_8way(tstate, pad, 0); + memcpy( pad, key + 8*16, 8*16 ); + for ( i = 0; i < 8; i++ ) pad[ 8*4 + i ] = 0x80000000; + memset( pad + 8*5, 0x00, 8*40 ); + for ( i = 0; i < 8; i++ ) pad[ 8*15 + i ] = 0x00000280; + + sha256_8way_transform_le( (__m256i*)ihash, (__m256i*)pad, + (const __m256i*)tstate ); + + sha256_8way_init_state( tstate ); + + for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 8*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)pad, + (const __m256i*)tstate ); + + for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 8*16; i++ ) pad[i] = 0x36363636; + + sha256_8way_transform_le( (__m256i*)tstate, (__m256i*)pad, + (const __m256i*)tstate ); } -static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { uint32_t _ALIGN(32) istate[8 * 8]; uint32_t _ALIGN(32) ostate2[8 * 8]; @@ -320,24 +392,20 @@ static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, uint32_t _ALIGN(32) obuf[8 * 16]; int i, j; - memcpy(istate, tstate, 8 * 32); - sha256_transform_8way(istate, salt, 0); + sha256_8way_transform_le( (__m256i*)istate, (__m256i*)salt, + (const __m256i*)tstate ); + + memcpy( ibuf, salt + 8*16, 8*16 ); + for ( i = 0; i < 8; i++ ) ibuf[ 8*5 + i ] = 0x80000000; + memset( ibuf + 8*6, 0x00, 8*36 ); + for ( i = 0; i < 8; i++ ) ibuf[ 8*15 + i ] = 0x000004a0; - memcpy(ibuf, salt + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - ibuf[8 * 5 + i] = 0x80000000; - memset(ibuf + 8 * 6, 0x00, 8 * 36); - for (i = 0; i < 8; i++) - ibuf[8 * 15 + i] = 0x000004a0; + for ( i = 0; i < 8; i++ ) obuf[ 8*8 + i ] = 0x80000000; + memset( obuf + 8*9, 0x00, 8*24 ); + for ( i = 0; i < 8; i++ ) obuf[ 8*15 + i ] = 0x00000300; - for (i = 0; i < 8; i++) - obuf[8 * 8 + i] = 0x80000000; - memset(obuf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - obuf[8 * 15 + i] = 0x00000300; - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 8 * 32); + for ( i = 0; i < 4; i++ ) + { ibuf[8 * 4 + 0] = i + 1; ibuf[8 * 4 + 1] = i + 1; ibuf[8 * 4 + 2] = i + 1; @@ -346,48 +414,198 @@ static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, ibuf[8 * 4 + 5] = i + 1; ibuf[8 * 4 + 6] = i + 1; ibuf[8 * 4 + 7] = i + 1; - sha256_transform_8way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 8 * 32); - sha256_transform_8way(ostate2, obuf, 0); - for (j = 0; j < 8 * 8; j++) - output[8 * 8 * i + j] = swab32(ostate2[j]); + + sha256_8way_transform_le( (__m256i*)obuf, (__m256i*)ibuf, + (const __m256i*)istate ); + + sha256_8way_transform_le( (__m256i*)ostate2, (__m256i*)obuf, + (const __m256i*)ostate ); + + for ( j = 0; j < 8*8; j++ ) + output[ 8*8*i + j ] = bswap_32( ostate2[j] ); } } -static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { - 
uint32_t _ALIGN(32) buf[8 * 16]; + __m256i _ALIGN(128) final[ 8*16 ]; + uint32_t _ALIGN(128) buf[ 8*16 ]; int i; - sha256_transform_8way(tstate, salt, 1); - sha256_transform_8way(tstate, salt + 8 * 16, 1); - sha256_transform_8way(tstate, finalblk_8way, 0); - - memcpy(buf, tstate, 8 * 32); - for (i = 0; i < 8; i++) - buf[8 * 8 + i] = 0x80000000; - memset(buf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - buf[8 * 15 + i] = 0x00000300; - sha256_transform_8way(ostate, buf, 0); - + sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)salt, + (const __m256i*)tstate ); + sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16), + (const __m256i*)tstate ); + + final[ 0] = _mm256_set1_epi32( 0x00000001 ); + final[ 1] = _mm256_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm256_setzero_si256(); + final[15] = _mm256_set1_epi32 ( 0x00000620 ); + + sha256_8way_transform_le( (__m256i*)tstate, final, + (const __m256i*)tstate ); + + memcpy( buf, tstate, 8*32 ); + for ( i = 0; i < 8; i++ ) buf[ 8*8 + i ] = 0x80000000; + memset( buf + 8*9, 0x00, 8*24 ); + for ( i = 0; i < 8; i++ ) buf[ 8*15 + i ] = 0x00000300; + + sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)buf, + (const __m256i*)ostate ); + for (i = 0; i < 8 * 8; i++) - output[i] = swab32(ostate[i]); + output[i] = bswap_32(ostate[i]); } #endif /* HAVE_SHA256_8WAY */ +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +static inline void sha256_16way_init_state( void *state ) +{ + casti_m512i( state, 0 ) = _mm512_set1_epi32( 0x6A09E667 ); + casti_m512i( state, 1 ) = _mm512_set1_epi32( 0xBB67AE85 ); + casti_m512i( state, 2 ) = _mm512_set1_epi32( 0x3C6EF372 ); + casti_m512i( state, 3 ) = _mm512_set1_epi32( 0xA54FF53A ); + casti_m512i( state, 4 ) = _mm512_set1_epi32( 0x510E527F ); + casti_m512i( state, 5 ) = _mm512_set1_epi32( 0x9B05688C ); + casti_m512i( state, 6 ) = _mm512_set1_epi32( 0x1F83D9AB ); + casti_m512i( state, 7 ) = _mm512_set1_epi32( 0x5BE0CD19 ); +} + +static inline void HMAC_SHA256_80_init_16way( const uint32_t *key, + uint32_t *tstate, uint32_t *ostate ) +{ + uint32_t _ALIGN(128) pad[16*16]; + uint32_t _ALIGN(128) ihash[16* 8]; + int i; + + memcpy( pad, key + 16*16, 16*16 ); + for ( i = 0; i < 16; i++ ) pad[ 16*4 + i ] = 0x80000000; + memset( pad + 16*5, 0x00, 16*40 ); + for ( i = 0; i < 16; i++ ) pad[ 16*15 + i ] = 0x00000280; + + sha256_16way_transform_le( (__m512i*)ihash, (__m512i*)pad, + (const __m512i*)tstate ); + + sha256_16way_init_state( tstate ); + + for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; + for ( ; i < 16*16; i++ ) pad[i] = 0x5c5c5c5c; + + sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)pad, + (const __m512i*)tstate ); + + for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; + for ( ; i < 16*16; i++ ) pad[i] = 0x36363636; + + sha256_16way_transform_le( (__m512i*)tstate, (__m512i*)pad, + (const __m512i*)tstate ); +} + + +static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output ) +{ + uint32_t _ALIGN(128) ibuf[ 16*16 ]; + uint32_t _ALIGN(128) obuf[ 16*16 ]; + uint32_t _ALIGN(128) istate[ 16*8 ]; + uint32_t _ALIGN(128) ostate2[ 16*8 ]; + int i, j; + + sha256_16way_transform_le( (__m512i*)istate, (__m512i*)salt, + (const __m512i*)tstate ); + + memcpy( ibuf, salt + 16*16, 16*16 ); + for ( i = 0; i < 16; i++ 
) ibuf[ 16*5 + i ] = 0x80000000; + memset( ibuf + 16*6, 0x00, 16*36 ); + for ( i = 0; i < 16; i++ ) ibuf[ 16*15 + i ] = 0x000004a0; + + for ( i = 0; i < 16; i++ ) obuf[ 16*8 + i ] = 0x80000000; + memset( obuf + 16*9, 0x00, 16*24 ); + for ( i = 0; i < 16; i++ ) obuf[ 16*15 + i ] = 0x00000300; + + for ( i = 0; i < 4; i++ ) + { + ibuf[ 16*4 + 0 ] = i + 1; + ibuf[ 16*4 + 1 ] = i + 1; + ibuf[ 16*4 + 2 ] = i + 1; + ibuf[ 16*4 + 3 ] = i + 1; + ibuf[ 16*4 + 4 ] = i + 1; + ibuf[ 16*4 + 5 ] = i + 1; + ibuf[ 16*4 + 6 ] = i + 1; + ibuf[ 16*4 + 7 ] = i + 1; + ibuf[ 16*4 + 8 ] = i + 1; + ibuf[ 16*4 + 9 ] = i + 1; + ibuf[ 16*4 + 10 ] = i + 1; + ibuf[ 16*4 + 11 ] = i + 1; + ibuf[ 16*4 + 12 ] = i + 1; + ibuf[ 16*4 + 13 ] = i + 1; + ibuf[ 16*4 + 14 ] = i + 1; + ibuf[ 16*4 + 15 ] = i + 1; + + sha256_16way_transform_le( (__m512i*)obuf, (__m512i*)ibuf, + (const __m512i*)istate ); + + sha256_16way_transform_le( (__m512i*)ostate2, (__m512i*)obuf, + (const __m512i*)ostate ); + + for ( j = 0; j < 16*8; j++ ) + output[ 16*8*i + j ] = bswap_32( ostate2[j] ); + } +} + +static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output ) +{ + __m512i _ALIGN(128) final[ 16*16 ]; + uint32_t _ALIGN(128) buf[ 16*16 ]; + int i; + + sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)salt, + (const __m512i*)tstate ); + sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16), + (const __m512i*)tstate ); + + final[ 0] = _mm512_set1_epi32( 0x00000001 ); + final[ 1] = _mm512_set1_epi32( 0x80000000 ); + final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] + = final[ 7] = final[ 8] = final[ 9] = final[10] + = final[11] = final[12] = final[13] = final[14] + = _mm512_setzero_si512(); + final[15] = _mm512_set1_epi32 ( 0x00000620 ); + + sha256_16way_transform_le( (__m512i*)tstate, final, + (const __m512i*)tstate ); + + memcpy( buf, tstate, 16*32 ); + for ( i = 0; i < 16; i++ ) buf[ 16*8 + i ] = 0x80000000; + memset( buf + 16*9, 0x00, 16*24 ); + for ( i = 0; i < 16; i++ ) buf[ 16*15 + i ] = 0x00000300; + + sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)buf, + (const __m512i*)ostate ); + + for ( i = 0; i < 16*8; i++ ) + output[i] = bswap_32( ostate[i] ); +} + +#endif // AVX512 //#if defined(USE_ASM) && defined(__x86_64__) #define SCRYPT_MAX_WAYS 12 #define HAVE_SCRYPT_3WAY 1 -int scrypt_best_throughput(); +//int scrypt_best_throughput(); void scrypt_core(uint32_t *X, uint32_t *V, int N); void scrypt_core_3way(uint32_t *X, uint32_t *V, int N); -#if defined(USE_AVX2) + +//#if defined(USE_AVX2) +#if defined(__AVX2__) #undef SCRYPT_MAX_WAYS #define SCRYPT_MAX_WAYS 24 #define HAVE_SCRYPT_6WAY 1 @@ -396,261 +614,633 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N); #ifndef SCRYPT_MAX_WAYS #define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 +//#define scrypt_best_throughput() 1 #endif -unsigned char *scrypt_buffer_alloc(int N) -{ - return (uchar*) malloc((size_t)N * SCRYPT_MAX_WAYS * 128 + 63); -} +#include "scrypt-core-4way.h" -static bool scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, +static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id ) { uint32_t tstate[8], ostate[8]; uint32_t X[32]; - uint32_t *V; + uint32_t *V = (uint32_t*)scratchpad; - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - memcpy(tstate, midstate, 32); HMAC_SHA256_80_init(input, tstate, ostate); PBKDF2_SHA256_80_128(tstate, ostate, input, X); - 
scrypt_core(X, V, N); + scrypt_core_simd128( X, V, N ); // woring +// scrypt_core_1way( X, V, N ); // working +// scrypt_core(X, V, N); PBKDF2_SHA256_128_32(tstate, ostate, X, output); return true; } -#ifdef HAVE_SHA256_4WAY -static int scrypt_1024_1_1_256_4way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) +#if defined(__AVX2__) + +static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[4 * 8]; - uint32_t _ALIGN(128) ostate[4 * 8]; - uint32_t _ALIGN(128) W[4 * 32]; - uint32_t _ALIGN(128) X[4 * 32]; - uint32_t *V; - int i, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + uint32_t _ALIGN(128) tstate[8 * 8]; + uint32_t _ALIGN(128) ostate[8 * 8]; + uint32_t _ALIGN(128) W[8 * 32]; + uint32_t _ALIGN(128) X[8 * 32]; + uint32_t *V = (uint32_t*)scratchpad; - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = input[k * 20 + i]; + intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60, + input+80, input+100, input+120, input+140, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m256i( tstate, i ) = _mm256_set1_epi32( midstate[i] ); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[4 * i + k] = midstate[i]; + HMAC_SHA256_80_init_8way( W, tstate, ostate ); + PBKDF2_SHA256_80_128_8way( tstate, ostate, W, W ); - HMAC_SHA256_80_init_4way(W, tstate, ostate); + dintrlv_8x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, W, 1024 ); + + + // SCRYPT CORE + + + // AVX512 + +/* + // AVX512 16 way working + intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, 1024 ); + + scrypt_core_16way( (__m512i*)W , (__m512i*)V, N ); + + dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, W, 1024 ); +*/ +/* + // AVX512 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ +/* + // AVX512, not working, very slow + intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ + + // AVX2 + +/* + // AVX2 + // disable de/interleave for testing. 
+ scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); +*/ + +/* + // AVX2 working + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + + // working +// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + // working + scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); + + // working +// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); +*/ + +/* + // AVX2 + intrlv_2x32( W, X , X+ 32, 1024 ); + intrlv_2x32( W+64, X+ 64, X+ 96, 1024 ); + intrlv_2x32( W+128, X+128, X+160, 1024 ); + intrlv_2x32( W+192, X+192, X+224, 1024 ); + + // working, deprecated, not up to data +// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); + + // deprecated, not up to date +// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + // working +// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); + +// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+ 64 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + dintrlv_2x32( X, X+ 32, W, 1024 ); + dintrlv_2x32( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x32( X+128, X+160, W+128, 1024 ); + dintrlv_2x32( X+192, X+224, W+192, 1024 ); +*/ + + // SSE2 + +/* + // SSE2 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); + scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ + +/* + // SSE2 + scrypt_core_simd128( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+160, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+192, V, 
N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+224, V, N ); +*/ +/* + // SSE2 working + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); +*/ + + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + +/* + // SSE2 working + scrypt_core_simd128_4buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+128, V, N ); +*/ - PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); if ( work_restart[thrid].restart ) return 0; - - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[k * 32 + i] = W[4 * i + k]; + intrlv_8x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, 1024 ); + + PBKDF2_SHA256_128_32_8way( tstate, ostate, W, W ); + + dintrlv_8x32( output, output+ 8, output+16, output+24, + output+32, output+40, output+48, output+56, W, 256 ); + + return 1; +} + +#endif // AVX2 + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) +{ + uint32_t _ALIGN(128) tstate[ 16*8 ]; + uint32_t _ALIGN(128) ostate[ 16*8 ]; + uint32_t _ALIGN(128) W[ 16*32 ]; + uint32_t _ALIGN(128) X[ 16*32 ]; + uint32_t *V = (uint32_t*)scratchpad; + + intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60, + input+ 80, input+100, input+120, input+140, + input+160, input+180, input+200, input+220, + input+240, input+260, input+280, input+300, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m512i( tstate, i ) = _mm512_set1_epi32( midstate[i] ); + + HMAC_SHA256_80_init_16way( W, tstate, ostate ); + PBKDF2_SHA256_80_128_16way( tstate, ostate, W, W ); + + dintrlv_16x32( X, X+ 32, X+ 64, X+ 96, X+128, X+160, X+192, X+224, + X+256, X+288, X+320, X+352, X+384, X+416, X+448, X+480, + W, 1024 ); + + + // SCRYPT CORE + + + // AVX512 +/* + // AVX512 16 way working + intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, 1024 ); + + scrypt_core_16way( (__m512i*)W , (__m512i*)V, N ); + + dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, + X+256, X+256+32, X+256+64, X+256+96, X+256+128, + X+256+160, X+256+192, X+256+224, W, 1024 ); +*/ +/* + // AVX512 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ +/* + // AVX512, not working, very slow + intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 
1024 ); +*/ + + // AVX2 + +/* + // AVX2 + // disable de/interleave for testing. + scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); +*/ + +/* + // AVX2 working + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + + // working +// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + // working + scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); + + // working +// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); +*/ + +/* + // AVX2 + intrlv_2x32( W, X , X+ 32, 1024 ); + intrlv_2x32( W+64, X+ 64, X+ 96, 1024 ); + intrlv_2x32( W+128, X+128, X+160, 1024 ); + intrlv_2x32( W+192, X+192, X+224, 1024 ); + + // working, deprecated, not up to data +// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); + + // deprecated, not up to date +// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + // working +// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); + +// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+ 64 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); + + dintrlv_2x32( X, X+ 32, W, 1024 ); + dintrlv_2x32( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x32( X+128, X+160, W+128, 1024 ); + dintrlv_2x32( X+192, X+224, W+192, 1024 ); +*/ + + // SSE2 + +/* + // SSE2 working + intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); + scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); +*/ +/* + // SSE2 + scrypt_core_simd128( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+160, V, N ); + 
if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+224, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+288, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+320, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+384, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+416, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+448, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+480, V, N ); +*/ +/* + // SSE2 working + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+320, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+384, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); +*/ + + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); + +/* + // SSE2 working + scrypt_core_simd128_4buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+128, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4buf( X+384, V, N ); +*/ +/* + scrypt_core_3way( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_3way( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); +*/ + + + if ( work_restart[thrid].restart ) return 0; + + intrlv_16x32( W, X, X+ 32, X+ 64, X+ 96, X+128, X+160, X+192, X+224, + X+256, X+288, X+320, X+352, X+384, X+416, X+448, X+480, + 1024 ); + + PBKDF2_SHA256_128_32_16way( tstate, ostate, W, W ); + + dintrlv_16x32( output, output+ 8, output+ 16, output+ 24, + output+ 32, output+ 40, output+ 48, output+ 56, + output+ 64, output+ 72, output+ 80, output+ 88, + output+ 96, output+104, output+112, output+120, W, 256 ); + + return 1; +} + + +#endif // AVX512 + +#if defined(__SHA__) + +static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) +{ + uint32_t _ALIGN(128) 
tstate[4 * 8]; + uint32_t _ALIGN(128) ostate[4 * 8]; + uint32_t _ALIGN(128) W[4 * 32]; + uint32_t *V = (uint32_t*)scratchpad; + + memcpy( tstate, midstate, 32 ); + memcpy( tstate+ 8, midstate, 32 ); + memcpy( tstate+16, midstate, 32 ); + memcpy( tstate+24, midstate, 32 ); + + HMAC_SHA256_80_init( input, tstate, ostate ); + PBKDF2_SHA256_80_128( tstate, ostate, input, W ); + + HMAC_SHA256_80_init( input +20, tstate+ 8, ostate+ 8 ); + PBKDF2_SHA256_80_128( tstate+ 8, ostate+ 8, input +20, W+32 ); + + HMAC_SHA256_80_init( input +40, tstate+16, ostate+16 ); + PBKDF2_SHA256_80_128( tstate+16, ostate+16, input +40, W+64 ); + + HMAC_SHA256_80_init( input +60, tstate+24, ostate+24 ); + PBKDF2_SHA256_80_128( tstate+24, ostate+24, input +60, W+96 ); + +/* + // Working Linear single threaded SIMD + scrypt_core_simd128( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+96, V, N ); +*/ + + // working, double buffered linear simd + scrypt_core_simd128_2buf( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( W+64, V, N ); + +/* + scrypt_core_simd128_3buf( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( W+96, V, N ); +*/ + + // working +// scrypt_core_simd128_4buf( W, V, N ); + + if ( work_restart[thrid].restart ) return 0; + + PBKDF2_SHA256_128_32( tstate, ostate, W, output ); + + PBKDF2_SHA256_128_32( tstate+ 8, ostate+ 8, W+32, output+ 8 ); + + PBKDF2_SHA256_128_32( tstate+16, ostate+16, W+64, output+16 ); + + PBKDF2_SHA256_128_32( tstate+24, ostate+24, W+96, output+24 ); + + return 1; +} + +#else + +#ifdef HAVE_SHA256_4WAY +static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) +{ + uint32_t _ALIGN(128) tstate[4 * 8]; + uint32_t _ALIGN(128) ostate[4 * 8]; + uint32_t _ALIGN(128) W[4 * 32]; + uint32_t _ALIGN(128) X[4 * 32]; + uint32_t *V = (uint32_t*)scratchpad; + + intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); + for ( int i = 0; i < 8; i++ ) + casti_m128i( tstate, i ) = _mm_set1_epi32( midstate[i] ); + + HMAC_SHA256_80_init_4way(W, tstate, ostate); + PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); + + dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); + +////// SCRYPT_CORE + + + // working, simple 4 way parallel, best for scrypt +// scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); + +/* + // Working Linear single threaded SIMD + scrypt_core_simd128( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+32, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+64, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+96, V, N ); +*/ + + // working, double buffered linear simd, best for n2 + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + +/* + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128( X+96, V, N ); +*/ + + // working +// scrypt_core_simd128_4buf( X, V, N ); + + +/* + // original scrypt_core(X + 0 * 32, V, N); scrypt_core(X + 1 * 32, V, N); scrypt_core(X + 2 * 32, V, N); scrypt_core(X + 3 * 32, V, N); +*/ + +//////////////////////////////// if ( work_restart[thrid].restart ) return 0; - for (i = 0; i < 32; i++) - for (k = 
0; k < 4; k++) - W[4 * i + k] = X[k * 32 + i]; + intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[k * 8 + i] = W[4 * i + k]; + dintrlv_4x32( output, output+8, output+16, output+24, W, 256 ); return 1; } #endif /* HAVE_SHA256_4WAY */ -#ifdef HAVE_SCRYPT_3WAY - -static int scrypt_1024_1_1_256_3way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) -{ - uint32_t _ALIGN(64) tstate[3 * 8], ostate[3 * 8]; - uint32_t _ALIGN(64) X[3 * 32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate + 0, midstate, 32); - memcpy(tstate + 8, midstate, 32); - memcpy(tstate + 16, midstate, 32); - - HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); - HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); - - if ( work_restart[thrid].restart ) return 0; - - PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); - PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); - PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); - - if ( work_restart[thrid].restart ) return 0; - - scrypt_core_3way(X, V, N); - - if ( work_restart[thrid].restart ) return 0; - - PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); - PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); - PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); - - return 1; -} - -#ifdef HAVE_SHA256_4WAY -static bool scrypt_1024_1_1_256_12way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, - int thrid ) -{ - uint32_t _ALIGN(128) tstate[12 * 8]; - uint32_t _ALIGN(128) ostate[12 * 8]; - uint32_t _ALIGN(128) W[12 * 32]; - uint32_t _ALIGN(128) X[12 * 32]; - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; - - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[32 * j + 4 * i + k] = midstate[i]; - - HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); - HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); - - if ( work_restart[thrid].restart ) return 0; - - PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); - - if ( work_restart[thrid].restart ) return 0; - - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; - - scrypt_core_3way(X + 0 * 96, V, N); - scrypt_core_3way(X + 1 * 96, V, N); - scrypt_core_3way(X + 2 * 96, V, N); - scrypt_core_3way(X + 3 * 96, V, N); - - if ( work_restart[thrid].restart ) return 0; - - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; - - PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); - - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - 
output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; - - return 1; -} -#endif /* HAVE_SHA256_4WAY */ - -#endif /* HAVE_SCRYPT_3WAY */ - -#ifdef HAVE_SCRYPT_6WAY -static int scrypt_1024_1_1_256_24way( const uint32_t *input, - uint32_t *output, uint32_t *midstate, - unsigned char *scratchpad, int N, int thrid ) -{ - uint32_t _ALIGN(128) tstate[24 * 8]; - uint32_t _ALIGN(128) ostate[24 * 8]; - uint32_t _ALIGN(128) W[24 * 32]; - uint32_t _ALIGN(128) X[24 * 32]; - uint32_t *V; - int i, j, k; - - V = (uint32_t *)( ( (uintptr_t)(scratchpad) + 63 ) & ~ (uintptr_t)(63) ); - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 20; i++ ) - for ( k = 0; k < 8; k++ ) - W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 8; i++ ) - for ( k = 0; k < 8; k++ ) - tstate[8 * 8 * j + 8 * i + k] = midstate[i]; - - HMAC_SHA256_80_init_8way( W + 0, tstate + 0, ostate + 0 ); - HMAC_SHA256_80_init_8way( W + 256, tstate + 64, ostate + 64 ); - HMAC_SHA256_80_init_8way( W + 512, tstate + 128, ostate + 128 ); - - if ( work_restart[thrid].restart ) return 0; - - PBKDF2_SHA256_80_128_8way( tstate + 0, ostate + 0, W + 0, W + 0 ); - PBKDF2_SHA256_80_128_8way( tstate + 64, ostate + 64, W + 256, W + 256 ); - PBKDF2_SHA256_80_128_8way( tstate + 128, ostate + 128, W + 512, W + 512 ); - - if ( work_restart[thrid].restart ) return 0; - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 32; i++ ) - for ( k = 0; k < 8; k++ ) - X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; - - scrypt_core_6way( X + 0 * 32, V, N ); - scrypt_core_6way( X + 6 * 32, V, N ); - - if ( work_restart[thrid].restart ) return 0; - - scrypt_core_6way( X + 12 * 32, V, N ); - scrypt_core_6way( X + 18 * 32, V, N ); - - if ( work_restart[thrid].restart ) return 0; - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 32; i++ ) - for ( k = 0; k < 8; k++ ) - W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; - - PBKDF2_SHA256_128_32_8way( tstate + 0, ostate + 0, W + 0, W + 0 ); - PBKDF2_SHA256_128_32_8way( tstate + 64, ostate + 64, W + 256, W + 256 ); - PBKDF2_SHA256_128_32_8way( tstate + 128, ostate + 128, W + 512, W + 512 ); - - for ( j = 0; j < 3; j++ ) - for ( i = 0; i < 8; i++ ) - for ( k = 0; k < 8; k++ ) - output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; - - return 1; -} -#endif /* HAVE_SCRYPT_6WAY */ +#endif // SHA extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) @@ -660,67 +1250,58 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; uint32_t midstate[8]; uint32_t n = pdata[19] - 1; - int thr_id = mythr->id; // thr_id arg is deprecated - int throughput = scrypt_best_throughput(); + int thr_id = mythr->id; + int throughput = scrypt_throughput; int i; volatile uint8_t *restart = &(work_restart[thr_id].restart); -#ifdef HAVE_SHA256_4WAY - if (sha256_use_4way()) - throughput *= 4; -#endif - -// applog(LOG_INFO,"Scrypt thoughput %d",throughput); + for ( i = 0; i < throughput; i++ ) + memcpy( data + i * 20, pdata, 80 ); + + sha256_transform_le( midstate, data, sha256_initial_state ); - for (i = 0; i < throughput; i++) - memcpy(data + i * 20, pdata, 80); - - sha256_init(midstate); - sha256_transform(midstate, data, 0); - do { bool rc = true; - for (i = 0; i < throughput; i++) - data[i * 20 + 19] = ++n; - -#if defined(HAVE_SHA256_4WAY) - if (throughput == 4) - rc = scrypt_1024_1_1_256_4way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - 
else + for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + if ( throughput == 16 ) + rc = scrypt_N_1_1_256_16way( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); + else #endif -#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY) - if (throughput == 12) - rc = scrypt_1024_1_1_256_12way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else +#if defined(__AVX2__) + if ( throughput == 8 ) + rc = scrypt_N_1_1_256_8way( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); + else #endif -#if defined(HAVE_SCRYPT_6WAY) - if (throughput == 24) - rc = scrypt_1024_1_1_256_24way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else + if ( throughput == 4 ) +#if defined(__SHA__) + rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); +#else + rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); #endif -#if defined(HAVE_SCRYPT_3WAY) - if (throughput == 3) - rc = scrypt_1024_1_1_256_3way(data, hash, midstate, - scratchbuf, scratchbuf_size, thr_id ); - else -#endif - rc = scrypt_1024_1_1_256(data, hash, midstate, scratchbuf, - scratchbuf_size, thr_id ); - + else + rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); + if ( rc ) for ( i = 0; i < throughput; i++ ) { - if ( unlikely( valid_hash( hash + i * 8, ptarget ) ) ) + if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) ) { - pdata[19] = data[i * 20 + 19]; +// applog( LOG_INFO, "Thread %d, Lane %d", thr_id,i ); + pdata[19] = data[i * 20 + 19]; submit_solution( work, hash + i * 8, mythr ); - } + } } - } while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) ); + + + } while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) ); *hashes_done = n - pdata[19]; pdata[19] = n; @@ -729,28 +1310,51 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, bool scrypt_miner_thread_init( int thr_id ) { - scratchbuf = scrypt_buffer_alloc( scratchbuf_size ); - if ( scratchbuf ) - return true; - applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); - return false; + scratchbuf = _mm_malloc( scratchbuf_size, 128 ); + if ( scratchbuf ) + return true; + applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id ); + return false; } bool register_scrypt_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX2_OPT; - gate->miner_thread_init =(void*)&scrypt_miner_thread_init; - gate->scanhash = (void*)&scanhash_scrypt; - opt_target_factor = 65536.0; +#if defined(__SHA__) + gate->optimizations = SSE2_OPT | SHA_OPT; +#else + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; +#endif + gate->miner_thread_init =(void*)&scrypt_miner_thread_init; + gate->scanhash = (void*)&scanhash_scrypt; + opt_target_factor = 65536.0; + opt_param_n = opt_param_n ? 
opt_param_n : 1024; + applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n ); - if ( !opt_param_n ) - { - opt_param_n = 1024; - scratchbuf_size = 1024; - } - else - scratchbuf_size = opt_param_n; - applog(LOG_INFO,"Scrypt paramaters: N= %d, R= 1.", opt_param_n ); - return true; +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + scrypt_throughput = 16; + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf +#elif defined(__SHA__) + scrypt_throughput = 4; + scratchbuf_size = opt_param_n * 2 * 128; // 2 buf +#elif defined(__AVX2__) + scrypt_throughput = 8; + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf +#else + scrypt_throughput = 4; + scratchbuf_size = opt_param_n * 2 * 128; // 2 buf +#endif + + char t_units[4] = {0}; + char d_units[4] = {0}; + double t_size = (double)scratchbuf_size; + double d_size = (double)scratchbuf_size * opt_n_threads; + + format_number_si( &t_size, t_units ); + format_number_si( &d_size, d_units ); + + applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n", + scrypt_throughput, t_size, t_units, d_size, d_units ); + + return true; }; diff --git a/algo/sha/hmac-sha256-hash.c b/algo/sha/hmac-sha256-hash.c index e09a4c2..2cdf9c8 100644 --- a/algo/sha/hmac-sha256-hash.c +++ b/algo/sha/hmac-sha256-hash.c @@ -39,10 +39,10 @@ void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] ) { - sph_sha256_context ctx; - sph_sha256_init( &ctx ); - sph_sha256( &ctx, in, len ); - sph_sha256_close( &ctx, digest ); + sha256_context ctx; + sha256_ctx_init( &ctx ); + sha256_update( &ctx, in, len ); + sha256_final( &ctx, digest ); } /** @@ -64,7 +64,7 @@ HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len, void HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen ) { - unsigned char pad[64]; + unsigned char pad[64] __attribute__ ((aligned (64))); unsigned char khash[32]; const unsigned char * K = _K; size_t i; @@ -72,29 +72,28 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen ) /* If Klen > 64, the key is really SHA256(K). */ if ( Klen > 64 ) { - sph_sha256_init( &ctx->ictx ); - sph_sha256( &ctx->ictx, K, Klen ); - sph_sha256_close( &ctx->ictx, khash ); - + sha256_ctx_init( &ctx->ictx ); + sha256_update( &ctx->ictx, K, Klen ); + sha256_final( &ctx->ictx, khash ); K = khash; Klen = 32; } /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ - sph_sha256_init( &ctx->ictx ); + sha256_ctx_init( &ctx->ictx ); for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36; memset( pad + Klen, 0x36, 64 - Klen ); - sph_sha256( &ctx->ictx, pad, 64 ); + sha256_update( &ctx->ictx, pad, 64 ); /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ - sph_sha256_init( &ctx->octx ); + sha256_ctx_init( &ctx->octx ); for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c; memset( pad + Klen, 0x5c, 64 - Klen ); - sph_sha256( &ctx->octx, pad, 64 ); + sha256_update( &ctx->octx, pad, 64 ); } /* Add bytes to the HMAC-SHA256 operation. */ @@ -102,18 +101,17 @@ void HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len ) { /* Feed data to the inner SHA256 operation. */ - sph_sha256( &ctx->ictx, in, len ); + sha256_update( &ctx->ictx, in, len ); } /* Finish an HMAC-SHA256 operation. 
*/ void -HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx ) +HMAC_SHA256_Final( void *digest, HMAC_SHA256_CTX *ctx ) { - unsigned char ihash[32]; - - sph_sha256_close( &ctx->ictx, ihash ); - sph_sha256( &ctx->octx, ihash, 32 ); - sph_sha256_close( &ctx->octx, digest ); + uint32_t ihash[8] __attribute__ ((aligned (32))); + sha256_final( &ctx->ictx, ihash ); + sha256_update( &ctx->octx, ihash, 32 ); + sha256_final( &ctx->octx, digest ); } /** @@ -126,8 +124,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt, size_t saltlen, uint64_t c, uint8_t *buf, size_t dkLen ) { HMAC_SHA256_CTX PShctx, hctx; - uint8_t _ALIGN(128) T[32]; - uint8_t _ALIGN(128) U[32]; + uint64_t _ALIGN(128) T[4]; + uint64_t _ALIGN(128) U[4]; +// uint8_t _ALIGN(128) T[32]; +// uint8_t _ALIGN(128) U[32]; uint32_t ivec; size_t i, clen; uint64_t j; @@ -163,10 +163,10 @@ PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt, // _mm_xor_si128( ((__m128i*)T)[0], ((__m128i*)U)[0] ); // _mm_xor_si128( ((__m128i*)T)[1], ((__m128i*)U)[1] ); -// for ( k = 0; k < 4; k++ ) T[k] ^= U[k]; + for ( k = 0; k < 4; k++ ) T[k] ^= U[k]; - for ( k = 0; k < 32; k++ ) - T[k] ^= U[k]; +// for ( k = 0; k < 32; k++ ) +// T[k] ^= U[k]; } /* Copy as many bytes as necessary into buf. */ diff --git a/algo/sha/hmac-sha256-hash.h b/algo/sha/hmac-sha256-hash.h index a735c53..7a281df 100644 --- a/algo/sha/hmac-sha256-hash.h +++ b/algo/sha/hmac-sha256-hash.h @@ -31,18 +31,18 @@ #include #include -#include "sph_sha2.h" +#include "sha256-hash.h" typedef struct HMAC_SHA256Context { - sph_sha256_context ictx; - sph_sha256_context octx; + sha256_context ictx; + sha256_context octx; } HMAC_SHA256_CTX; void SHA256_Buf( const void *, size_t len, uint8_t digest[32] ); void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t ); void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t ); -void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * ); +void HMAC_SHA256_Final( void*, HMAC_SHA256_CTX * ); void HMAC_SHA256_Buf( const void *, size_t Klen, const void *, size_t len, uint8_t digest[32] ); diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index f9505d1..7b6618c 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -59,7 +59,9 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ); void sha256_4way_close( sha256_4way_context *sc, void *dst ); void sha256_4way_full( void *dst, const void *data, size_t len ); -void sha256_4way_transform( __m128i *state_out, const __m128i *data, +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, const __m128i *state_in ); #endif // SSE2 @@ -79,8 +81,10 @@ void sha256_8way_init( sha256_8way_context *sc ); void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); void sha256_8way_close( sha256_8way_context *sc, void *dst ); void sha256_8way_full( void *dst, const void *data, size_t len ); -void sha256_8way_transform( __m256i *state_out, const __m256i *data, - const __m256i *state_in ); +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); #endif // AVX2 @@ -99,7 +103,9 @@ void sha256_16way_init( sha256_16way_context *sc ); void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t 
len ); void sha256_16way_close( sha256_16way_context *sc, void *dst ); void sha256_16way_full( void *dst, const void *data, size_t len ); -void sha256_16way_transform( __m512i *state_out, const __m512i *data, +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); +void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, const __m512i *state_in ); void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, const __m512i *state_in ); diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 7eb4067..2a229bf 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -180,6 +180,7 @@ static const uint32_t sha256d_hash1[16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000100 }; +// this performs the entire hash all over again, why? static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) { uint32_t S[16]; @@ -195,6 +196,7 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) hash[i] = swab32(hash[i]); } +/* #if defined (__SHA__) #include "algo/sha/sph_sha2.h" @@ -241,6 +243,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len) } #endif +*/ static inline void sha256d_preextend(uint32_t *W) { @@ -653,6 +656,7 @@ int scanhash_sha256d( struct work *work, return 0; } +/* int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -682,13 +686,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, pdata[19] = n; return 0; } - +*/ bool register_sha256d_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | AVX2_OPT; gate->scanhash = (void*)&scanhash_sha256d; - gate->hash = (void*)&sha256d; +// gate->hash = (void*)&sha256d; return true; }; diff --git a/algo/sha/sha256-hash-2way-ni.c b/algo/sha/sha256-hash-2way-ni.c index f169b63..7fc64ca 100644 --- a/algo/sha/sha256-hash-2way-ni.c +++ b/algo/sha/sha256-hash-2way-ni.c @@ -7,9 +7,9 @@ #if defined(__SHA__) -#include "sha256-hash-opt.h" +#include "sha256-hash.h" -void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, +void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ) { @@ -342,4 +342,348 @@ void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); } +void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK; + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; + __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; + + // Load initial values + TMP_X = _mm_load_si128((__m128i*) &in_X[0]); + STATE1_X = _mm_load_si128((__m128i*) &in_X[4]); + TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]); + STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB + TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH + STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF + STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF + STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH + STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH + + // Save current hash + 
ABEF_SAVE_X = STATE0_X; + ABEF_SAVE_Y = STATE0_Y; + CDGH_SAVE_X = STATE1_X; + CDGH_SAVE_Y = STATE1_Y; + + // Rounds 0-3 + TMSG0_X = _mm_load_si128((const __m128i*) (msg_X)); + TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y)); + TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL); + TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK ); + TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK ); + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 4-7 + TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16)); + TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16)); + TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL); + TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK ); + TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 8-11 + TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32)); + TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32)); + TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL); + TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK ); + TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 12-15 + TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48)); + TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48)); + TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL); + TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK ); + TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK ); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = 
_mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 16-19 + TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 20-23 + TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 24-27 + TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 28-31 + TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X 
= _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 32-35 + TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 36-39 + TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X); + TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y); + + // Rounds 40-43 + TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X); + TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y); + + // Rounds 44-47 + TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4); + TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X); + TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y); + TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X); + TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = 
_mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X); + TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y); + + // Rounds 48-51 + TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL); + MSG_X = _mm_add_epi32(TMSG0_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4); + TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X); + TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y); + TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X); + TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X); + TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y); + + // Rounds 52-55 + TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL); + MSG_X = _mm_add_epi32(TMSG1_X, TMP_X ); + MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X ); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4); + TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X); + TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y); + TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X); + TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 56-59 + TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL); + MSG_X = _mm_add_epi32(TMSG2_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4); + TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4); + TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X); + TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y); + TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X); + TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Rounds 60-63 + TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL); + MSG_X = _mm_add_epi32(TMSG3_X, TMP_X); + MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X); + STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X); + STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y); + MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E); + MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E); + STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X); + STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y); + + // Add values back to state + STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X); + STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X); + STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y); + STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y); + + TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA + TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA + STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // 
DCHG + STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG + STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA + STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA + STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF + STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &out_X[0], STATE0_X); + _mm_store_si128((__m128i*) &out_X[4], STATE1_X); + _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y); + _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y); +} + + #endif + diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index c5f6048..beac702 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -74,17 +74,6 @@ static const uint32_t K256[64] = #define CHs(X, Y, Z) \ _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) -/* -#define MAJs(X, Y, Z) \ - _mm_or_si128( _mm_and_si128( X, Y ), \ - _mm_and_si128( _mm_or_si128( X, Y ), Z ) ) -*/ -/* -#define MAJs(X, Y, Z) \ - _mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \ - _mm_xor_si128( Y, Z ) ) ) -*/ - #define MAJs(X, Y, Z) \ _mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \ Y_xor_Z ) ) @@ -105,38 +94,6 @@ static const uint32_t K256[64] = _mm_xor_si128( _mm_xor_si128( \ mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) ) -/* -#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ -do { \ - __m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \ - __m128i T1 = mm128_ror_32( E, 14 ); \ - __m128i T2 = mm128_ror_32( A, 9 ); \ - __m128i T3 = _mm_xor_si128( F, G ); \ - __m128i T4 = _mm_or_si128( A, B ); \ - __m128i T5 = _mm_and_si128( A, B ); \ - K = _mm_add_epi32( K, W[i] ); \ - T1 = _mm_xor_si128( T1, E ); \ - T2 = _mm_xor_si128( T2, A ); \ - T3 = _mm_and_si128( T3, E ); \ - T4 = _mm_and_si128( T4, C ); \ - K = _mm_add_epi32( H, K ); \ - T1 = mm128_ror_32( T1, 5 ); \ - T2 = mm128_ror_32( T2, 11 ); \ - T3 = _mm_xor_si128( T3, G ); \ - T4 = _mm_or_si128( T4, T5 ); \ - T1 = _mm_xor_si128( T1, E ); \ - T2 = _mm_xor_si128( T2, A ); \ - T1 = mm128_ror_32( T1, 6 ); \ - T2 = mm128_ror_32( T2, 2 ); \ - T1 = _mm_add_epi32( T1, T3 ); \ - T2 = _mm_add_epi32( T2, T4 ); \ - T1 = _mm_add_epi32( T1, K ); \ - H = _mm_add_epi32( T1, T2 ); \ - D = _mm_add_epi32( D, T1 ); \ -} while (0) -*/ - - #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m128i T1, T2; \ @@ -149,8 +106,8 @@ do { \ H = _mm_add_epi32( T1, T2 ); \ } while (0) - -void sha256_4way_transform( __m128i *state_out, const __m128i *data, +// LE data, no need to byte swap +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, const __m128i *state_in ) { __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; @@ -232,6 +189,91 @@ void sha256_4way_transform( __m128i *state_out, const __m128i *data, state_out[7] = _mm_add_epi32( state_in[7], H ); } +// BE data, need to byte swap +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) +{ + __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + __m128i W[16]; + + mm128_block_bswap_32( W, data ); + mm128_block_bswap_32( W+8, data+8 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + Y_xor_Z = _mm_xor_si128( B, C ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_4WAY_STEP( E, F, 
G, H, A, B, C, D, 4, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm_add_epi32( state_in[0], A ); + state_out[1] = _mm_add_epi32( state_in[1], B ); + state_out[2] = _mm_add_epi32( state_in[2], C ); + state_out[3] = _mm_add_epi32( state_in[3], D ); + state_out[4] = _mm_add_epi32( state_in[4], E ); + state_out[5] = _mm_add_epi32( state_in[5], F ); + state_out[6] = _mm_add_epi32( state_in[6], G ); + state_out[7] = _mm_add_epi32( state_in[7], H ); +} + + static void sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) { @@ -436,61 +478,81 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) // SHA-256 8 way -#if defined(__AVX512VL__) - -#define CHx(X, Y, Z) \ - _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) - -#define MAJx(X, Y, Z) \ - _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 ) - #define BSG2_0x(x) \ - mm256_xor3( mm256_ror_32(x, 2), mm256_ror_32(x, 13), mm256_ror_32(x, 22) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \ + mm256_ror_32( x, 13 ) ), \ + mm256_ror_32( x, 22 ) ) #define BSG2_1x(x) \ - mm256_xor3( mm256_ror_32(x, 6), mm256_ror_32(x, 11), mm256_ror_32(x, 25) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \ + mm256_ror_32( x, 11 ) ), \ + mm256_ror_32( x, 25 ) ) #define SSG2_0x(x) \ - mm256_xor3( mm256_ror_32(x, 7), mm256_ror_32(x, 18), _mm256_srli_epi32(x, 3) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \ + mm256_ror_32( x, 18 ) ), \ + _mm256_srli_epi32( x, 3 ) ) #define SSG2_1x(x) \ - mm256_xor3( mm256_ror_32(x, 17), 
mm256_ror_32(x, 19), _mm256_srli_epi32(x, 10) ) + _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \ + mm256_ror_32( x, 19 ) ), \ + _mm256_srli_epi32( x, 10 ) ) + +#define SHA2x_MEXP( a, b, c, d ) \ + mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); + +// With AVX512VL ternary logic optimizations are available. +// If not optimize by forwarding the result of X^Y in MAJ to the next round +// to avoid recalculating it as Y^Z. This optimization is not applicable +// when MAJ is optimized with ternary logic. + +#if defined(__AVX512VL__) + +#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) + +#define MAJx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 ) + +#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ +do { \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \ + W[ i ] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} while (0) #else // AVX2 #define CHx(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) -#define MAJx(X, Y, Z) \ - _mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \ - _mm256_xor_si256( Y, Z ) ) ) -/* +// Use saved X_xor_Y from previous round, now called Y_xor_Z, +// and save new X_xor_Y, for next round. #define MAJx(X, Y, Z) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) -*/ -#define BSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 2), mm256_ror_32(x, 13) ), mm256_ror_32( x, 22) ) - -#define BSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 6), mm256_ror_32(x, 11) ), mm256_ror_32( x, 25) ) - -#define SSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 7), mm256_ror_32(x, 18) ), _mm256_srli_epi32(x, 3) ) - -#define SSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) ) - -#endif // AVX512 else AVX2 - -#define SHA2x_MEXP( a, b, c, d ) \ - mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); +#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ +do { \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \ + W[ i ] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + Y_xor_Z = X_xor_Y; \ + D = _mm256_add_epi32( D, T1 ); \ + H = _mm256_add_epi32( T1, T2 ); \ +} while (0) +/* #define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m256i T1, T2; \ @@ -498,16 +560,23 @@ do { \ T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ K, W[i] ) ); \ T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ + Y_xor_Z = X_xor_Y; \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) +*/ -void sha256_8way_transform( __m256i *state_out, const __m256i *data, +#endif // AVX512VL else AVX2 + +// accepts LE byte ordered data, skip the byte swap +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, const __m256i *state_in ) { __m256i A, B, C, D, E, F, G, H; +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z; +#endif __m256i W[16]; - memcpy_256( W, data, 16 ); A = 
state_in[0]; @@ -519,6 +588,101 @@ void sha256_8way_transform( __m256i *state_out, const __m256i *data, G = state_in[6]; H = state_in[7]; +#if !defined(__AVX512VL__) + Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm256_add_epi32( state_in[0], A ); + state_out[1] = _mm256_add_epi32( state_in[1], B ); + state_out[2] = _mm256_add_epi32( state_in[2], C ); + state_out[3] = _mm256_add_epi32( state_in[3], D ); + state_out[4] = _mm256_add_epi32( state_in[4], E ); + state_out[5] = _mm256_add_epi32( state_in[5], F ); + state_out[6] = _mm256_add_epi32( state_in[6], G ); + state_out[7] = _mm256_add_epi32( state_in[7], H ); +} + + +// Accepts BE byte ordered data, need to byte swap +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) +{ + __m256i A, B, C, D, E, F, G, H; +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z; +#endif + __m256i W[16]; + + mm256_block_bswap_32( W , data ); + mm256_block_bswap_32( W+8, data+8 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + +#if !defined(__AVX512VL__) + Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + SHA2s_8WAY_STEP( A, B, C, D, E, 
F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -587,6 +751,9 @@ static void sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) { register __m256i A, B, C, D, E, F, G, H; +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z; +#endif __m256i W[16]; mm256_block_bswap_32( W , in ); @@ -615,6 +782,10 @@ sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) H = m256_const1_64( 0x5BE0CD195BE0CD19 ); } +#if !defined(__AVX512VL__) + Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); @@ -790,27 +961,44 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) // SHA-256 16 way -#define CHx16(X, Y, Z) \ - _mm512_ternarylogic_epi32( X, Y, Z, 0xca ) +#define CHx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xca ) -#define MAJx16(X, Y, Z) \ - _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 ) +#define MAJx16(X, Y, Z) _mm512_ternarylogic_epi32( X, Y, Z, 0xe8 ) -#define BSG2_0x16(x) \ - mm512_xor3( mm512_ror_32(x, 2), mm512_ror_32(x, 13), mm512_ror_32(x, 22) ) +#define BSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 2 ), \ + _mm512_ror_epi32( x, 13 ), \ + _mm512_ror_epi32( x, 22 ) ) -#define BSG2_1x16(x) \ - mm512_xor3( mm512_ror_32(x, 6), mm512_ror_32(x, 11), mm512_ror_32(x, 25) ) +#define BSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 6 ), \ + _mm512_ror_epi32( x, 11 ), \ + _mm512_ror_epi32( x, 25 ) ) -#define SSG2_0x16(x) \ - mm512_xor3( mm512_ror_32(x, 7), mm512_ror_32(x, 18), _mm512_srli_epi32(x, 3) ) +#define SSG2_0x16(x) mm512_xor3( _mm512_ror_epi32( x, 7 ), \ + _mm512_ror_epi32( x, 18 ), \ + _mm512_srli_epi32( x, 3 ) ) -#define SSG2_1x16(x) \ - mm512_xor3( mm512_ror_32(x, 17), mm512_ror_32(x, 19), _mm512_srli_epi32(x, 10) ) +#define SSG2_1x16(x) mm512_xor3( _mm512_ror_epi32( x, 17 ), \ + _mm512_ror_epi32( x, 19 ), \ + _mm512_srli_epi32( x, 10 ) ) #define SHA2x16_MEXP( a, b, c, d ) \ mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] ); +#define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ +do { \ + __m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[ (j)+(i) ] ), \ + W[ i ] ); \ + __m512i T1 = BSG2_1x16( E ); \ + __m512i T2 = BSG2_0x16( A ); \ + T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \ + T1 = _mm512_add_epi32( T1, H ); \ + T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \ + T1 = _mm512_add_epi32( T1, T0 ); \ + D = _mm512_add_epi32( D, T1 ); \ + H = _mm512_add_epi32( T1, T2 ); \ +} while (0) + +/* #define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m512i T1, T2; \ @@ -821,14 +1009,10 @@ do { \ D = _mm512_add_epi32( D, T1 ); \ H = _mm512_add_epi32( T1, T2 ); \ } while (0) +*/ -// Tranform one 16 lane by 64 byte message block and update state. -// Calling function is responsible for initializing the state, setting -// correct byte order, counting bits and padding of the final block. -// It's faster for multiple rounds of sha256 (sha256d/t/q) by eliminating -// redundant byte swapping. 
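/* Reference sketch (illustrative only, not part of the patch): the plain scalar
 * SHA-256 compression that each lane of the 4/8/16-way transforms computes.
 * It mirrors the structure of the vector code: rounds are taken 16 at a time
 * and the message schedule lives in a 16-word ring expanded in place, which is
 * why the SHA2*_MEXP calls use index patterns like (14, 9, 1, 0).  Assumes
 * <stdint.h>/<string.h> and the K256[64] round-constant table already used by
 * the STEP macros in this file; the function and macro names are hypothetical.
 */
static inline uint32_t ror32( uint32_t x, int n )
{  return ( x >> n ) | ( x << ( 32 - n ) );  }

#define CHs(x, y, z)   ( ( ( (y) ^ (z) ) & (x) ) ^ (z) )
#define MAJs(x, y, z)  ( ( (x) & (y) ) | ( ( (x) | (y) ) & (z) ) )
#define BSG0s(x)   ( ror32( x,  2 ) ^ ror32( x, 13 ) ^ ror32( x, 22 ) )
#define BSG1s(x)   ( ror32( x,  6 ) ^ ror32( x, 11 ) ^ ror32( x, 25 ) )
#define SSG0s(x)   ( ror32( x,  7 ) ^ ror32( x, 18 ) ^ ( (x) >>  3 ) )
#define SSG1s(x)   ( ror32( x, 17 ) ^ ror32( x, 19 ) ^ ( (x) >> 10 ) )

static void sha256_compress_scalar( uint32_t state[8], const uint32_t block[16] )
{
   uint32_t W[16], S[8];            // S[0..7] = A..H
   memcpy( W, block, sizeof W );    // block words already decoded big-endian
   memcpy( S, state, sizeof S );

   for ( int j = 0; j < 64; j += 16 )
   {
      if ( j )   // expand the next 16 schedule words in place
         for ( int i = 0; i < 16; i++ )
            W[i] += SSG1s( W[ (i+14) & 15 ] ) + W[ (i+9) & 15 ]
                  + SSG0s( W[ (i+ 1) & 15 ] );

      for ( int i = 0; i < 16; i++ )
      {
         const uint32_t T1 = S[7] + BSG1s( S[4] ) + CHs( S[4], S[5], S[6] )
                           + K256[ j+i ] + W[i];
         const uint32_t T2 = BSG0s( S[0] ) + MAJs( S[0], S[1], S[2] );
         S[7] = S[6];  S[6] = S[5];  S[5] = S[4];  S[4] = S[3] + T1;
         S[3] = S[2];  S[2] = S[1];  S[1] = S[0];  S[0] = T1 + T2;
      }
   }
   for ( int i = 0; i < 8; i++ ) state[i] += S[i];   // feed-forward
}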
-// -void sha256_16way_transform( __m512i *state_out, const __m512i *data, +// accepts LE input data +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, const __m512i *state_in ) { __m512i A, B, C, D, E, F, G, H; @@ -909,6 +1093,89 @@ void sha256_16way_transform( __m512i *state_out, const __m512i *data, state_out[7] = _mm512_add_epi32( state_in[7], H ); } +// Accepts BE input data, need to bswap +void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; + + mm512_block_bswap_32( W , data ); + mm512_block_bswap_32( W+8, data+8 ); + + A = state_in[0]; + B = state_in[1]; + C = state_in[2]; + D = state_in[3]; + E = state_in[4]; + F = state_in[5]; + G = state_in[6]; + H = state_in[7]; + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + state_out[0] = _mm512_add_epi32( state_in[0], A ); + state_out[1] = _mm512_add_epi32( state_in[1], B ); + state_out[2] = _mm512_add_epi32( state_in[2], C ); + state_out[3] = _mm512_add_epi32( state_in[3], D ); + state_out[4] = _mm512_add_epi32( state_in[4], E ); + state_out[5] = _mm512_add_epi32( state_in[5], F ); + state_out[6] = _mm512_add_epi32( state_in[6], G ); + 
state_out[7] = _mm512_add_epi32( state_in[7], H ); +} + // Aggresive prehashing void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, const __m512i *state_in ) diff --git a/algo/sha/sha256-hash-opt.c b/algo/sha/sha256-hash-opt.c index 78bda65..e08dd60 100644 --- a/algo/sha/sha256-hash-opt.c +++ b/algo/sha/sha256-hash-opt.c @@ -7,9 +7,9 @@ #if defined(__SHA__) -#include "sha256-hash-opt.h" +#include "sha256-hash.h" -void sha256_opt_transform( uint32_t *state_out, const void *input, +void sha256_opt_transform_le( uint32_t *state_out, const void *input, const uint32_t *state_in ) { __m128i STATE0, STATE1; @@ -197,4 +197,192 @@ void sha256_opt_transform( uint32_t *state_out, const void *input, _mm_store_si128((__m128i*) &state_out[4], STATE1); } + +void sha256_opt_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ) +{ + __m128i STATE0, STATE1; + __m128i MSG, TMP, MASK; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i*) &state_in[0]); + STATE1 = _mm_load_si128((__m128i*) &state_in[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + TMSG0 = _mm_load_si128((const __m128i*) (input+0)); + TMSG0 = _mm_shuffle_epi8( TMSG0, MASK ); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i*) (input+16)); + TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i*) (input+32)); + TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i*) (input+48)); + TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = 
_mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, 
_mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &state_out[0], STATE0); + _mm_store_si128((__m128i*) &state_out[4], STATE1); +} + #endif diff --git a/algo/sha/sha256-hash-opt.h b/algo/sha/sha256-hash-opt.h deleted file mode 100644 index 9ceacf4..0000000 --- a/algo/sha/sha256-hash-opt.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef SHA2_HASH_OPT_H__ -#define SHA2_HASH_OPT_H__ 1 - -#include -#include "simd-utils.h" - -#if defined(__SHA__) - -void sha256_opt_transform( uint32_t *state_out, const void *input, - const uint32_t *state_in ); - -// 2 way with interleaved instructions -void sha256_ni2way_transform( uint32_t *out_X, uint32_t*out_Y, - const void *msg_X, const void *msg_Y, - const uint32_t *in_X, const uint32_t *in_Y ); - -#endif -#endif diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c new file mode 100644 index 0000000..ddbaacc --- /dev/null +++ b/algo/sha/sha256-hash.c @@ -0,0 +1,142 @@ +#include "sha256-hash.h" + +static const uint32_t SHA256_IV[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +/* +static const uint8_t SHA256_PAD[64] = +{ + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +*/ + +void sha256_ctx_init( sha256_context *ctx ) +{ + memcpy( ctx->state, SHA256_IV, sizeof SHA256_IV ); + ctx->count = 0; +} + +void sha256_update( sha256_context *ctx, const void *data, size_t len ) +{ + int ptr = ctx->count & 0x3f; + const uint8_t *src = data; + + ctx->count += (uint64_t)len; + + if ( len < 64 - ptr ) + { + memcpy( ctx->buf + ptr, src, len ); + return; + } + + memcpy( ctx->buf + ptr, src, 64 - ptr ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + src += 64 - ptr; + len -= 64 - ptr; + + while ( len >= 64 ) + { + sha256_transform_be( ctx->state, (uint32_t*)src, ctx->state ); + src += 64; + len -= 64; + } + + memcpy( ctx->buf, src, len ); +} + +#if 0 +void sha256_final( sha256_context *ctx, uint32_t *hash ) +{ + size_t r; + + + /* Figure out how many bytes we have buffered. 
*/ + r = ctx->count & 0x3f; +// r = ( ctx->count >> 3 ) & 0x3f; + +//printf("final: count= %d, r= %d\n", ctx->count, r ); + + /* Pad to 56 mod 64, transforming if we finish a block en route. */ + if ( r < 56 ) + { + /* Pad to 56 mod 64. */ + memcpy( &ctx->buf[r], SHA256_PAD, 56 - r ); + } + else + { + /* Finish the current block and mix. */ + memcpy( &ctx->buf[r], SHA256_PAD, 64 - r ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + +// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + + /* The start of the final block is all zeroes. */ + memset( &ctx->buf[0], 0, 56 ); + } + + /* Add the terminating bit-count. */ + ctx->buf[56] = bswap_64( ctx->count << 3 ); +// ctx->buf[56] = bswap_64( ctx->count ); +// be64enc( &ctx->buf[56], ctx->count ); + + /* Mix in the final block. */ + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + +// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + + for ( int i = 0; i < 8; i++ ) hash[i] = bswap_32( ctx->state[i] ); + +// for ( int i = 0; i < 8; i++ ) be32enc( hash + 4*i, ctx->state + i ); + +/* +// be32enc_vect(digest, ctx->state, 4); +// be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len) + // Encode vector, two words at a time. + do { + be32enc(&dst[0], src[0]); + be32enc(&dst[4], src[1]); + src += 2; + dst += 8; + } while (--len); +*/ + +} +#endif + +void sha256_final( sha256_context *ctx, void *hash ) +{ + int ptr = ctx->count & 0x3f; + + ctx->buf[ ptr++ ] = 0x80; + + if ( ptr > 56 ) + { + memset( ctx->buf + ptr, 0, 64 - ptr ); + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + memset( ctx->buf, 0, 56 ); + } + else + memset( ctx->buf + ptr, 0, 56 - ptr ); + + *(uint64_t*)(&ctx->buf[56]) = bswap_64( ctx->count << 3 ); + + sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state ); + + for ( int i = 0; i < 8; i++ ) + ( (uint32_t*)hash )[i] = bswap_32( ctx->state[i] ); +} + +void sha256_full( void *hash, const void *data, size_t len ) +{ + sha256_context ctx; + sha256_ctx_init( &ctx ); + sha256_update( &ctx, data, len ); + sha256_final( &ctx, hash ); +} + diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h new file mode 100644 index 0000000..c6d61d8 --- /dev/null +++ b/algo/sha/sha256-hash.h @@ -0,0 +1,56 @@ +#ifndef SHA256_HASH_H__ +#define SHA256_HASH_H__ 1 + +#include +#include "simd-utils.h" +#include "cpuminer-config.h" +#include "sph_sha2.h" + + +// generic interface + +typedef struct { + unsigned char buf[64]; /* first field, for alignment */ + uint32_t state[8]; + uint64_t count; +} sha256_context __attribute__((aligned(64))); + +void sha256_full( void *hash, const void *data, size_t len ); +void sha256_update( sha256_context *ctx, const void *data, size_t len ); +void sha256_final( sha256_context *ctx, void *hash ); +void sha256_ctx_init( sha256_context *ctx ); +void sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); +void sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); + +#if defined(__SHA__) + +void sha256_opt_transform_le( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +void sha256_opt_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +// 2 way with interleaved instructions +void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +void sha256_ni2way_transform_be( uint32_t *out_X, 
uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +// Select target +// with SHA... +#define sha256_transform_le sha256_opt_transform_le +#define sha256_transform_be sha256_opt_transform_be + +#else + +// without SHA... +#define sha256_transform_le sph_sha256_transform_le +#define sha256_transform_be sph_sha256_transform_be + +#endif +#endif diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c index 9bbc5c8..fd3ae2f 100644 --- a/algo/sha/sha256d-4way.c +++ b/algo/sha/sha256d-4way.c @@ -14,6 +14,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, __m512i hash32[8] __attribute__ ((aligned (32))); __m512i initstate[8] __attribute__ ((aligned (32))); __m512i midstate[8] __attribute__ ((aligned (32))); + __m512i midstate2[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); __m512i vdata[20] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); @@ -23,7 +24,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 16; uint32_t n = first_nonce; - __m512i *noncev = vdata + 19; + __m512i *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; const __m512i last_byte = m512_const1_32( 0x80000000 ); @@ -45,27 +46,30 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); - // hash first 64 bytes of data - sha256_16way_transform( midstate, vdata, initstate ); + // hash first 64 byte block of data + sha256_16way_transform_le( midstate, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); do { // 1. final 16 bytes of data, with padding memcpy_512( block, vdata + 16, 4 ); block[ 4] = last_byte; - memset_zero_512( block + 5, 10 ); + memset_zero_512( block + 5, 10 ); block[15] = m512_const1_32( 80*8 ); // bit count - sha256_16way_transform( hash32, block, midstate ); + sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); // 2. 32 byte hash from 1. memcpy_512( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_512( block + 9, 6 ); block[15] = m512_const1_32( 32*8 ); // bit count - sha256_16way_transform( hash32, block, initstate ); + sha256_16way_transform_le( hash32, block, initstate ); // byte swap final hash for testing - mm512_block_bswap_32( hash32, hash32 ); + mm512_block_bswap_32( hash32, hash32 ); for ( int lane = 0; lane < 16; lane++ ) if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) @@ -85,7 +89,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, return 0; } - #endif #if defined(SHA256D_8WAY) @@ -128,7 +131,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_8way_transform( midstate, vdata, initstate ); + sha256_8way_transform_le( midstate, vdata, initstate ); do { @@ -137,14 +140,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_256( block + 5, 10 ); block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_transform( hash32, block, midstate ); + sha256_8way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. 
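// Explanatory note on the 3-round prehash above (not part of the patch): in
// the second 64-byte block of the 80-byte header only W[3] (vdata[19], the
// nonce lanes) changes per loop iteration; W[0..2] and the padding / length
// words stay fixed.  Because SHA-256 round i consumes only W[i], rounds 0-2
// depend solely on the midstate and those fixed words, so
// sha256_16way_prehash_3rounds computes them once into midstate2 outside the
// nonce loop, and sha256_16way_final_rounds presumably resumes from that
// partial state inside the loop, with midstate still supplied for the final
// feed-forward addition.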
memcpy_256( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_256( block + 9, 6 ); block[15] = m256_const1_32( 32*8 ); // bit count - sha256_8way_transform( hash32, block, initstate ); + sha256_8way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm256_block_bswap_32( hash32, hash32 ); @@ -209,7 +212,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_4way_transform( midstate, vdata, initstate ); + sha256_4way_transform_le( midstate, vdata, initstate ); do { @@ -218,14 +221,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_128( block + 5, 10 ); block[15] = m128_const1_32( 80*8 ); // bit count - sha256_4way_transform( hash32, block, midstate ); + sha256_4way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_128( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_128( block + 9, 6 ); block[15] = m128_const1_32( 32*8 ); // bit count - sha256_4way_transform( hash32, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm128_block_bswap_32( hash32, hash32 ); diff --git a/algo/sha/sha256d.c b/algo/sha/sha256d.c new file mode 100644 index 0000000..ed4bd60 --- /dev/null +++ b/algo/sha/sha256d.c @@ -0,0 +1,8 @@ +#include "sha256d.h" + +void sha256d( void *hash, const void *data, int len ) +{ + sha256_full( hash, data, len ); + sha256_full( hash, hash, 32 ); +} + diff --git a/algo/sha/sha256d.h b/algo/sha/sha256d.h new file mode 100644 index 0000000..71f78ee --- /dev/null +++ b/algo/sha/sha256d.h @@ -0,0 +1,7 @@ +#include "algo-gate-api.h" +#include +#include +#include "sha256-hash.h" + +void sha256d( void *hash, const void *data, int len ); + diff --git a/algo/sha/sha256q.c b/algo/sha/sha256q.c index cf9890e..90a2b7b 100644 --- a/algo/sha/sha256q.c +++ b/algo/sha/sha256q.c @@ -3,14 +3,14 @@ #include #include #include -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" -static __thread sph_sha256_context sha256q_ctx __attribute__ ((aligned (64))); +static __thread sha256_context sha256q_ctx __attribute__ ((aligned (64))); void sha256q_midstate( const void* input ) { - sph_sha256_init( &sha256q_ctx ); - sph_sha256( &sha256q_ctx, input, 64 ); + sha256_ctx_init( &sha256q_ctx ); + sha256_update( &sha256q_ctx, input, 64 ); } int sha256q_hash( void* output, const void* input ) @@ -19,24 +19,16 @@ int sha256q_hash( void* output, const void* input ) const int midlen = 64; // bytes const int tail = 80 - midlen; // 16 - sph_sha256_context ctx __attribute__ ((aligned (64))); + sha256_context ctx __attribute__ ((aligned (64))); memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx ); - sph_sha256( &ctx, input + midlen, tail ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, output ); + sha256_update( &ctx, input + midlen, tail ); + sha256_final( &ctx, hash ); + sha256_full( hash, hash, 32 ); + sha256_full( hash, hash, 32 ); + sha256_full( output, hash, 32 ); + return 1; } diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index 0f4fb58..12cbcde 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -47,7 +47,7 @@ int 
scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 byte block of data - sha256_16way_transform( midstate, vdata, initstate ); + sha256_16way_transform_le( midstate, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); @@ -60,18 +60,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, memset_zero_512( block + 5, 10 ); block[15] = m512_const1_32( 80*8 ); // bit count sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); -// sha256_16way_transform( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_512( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_512( block + 9, 6 ); block[15] = m512_const1_32( 32*8 ); // bit count - sha256_16way_transform( hash32, block, initstate ); + sha256_16way_transform_le( hash32, block, initstate ); // 3. 32 byte hash from 2. memcpy_512( block, hash32, 8 ); - sha256_16way_transform( hash32, block, initstate ); + sha256_16way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm512_block_bswap_32( hash32, hash32 ); @@ -137,7 +136,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_8way_transform( midstate, vdata, initstate ); + sha256_8way_transform_le( midstate, vdata, initstate ); do { @@ -146,18 +145,18 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_256( block + 5, 10 ); block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_transform( hash32, block, midstate ); + sha256_8way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_256( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_256( block + 9, 6 ); block[15] = m256_const1_32( 32*8 ); // bit count - sha256_8way_transform( hash32, block, initstate ); + sha256_8way_transform_le( hash32, block, initstate ); // 3. 32 byte hash from 2. memcpy_256( block, hash32, 8 ); - sha256_8way_transform( hash32, block, initstate ); + sha256_8way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm256_block_bswap_32( hash32, hash32 ); @@ -222,7 +221,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_4way_transform( midstate, vdata, initstate ); + sha256_4way_transform_le( midstate, vdata, initstate ); do { @@ -231,18 +230,18 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, block[ 4] = last_byte; memset_zero_128( block + 5, 10 ); block[15] = m128_const1_32( 80*8 ); // bit count - sha256_4way_transform( hash32, block, midstate ); + sha256_4way_transform_le( hash32, block, midstate ); // 2. 32 byte hash from 1. memcpy_128( block, hash32, 8 ); block[ 8] = last_byte; memset_zero_128( block + 9, 6 ); block[15] = m128_const1_32( 32*8 ); // bit count - sha256_4way_transform( hash32, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); // 3. 32 byte hash from 2. 
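      // Explanatory note (not part of the patch): pass 3 only refreshes
      // block[0..7] with the 32-byte hash from pass 2.  block[8..15] still
      // hold the 0x80 terminator, the zero padding and the 32*8 bit count
      // written for pass 2, and since pass 3 also hashes a 32-byte message
      // that padding is identical and can be reused as-is.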
memcpy_128( block, hash32, 8 ); - sha256_4way_transform( hash32, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); // byte swap final hash for testing mm128_block_bswap_32( hash32, hash32 ); diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index 90d2754..c528d27 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -4,120 +4,12 @@ #include #include //#include "algo/sha/sph_sha2.h" -#include "sha256-hash-opt.h" +#include "sha256-hash.h" #if defined(__SHA__) // Only used on CPUs with SHA -/* -static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64))); - -void sha256t_midstate( const void* input ) -{ - sph_sha256_init( &sha256t_ctx ); - sph_sha256( &sha256t_ctx, input, 64 ); -} - -int sha256t_hash( void* output, const void* input ) -{ - uint32_t _ALIGN(64) hash[16]; - const int midlen = 64; // bytes - const int tail = 80 - midlen; // 16 - - sph_sha256_context ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx ); - - sph_sha256( &ctx, input + midlen, tail ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, hash ); - - sph_sha256_init( &ctx ); - sph_sha256( &ctx, hash, 32 ); - sph_sha256_close( &ctx, output ); - - return 1; -} -*/ - -/* -int scanhash_sha256t( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t block[16] __attribute__ ((aligned (64))); - uint32_t hash32[8] __attribute__ ((aligned (32))); - uint32_t initstate[8] __attribute__ ((aligned (32))); - uint32_t midstate[8] __attribute__ ((aligned (32))); - - - -// uint32_t edata[20] __attribute__((aligned(64))); -// uint32_t hash[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 1; - uint32_t n = first_nonce; - const int thr_id = mythr->id; - const bool bench = opt_benchmark; - __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - -// mm128_bswap32_80( edata, pdata ); -// sha256t_midstate( edata ); - - // initialize state - initstate[0] = 0x6A09E667; - initstate[1] = 0xBB67AE85; - initstate[2] = 0x3C6EF372; - initstate[3] = 0xA54FF53A; - initstate[4] = 0x510E527F; - initstate[5] = 0x9B05688C; - initstate[6] = 0x1F83D9AB; - initstate[7] = 0x5BE0CD19; - - // hash first 64 bytes of data - sha256_opt_transform( midstate, pdata, initstate ); - - do - { - // 1. final 16 bytes of data, with padding - memcpy( block, pdata + 16, 16 ); - block[ 4] = 0x80000000; - memset( block + 5, 0, 40 ); - block[15] = 80*8; // bit count - sha256_opt_transform( hash32, block, midstate ); - - // 2. 32 byte hash from 1. - memcpy( block, hash32, 32 ); - block[ 8] = 0x80000000; - memset( block + 9, 0, 24 ); - block[15] = 32*8; // bit count - sha256_opt_transform( hash32, block, initstate ); - - // 3. 32 byte hash from 2. 
- memcpy( block, hash32, 32 ); - sha256_opt_transform( hash32, block, initstate ); - - // byte swap final hash for testing - casti_m128i( hash32, 0 ) = - _mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 ); - casti_m128i( hash32, 1 ) = - _mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 ); - - if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) ) - submit_solution( work, hash32, mythr ); - n++; - pdata[19] = n; - } while ( (n < last_nonce) && !work_restart[thr_id].restart ); - - *hashes_done = n - first_nonce; - return 0; -} -*/ int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) @@ -149,7 +41,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, initstate[7] = 0x5BE0CD19; // hash first 64 bytes of data - sha256_opt_transform( midstate, pdata, initstate ); + sha256_opt_transform_le( midstate, pdata, initstate ); do { @@ -162,7 +54,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, memset( block0 + 5, 0, 40 ); memset( block1 + 5, 0, 40 ); block0[15] = block1[15] = 80*8; // bit count - sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate ); + sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate ); // 2. 32 byte hash from 1. memcpy( block0, hash0, 32 ); @@ -171,12 +63,12 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce, memset( block0 + 9, 0, 24 ); memset( block1 + 9, 0, 24 ); block0[15] = block1[15] = 32*8; // bit count - sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate ); + sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); // 3. 32 byte hash from 2. memcpy( block0, hash0, 32 ); memcpy( block1, hash1, 32 ); - sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate ); + sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate ); // byte swap final hash for testing casti_m128i( hash0, 0 ) = diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index e41a92b..7c96d2e 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -95,32 +95,36 @@ static const uint64_t K512[80] = // SHA-512 8 way 64 bit -#define CH8W(X, Y, Z) \ - _mm512_ternarylogic_epi64( X, Y, Z, 0xca ) +#define CH8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xca ) -#define MAJ8W(X, Y, Z) \ - _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 ) +#define MAJ8W( X, Y, Z ) _mm512_ternarylogic_epi64( X, Y, Z, 0xe8 ) -#define BSG8W_5_0(x) \ - mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) ) +#define BSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 28 ), \ + _mm512_ror_epi64( x, 34 ), \ + _mm512_ror_epi64( x, 39 ) ) -#define BSG8W_5_1(x) \ - mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) ) +#define BSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 14 ), \ + _mm512_ror_epi64( x, 18 ), \ + _mm512_ror_epi64( x, 41 ) ) -#define SSG8W_5_0(x) \ - mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) ) +#define SSG8W_5_0( x ) mm512_xor3( _mm512_ror_epi64( x, 1 ), \ + _mm512_ror_epi64( x, 8 ), \ + _mm512_srli_epi64( x, 7 ) ) -#define SSG8W_5_1(x) \ - mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) ) +#define SSG8W_5_1( x ) mm512_xor3( _mm512_ror_epi64( x, 19 ), \ + _mm512_ror_epi64( x, 61 ), \ + _mm512_srli_epi64( x, 6 ) ) -#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \ +#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \ do { \ - __m512i T1, T2; \ 
- __m512i K = _mm512_set1_epi64( K512[ i ] ); \ - T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \ - D = _mm512_add_epi64( D, T1 ); \ + __m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \ + __m512i T1 = BSG8W_5_1( E ); \ + __m512i T2 = BSG8W_5_0( A ); \ + T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \ + T1 = _mm512_add_epi64( T1, H ); \ + T2 = _mm512_add_epi64( T2, MAJ8W( A, B, C ) ); \ + T1 = _mm512_add_epi64( T1, T0 ); \ + D = _mm512_add_epi64( D, T1 ); \ H = _mm512_add_epi64( T1, T2 ); \ } while (0) @@ -267,16 +271,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) // SHA-512 4 way 64 bit - #define CH(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) -/* -#define MAJ(X, Y, Z) \ - _mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) -*/ - #define MAJ(X, Y, Z) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) @@ -289,15 +286,6 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst ) mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \ _mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 ) -/* -#define BSG5_0(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) ) - -#define BSG5_1(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) ) -*/ /* #define SSG5_0(x) \ _mm256_xor_si256( _mm256_xor_si256( \ @@ -325,94 +313,20 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 ) return _mm256_add_epi64( w0a, w1a ); } -/* -#define SSG512x2_0( w0, w1, i ) do \ -{ \ - __m256i X0a, X1a, X0b, X1b; \ - X0a = mm256_ror_64( W[i-15], 1 ); \ - X1a = mm256_ror_64( W[i-14], 1 ); \ - X0b = mm256_ror_64( W[i-15], 8 ); \ - X1b = mm256_ror_64( W[i-14], 8 ); \ - X0a = _mm256_xor_si256( X0a, X0b ); \ - X1a = _mm256_xor_si256( X1a, X1b ); \ - X0b = _mm256_srli_epi64( W[i-15], 7 ); \ - X1b = _mm256_srli_epi64( W[i-14], 7 ); \ - w0 = _mm256_xor_si256( X0a, X0b ); \ - w1 = _mm256_xor_si256( X1a, X1b ); \ -} while(0) - -#define SSG512x2_1( w0, w1, i ) do \ -{ \ - __m256i X0a, X1a, X0b, X1b; \ - X0a = mm256_ror_64( W[i-2],19 ); \ - X1a = mm256_ror_64( W[i-1],19 ); \ - X0b = mm256_ror_64( W[i-2],61 ); \ - X1b = mm256_ror_64( W[i-1],61 ); \ - X0a = _mm256_xor_si256( X0a, X0b ); \ - X1a = _mm256_xor_si256( X1a, X1b ); \ - X0b = _mm256_srli_epi64( W[i-2], 6 ); \ - X1b = _mm256_srli_epi64( W[i-1], 6 ); \ - w0 = _mm256_xor_si256( X0a, X0b ); \ - w1 = _mm256_xor_si256( X1a, X1b ); \ -} while(0) -*/ -/* -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ +#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \ do { \ - __m256i K = _mm256_set1_epi64x( K512[ i ] ); \ - __m256i T1 = mm256_ror_64( E, 23 ); \ - __m256i T2 = mm256_ror_64( A, 5 ); \ - __m256i T3 = _mm256_xor_si256( F, G ); \ - __m256i T4 = _mm256_or_si256( A, B ); \ - __m256i T5 = _mm256_and_si256( A, B ); \ - K = _mm256_add_epi64( K, W[i] ); \ - T1 = _mm256_xor_si256( T1, E ); \ - T2 = _mm256_xor_si256( T2, A ); \ - T3 = _mm256_and_si256( T3, E ); \ - T4 = _mm256_and_si256( T4, C ); \ - K = _mm256_add_epi64( H, K ); \ - T1 = mm256_ror_64( T1, 4 ); \ - T2 = mm256_ror_64( T2, 6 ); \ - T3 = _mm256_xor_si256( T3, G ); \ - T4 = _mm256_or_si256( T4, T5 ); \ - T1 = _mm256_xor_si256( T1, E ); \ - T2 = _mm256_xor_si256( T2, A ); \ - T1 = mm256_ror_64( T1, 14 ); \ - T2 = mm256_ror_64( T2, 28 ); \ - T1 = 
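 * Explanatory note (not part of the patch): the nested-rotate form retained
 * above relies on ror(ror(x,a),b) == ror(x,a+b), so
 * ror( (ror( (ror(x,23) ^ x), 4 ) ^ x), 14 ) == ror(x,41) ^ ror(x,18) ^ ror(x,14),
 * i.e. it computes the same value as the commented-out three-rotate BSG5_1
 * being removed here, just expressed as a chained rotate-and-xor.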
_mm256_add_epi64( T1, T3 ); \ - T2 = _mm256_add_epi64( T2, T4 ); \ - T1 = _mm256_add_epi64( T1, K ); \ - H = _mm256_add_epi64( T1, T2 ); \ - D = _mm256_add_epi64( D, T1 ); \ -} while (0) -*/ -/* -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ -do { \ - __m256i K = _mm256_add_epi64( W[i], _mm256_set1_epi64x( K512[ i ] ) ); \ - __m256i T1 = BSG5_1(E); \ - __m256i T2 = BSG5_0(A); \ - T1 = mm256_add4_64( T1, H, CH(E, F, G), K ); \ - T2 = _mm256_add_epi64( T2, MAJ(A, B, C) ); \ - D = _mm256_add_epi64( D, T1 ); \ - H = _mm256_add_epi64( T1, T2 ); \ -} while (0) -*/ - - -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ -do { \ - __m256i T1, T2; \ - __m256i K = _mm256_set1_epi64x( K512[ i ] ); \ - T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \ + __m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \ + __m256i T1 = BSG5_1( E ); \ + __m256i T2 = BSG5_0( A ); \ + T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \ + T1 = _mm256_add_epi64( T1, H ); \ + T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \ + T1 = _mm256_add_epi64( T1, T0 ); \ Y_xor_Z = X_xor_Y; \ - D = _mm256_add_epi64( D, T1 ); \ + D = _mm256_add_epi64( D, T1 ); \ H = _mm256_add_epi64( T1, T2 ); \ } while (0) - static void sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] ) { diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index 7e39954..a89fc8d 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -71,198 +71,6 @@ static const sph_u32 H256[8] = { * of the compression function. */ -#if defined(__SHA__) - -#include "simd-utils.h" - -static void sha2_round( const uint8_t input[], uint32_t state[8] ) -{ - __m128i STATE0, STATE1; - __m128i MSG, TMP, MASK; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_load_si128((__m128i*) &state[0]); - STATE1 = _mm_load_si128((__m128i*) &state[4]); - MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH - - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - MSG = _mm_load_si128((const __m128i*) (input+0)); - TMSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 4-7 - TMSG1 = _mm_load_si128((const __m128i*) (input+16)); - TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 8-11 - TMSG2 = _mm_load_si128((const __m128i*) (input+32)); - TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 12-15 - TMSG3 = _mm_load_si128((const __m128i*) (input+48)); - TMSG3 = 
_mm_shuffle_epi8(TMSG3, MASK); - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 16-19 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 20-23 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 24-27 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 28-31 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 32-35 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 36-39 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 40-43 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 44-47 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 
0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 48-51 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 52-55 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 56-59 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 60-63 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Add values back to state - STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); - STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); - - TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA - STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG - STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA - STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF - - // Save state - _mm_store_si128((__m128i*) &state[0], STATE0); - _mm_store_si128((__m128i*) &state[4], STATE1); -} - -#else // no SHA /* static const sph_u32 K[64] = { @@ -875,8 +683,24 @@ sha2_round(const unsigned char *data, sph_u32 r[8]) #undef SHA2_IN } -#endif // SHA else +void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ) +{ +memcpy( state_out, state_in, 32 ); +#define SHA2_IN(x) (data[x]) + SHA2_ROUND_BODY( SHA2_IN, state_out ); +#undef SHA2_IN +} +void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ) +{ +memcpy( state_out, state_in, 32 ); +#define SHA2_IN(x) sph_dec32be_aligned( data+(x) ) + SHA2_ROUND_BODY( SHA2_IN, state_out ); +#undef SHA2_IN + +} /* see sph_sha2.h */ void diff --git a/algo/sha/sph_sha2.h b/algo/sha/sph_sha2.h index e3a83eb..b76c3f4 100644 --- a/algo/sha/sph_sha2.h +++ b/algo/sha/sph_sha2.h @@ -207,6 +207,13 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]); void sph_sha256_full( void *dst, const void *data, size_t len ); +// These shouldn't be called directly, use sha256-hash.h generic functions +// sha256_transform_le & sha256_transform_be instead. 
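/* Usage sketch of the generic wrappers (illustrative only, not part of the
 * patch): compute a SHA-256 midstate over the first 64-byte block of an
 * 80-byte work header, mirroring how scanhash_sha256t calls
 * sha256_opt_transform_le.  The helper name and local IV table are
 * hypothetical; sha256_transform_le resolves to the SHA-NI or sph
 * implementation depending on the build.
 */
#include "algo/sha/sha256-hash.h"

static const uint32_t IV256[8] =
{
   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};

static void header_midstate( uint32_t midstate[8], const uint32_t pdata[20] )
{
   // pdata words are already in the order the LE transform expects,
   // so no byte swap is needed before hashing the first block.
   sha256_transform_le( midstate, pdata, IV256 );
}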
+void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); + +void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data, + const uint32_t *state_in ); #if SPH_64 diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index 1b77426..9c71459 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -20,8 +20,8 @@ static const uint32_t IV512[] = #define mm256_ror2x256hi_1x32( a, b ) \ - _mm256_blend_epi32( mm256_ror128_32( a ), \ - mm256_ror128_32( b ), 0x88 ) + _mm256_blend_epi32( mm256_shuflr128_32( a ), \ + mm256_shuflr128_32( b ), 0x88 ) #if defined(__VAES__) @@ -78,7 +78,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) { // round 1, 5, 9 - k00 = _mm256_xor_si256( k13, mm256_ror128_32( + k00 = _mm256_xor_si256( k13, mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ) ); if ( r == 0 ) @@ -88,7 +88,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); k01 = _mm256_xor_si256( k00, - mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) ); if ( r == 1 ) k01 = _mm256_xor_si256( k01, _mm256_set_epi32( @@ -97,25 +97,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); k02 = _mm256_xor_si256( k01, - mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); k03 = _mm256_xor_si256( k02, - mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); k10 = _mm256_xor_si256( k03, - mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); k11 = _mm256_xor_si256( k10, - mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); k12 = _mm256_xor_si256( k11, - mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); k13 = _mm256_xor_si256( k12, - mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) ); + mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) ); if ( r == 2 ) k13 = _mm256_xor_si256( k13, _mm256_set_epi32( @@ -151,31 +151,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 3, 7, 11 - k00 = _mm256_xor_si256( mm256_ror128_32( + k00 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror128_32( + k01 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror128_32( + k02 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror128_32( + k03 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p1 = _mm256_xor_si256( 
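// Note (explanatory, not part of the patch): mm256_ror128_32 ->
// mm256_shuflr128_32 is a pure rename, one of the "function name changes for
// clarity" in this release; the operation is unchanged and still rotates each
// 128-bit lane right by one 32-bit element (the same effect as
// _mm256_shuffle_epi32 with control 0x39), presumably renamed because it is
// implemented as a lane shuffle rather than a bit rotate.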
p1, x ); - k10 = _mm256_xor_si256( mm256_ror128_32( + k10 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror128_32( + k11 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( mm256_ror128_32( + k12 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ), k11 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror128_32( + k13 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); @@ -209,35 +209,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 13 - k00 = _mm256_xor_si256( mm256_ror128_32( + k00 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror128_32( + k01 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror128_32( + k02 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror128_32( + k03 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); - k10 = _mm256_xor_si256( mm256_ror128_32( + k10 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror128_32( + k11 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ); + k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) ); k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1, ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror128_32( + k13 = _mm256_xor_si256( mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index 2c93df9..0184ee8 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -12,8 +12,8 @@ static const uint32_t IV512[] = }; #define mm512_ror2x512hi_1x32( a, b ) \ - _mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \ - mm512_ror128_32( b ) ) + _mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \ + mm512_shuflr128_32( b ) ) static void c512_4way( shavite512_4way_context *ctx, const void *msg ) @@ -60,7 +60,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) { // round 1, 5, 9 - K0 = _mm512_xor_si512( K7, mm512_ror128_32( + K0 = _mm512_xor_si512( K7, mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ) ); if ( r == 0 ) @@ -69,33 +69,33 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); K1 = _mm512_xor_si512( K0, - 
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); if ( r == 1 ) - K1 = _mm512_xor_si512( K1, mm512_ror128_32( + K1 = _mm512_xor_si512( K1, mm512_shuflr128_32( _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); K2 = _mm512_xor_si512( K1, - mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); K3 = _mm512_xor_si512( K2, - mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P3 = _mm512_xor_si512( P3, X ); K4 = _mm512_xor_si512( K3, - mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); K5 = _mm512_xor_si512( K4, - mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); K6 = _mm512_xor_si512( K5, - mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); K7 = _mm512_xor_si512( K6, - mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); if ( r == 2 ) K7 = _mm512_xor_si512( K7, mm512_swap128_64( @@ -130,31 +130,31 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) // round 3, 7, 11 - K0 = _mm512_xor_si512( mm512_ror128_32( + K0 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero ); - K1 = _mm512_xor_si512( mm512_ror128_32( + K1 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( mm512_ror128_32( + K2 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( mm512_ror128_32( + K3 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P1 = _mm512_xor_si512( P1, X ); - K4 = _mm512_xor_si512( mm512_ror128_32( + K4 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero ); - K5 = _mm512_xor_si512( mm512_ror128_32( + K5 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = _mm512_xor_si512( mm512_ror128_32( + K6 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ), K5 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7 = _mm512_xor_si512( mm512_ror128_32( + K7 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); @@ -187,34 +187,34 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) // round 13 - K0 = 
_mm512_xor_si512( mm512_ror128_32( + K0 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); - K1 = _mm512_xor_si512( mm512_ror128_32( + K1 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - K2 = _mm512_xor_si512( mm512_ror128_32( + K2 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - K3 = _mm512_xor_si512( mm512_ror128_32( + K3 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P3 = _mm512_xor_si512( P3, X ); - K4 = _mm512_xor_si512( mm512_ror128_32( + K4 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); - K5 = _mm512_xor_si512( mm512_ror128_32( + K5 = _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); + K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - K7= _mm512_xor_si512( mm512_ror128_32( + K7= _mm512_xor_si512( mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index a593cf5..d8f6feb 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -74,15 +74,15 @@ static const sph_u32 IV512[] = { #endif +/* #if defined(__AVX2__) // 2 way version of above // a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] } - #define mm256_ror2x256hi_1x32( a, b ) \ _mm256_blend_epi32( mm256_ror256_1x32( a ), \ mm256_rol256_3x32( b ), 0x88 ) - #endif +*/ static void c512( sph_shavite_big_context *sc, const void *msg ) @@ -135,7 +135,7 @@ c512( sph_shavite_big_context *sc, const void *msg ) for ( r = 0; r < 3; r ++ ) { // round 1, 5, 9 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); if ( r == 0 ) @@ -144,7 +144,7 @@ c512( sph_shavite_big_context *sc, const void *msg ) x = _mm_xor_si128( p0, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); if ( r == 1 ) @@ -153,31 +153,31 @@ c512( sph_shavite_big_context *sc, const void *msg ) x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = 
mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); if ( r == 2 ) @@ -222,38 +222,38 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 3, 7, 11 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p2, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p1 = _mm_xor_si128( p1, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p0, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); @@ -295,39 +295,39 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round 13 - k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, zero ) ); + k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p0, k00 ); x = _mm_aesenc_si128( x, zero ); - k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, zero ) ); + k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); x = _mm_aesenc_si128( x, zero ); - k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, zero ) ); + k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); x = _mm_aesenc_si128( x, zero ); - k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, zero ) ); + k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); x = _mm_aesenc_si128( x, zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, zero ) ); + k10 = 
mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); x = _mm_aesenc_si128( x, zero ); - k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, zero ) ); + k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); x = _mm_aesenc_si128( x, zero ); - k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, zero ) ); + k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); x = _mm_xor_si128( x, k12 ); x = _mm_aesenc_si128( x, zero ); - k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, zero ) ); + k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); x = _mm_aesenc_si128( x, zero ); diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index a12af43..5a7cdbd 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -3,7 +3,7 @@ #include #include "skein-hash-4way.h" #include "algo/sha/sha-hash-4way.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #if defined (SKEIN_8WAY) @@ -87,7 +87,6 @@ void skeinhash_4way( void *state, const void *input ) uint32_t hash1[16] __attribute__ ((aligned (64))); uint32_t hash2[16] __attribute__ ((aligned (64))); uint32_t hash3[16] __attribute__ ((aligned (64))); - sph_sha256_context ctx_sha256; #else uint32_t vhash32[16*4] __attribute__ ((aligned (64))); sha256_4way_context ctx_sha256; @@ -98,18 +97,12 @@ void skeinhash_4way( void *state, const void *input ) #if defined(__SHA__) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash0, 64 ); - sph_sha256_close( &ctx_sha256, hash0 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash1, 64 ); - sph_sha256_close( &ctx_sha256, hash1 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash2, 64 ); - sph_sha256_close( &ctx_sha256, hash2 ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash3, 64 ); - sph_sha256_close( &ctx_sha256, hash3 ); + + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); + intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 ); #else diff --git a/algo/skein/skein.c b/algo/skein/skein.c index 91eb325..be9bb82 100644 --- a/algo/skein/skein.c +++ b/algo/skein/skein.c @@ -5,21 +5,18 @@ #include #include #include "sph_skein.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" void skeinhash(void *state, const void *input) { uint32_t hash[16] __attribute__ ((aligned (64))); sph_skein512_context ctx_skein; - sph_sha256_context ctx_sha256; sph_skein512_init( &ctx_skein ); sph_skein512( &ctx_skein, input, 80 ); sph_skein512_close( &ctx_skein, hash ); - sph_sha256_init( &ctx_sha256 ); - sph_sha256( &ctx_sha256, hash, 64 ); - sph_sha256_close( &ctx_sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy(state, hash, 32); } @@ -27,8 +24,8 @@ void skeinhash(void *state, const void *input) int scanhash_skein( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t hash64[8] __attribute__ ((aligned (64))); uint32_t endiandata[20] __attribute__ ((aligned (64))); const uint32_t Htarg = ptarget[7]; @@ -36,7 +33,7 @@ int 
scanhash_skein( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; int thr_id = mythr->id; // thr_id arg is deprecated - swab32_array( endiandata, pdata, 20 ); + swab32_array( endiandata, pdata, 20 ); do { be32enc(&endiandata[19], n); diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c index 0d971f2..8880b45 100644 --- a/algo/verthash/Verthash.c +++ b/algo/verthash/Verthash.c @@ -176,12 +176,6 @@ static void rotate_indexes( uint32_t *p ) */ } #endif - -static inline uint32_t rotl32( uint32_t a, size_t r ) -{ - return ( a << r ) | ( a >> (32-r) ); -} - // Vectorized and targetted version of fnv1a #if defined (__AVX2__) @@ -232,7 +226,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ { \ const uint32_t *blob_off = blob + \ - ( ( fnv1a( rotl32( subset[i], r ), accumulator ) % mdiv ) \ + ( ( fnv1a( rol32( subset[i], r ), accumulator ) % mdiv ) \ * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ) ); \ UPDATE_ACCUMULATOR; \ MULXOR; \ diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index a010344..ec808f6 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -1,5 +1,5 @@ #include "algo-gate-api.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "Verthash.h" #include "tiny_sha3/sha3-4way.h" @@ -140,7 +140,7 @@ bool register_verthash_algo( algo_gate_t* gate ) uint8_t vhDataFileHash[32] = { 0 }; applog( LOG_NOTICE, "Verifying Verthash data" ); - sph_sha256_full( vhDataFileHash, verthashInfo.data, + sha256_full( vhDataFileHash, verthashInfo.data, verthashInfo.dataSize ); if ( memcmp( vhDataFileHash, verthashDatFileHash_bytes, sizeof(verthashDatFileHash_bytes) ) == 0 ) diff --git a/algo/whirlpool/whirlpool.c b/algo/whirlpool/whirlpool.c index 59fcf71..1c6b688 100644 --- a/algo/whirlpool/whirlpool.c +++ b/algo/whirlpool/whirlpool.c @@ -82,7 +82,7 @@ int scanhash_whirlpool( struct work* work, uint32_t max_nonce, be32enc(&endiandata[19], n ); whirlpool_hash(vhash, endiandata); - if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + if (vhash[7] <= Htarg && fulltest(vhash, ptarget) & ! 
opt_benchmark ) submit_solution( work, vhash, mythr ); } while ( n < max_nonce && !work_restart[thr_id].restart); diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 2973952..8d4fb05 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -52,10 +52,10 @@ void x16r_8way_prehash( void *vdata, void *pdata ) break; case CUBEHASH: mm128_bswap32_80( edata, pdata ); - cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); - cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); - intrlv_8x64( vdata, edata, edata, edata, edata, - edata, edata, edata, edata, 640 ); + intrlv_4x128( vdata2, edata, edata, edata, edata, 640 ); + cube_4way_init( &x16r_ctx.cube, 512, 16, 32 ); + cube_4way_update( &x16r_ctx.cube, vdata2, 64 ); + rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 ); break; case HAMSI: mm512_bswap32_intrlv80_8x64( vdata, pdata ); @@ -207,15 +207,15 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) case LUFFA: if ( i == 0 ) { - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - luffa_4way_update_close( &ctx.luffa, vhash, - vhash + (16<<2), 16 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - luffa_4way_update_close( &ctx.luffa, vhash, - vhash + (16<<2), 16 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } else { @@ -230,56 +230,24 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid ) case CUBEHASH: if ( i == 0 ) { - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)in0 + 64, 16 ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_update_close( &ctx.cube, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, - (const byte*)in1 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, - (const byte*)in2 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, - (const byte*)in3 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash4, - (const byte*)in4 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash5, - (const byte*)in5 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash6, - (const byte*)in6 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash7, - (const byte*)in7 + 64, 16 ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_update_close( &ctx.cube, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } else { - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) 
hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash4, - (const byte*)in4, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash5, - (const byte*)in5, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash6, - (const byte*)in6, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash7, - (const byte*)in7, size ); + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); } break; case SHAVITE: @@ -556,9 +524,10 @@ void x16r_4way_prehash( void *vdata, void *pdata ) break; case CUBEHASH: mm128_bswap32_80( edata, pdata ); - cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); - cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); - intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + intrlv_2x128( vdata2, edata, edata, 640 ); + cube_2way_init( &x16r_ctx.cube, 512, 16, 32 ); + cube_2way_update( &x16r_ctx.cube, vdata2, 64 ); + rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 ); break; case HAMSI: mm256_bswap32_intrlv80_4x64( vdata, pdata ); @@ -680,13 +649,13 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) case LUFFA: if ( i == 0 ) { - intrlv_2x128( vhash, hash0, hash1, 640 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); - dintrlv_2x128_512( hash0, hash1, vhash ); - intrlv_2x128( vhash, hash2, hash3, 640 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); - dintrlv_2x128_512( hash2, hash3, vhash ); + intrlv_2x128( vhash, hash0, hash1, 640 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, hash2, hash3, 640 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash2, hash3, vhash ); } else { @@ -701,32 +670,24 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) case CUBEHASH: if ( i == 0 ) { - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)in0 + 64, 16 ); + intrlv_2x128( vhash, in0, in1, size<<3 ); + cube_2way_update_close( &ctx.cube, vhash, + vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash0, hash1, vhash ); memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2 + 64, 16 ); - memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3 + 64, 16 ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + cube_2way_update_close( &ctx.cube, vhash, + vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash2, hash3, vhash ); } else { - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 
512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + intrlv_2x128( vhash, in0, in1, size<<3 ); + cube_2way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + cube_2way_full( &ctx.cube, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); } break; case SHAVITE: diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 09315f6..3a94344 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -1,4 +1,5 @@ #include "x16r-gate.h" +#include "algo/sha/sha256d.h" __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 }; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 748b7fa..76ca5e7 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -37,6 +37,7 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -115,7 +116,7 @@ union _x16r_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cubehashParam cube; + cube_4way_context cube; simd_4way_context simd; hamsi512_8way_context hamsi; hashState_fugue fugue; @@ -164,8 +165,8 @@ union _x16r_4way_context_overlay jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; + cube_2way_context cube; hashState_luffa luffa1; - cubehashParam cube; simd_2way_context simd; hamsi512_4way_context hamsi; hashState_fugue fugue; diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index de2dbe6..2f27116 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -13,7 +13,7 @@ #include "algo/gost/sph_gost.h" #include "algo/lyra2/lyra2.h" #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif #if defined (X21S_8WAY) @@ -208,9 +208,7 @@ union _x21s_4way_context_overlay haval256_5_4way_context haval; sph_tiger_context tiger; sph_gost512_context gost; -#if defined(__SHA__) - sph_sha256_context sha256; -#else +#if !defined(__SHA__) sha256_4way_context sha256; #endif } __attribute__ ((aligned (64))); @@ -275,18 +273,10 @@ int x21s_4way_hash( void* output, const void* input, int thrid ) #if defined(__SHA__) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); + sha256_full( output, hash0, 64 ); + sha256_full( output+32, hash1, 64 ); + sha256_full( output+64, hash2, 64 ); + sha256_full( output+96, hash3, 64 ); #else diff --git a/algo/x16/x21s.c b/algo/x16/x21s.c index b81c07e..96782e2 100644 --- a/algo/x16/x21s.c +++ b/algo/x16/x21s.c @@ -8,7 +8,7 @@ #include #include #include -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/gost/sph_gost.h" @@ -23,7 +23,7 @@ union _x21s_context_overlay sph_haval256_5_context haval; 
sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; + sha256_context sha256; }; typedef union _x21s_context_overlay x21s_context_overlay; @@ -50,9 +50,7 @@ int x21s_hash( void* output, const void* input, int thrid ) sph_gost512 ( &ctx.gost, (const void*) hash, 64 ); sph_gost512_close( &ctx.gost, (void*) hash ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash, 64 ); - sph_sha256_close( &ctx.sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy( output, hash, 32 ); diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index fcff0b6..1902a2d 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -37,7 +37,8 @@ union _x17_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; +// cube_4way_context cube; + cube_4way_2buf_context cube; #if defined(__VAES__) groestl512_4way_context groestl; shavite512_4way_context shavite; @@ -119,8 +120,10 @@ int x17_8way_hash( void *state, const void *input, int thr_id ) luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 ); luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); - cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); + cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 ); + +// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); +// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); #if defined(__VAES__) diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c index 94b34cc..5acf3de 100644 --- a/algo/x22/x22i-4way.c +++ b/algo/x22/x22i-4way.c @@ -28,7 +28,7 @@ #include "algo/echo/echo-hash-4way.h" #endif #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif #if defined(X22I_8WAY) @@ -51,9 +51,7 @@ union _x22i_8way_ctx_overlay haval256_5_8way_context haval; sph_tiger_context tiger; sph_gost512_context gost; -#if defined(X22I_8WAY_SHA) - sph_sha256_context sha256; -#else +#if !defined(X22I_8WAY_SHA) sha256_8way_context sha256; #endif #if defined(__VAES__) @@ -391,30 +389,14 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) #if defined(X22I_8WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash4, 64 ); - sph_sha256_close( &ctx.sha256, output+128 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash5, 64 ); - sph_sha256_close( &ctx.sha256, output+160 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash6, 64 ); - sph_sha256_close( &ctx.sha256, output+192 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash7, 64 ); - sph_sha256_close( &ctx.sha256, output+224 ); + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); + sha256_full( hash4, hash4, 64 ); + sha256_full( hash5, hash5, 64 ); + sha256_full( hash6, hash6, 64 ); + sha256_full( hash7, hash7, 64 ); #else @@ -551,9 +533,7 @@ union _x22i_4way_ctx_overlay haval256_5_4way_context haval; sph_tiger_context tiger; 
sph_gost512_context gost; -#if defined(X22I_4WAY_SHA) - sph_sha256_context sha256; -#else +#if !defined(X22I_4WAY_SHA) sha256_4way_context sha256; #endif }; @@ -757,18 +737,10 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) #if defined(X22I_4WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0, 64 ); - sph_sha256_close( &ctx.sha256, output ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1, 64 ); - sph_sha256_close( &ctx.sha256, output+32 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2, 64 ); - sph_sha256_close( &ctx.sha256, output+64 ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3, 64 ); - sph_sha256_close( &ctx.sha256, output+96 ); + sha256_full( hash0, hash0, 64 ); + sha256_full( hash1, hash1, 64 ); + sha256_full( hash2, hash2, 64 ); + sha256_full( hash3, hash3, 64 ); #else diff --git a/algo/x22/x22i.c b/algo/x22/x22i.c index 759e44c..d63ddf2 100644 --- a/algo/x22/x22i.c +++ b/algo/x22/x22i.c @@ -24,6 +24,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -57,7 +58,6 @@ union _x22i_context_overlay sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; }; typedef union _x22i_context_overlay x22i_context_overlay; @@ -172,9 +172,7 @@ int x22i_hash( void *output, const void *input, int thrid ) sph_gost512 (&ctx.gost, (const void*) hash, 64); sph_gost512_close(&ctx.gost, (void*) hash); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash, 64 ); - sph_sha256_close( &ctx.sha256, hash ); + sha256_full( hash, hash, 64 ); memcpy(output, hash, 32); diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index 86f5699..ff2888e 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -33,7 +33,7 @@ #include "algo/echo/echo-hash-4way.h" #endif #if defined(__SHA__) - #include "algo/sha/sph_sha2.h" + #include "algo/sha/sha256-hash.h" #endif void x25x_shuffle( void *hash ) @@ -84,7 +84,7 @@ union _x25x_8way_ctx_overlay sph_tiger_context tiger; sph_gost512_context gost; #if defined(X25X_8WAY_SHA) - sph_sha256_context sha256; + sha256_context sha256; #else sha256_8way_context sha256; #endif @@ -447,31 +447,15 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) #if defined(X25X_8WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0[20], 64 ); - sph_sha256_close( &ctx.sha256, hash0[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1[20], 64 ); - sph_sha256_close( &ctx.sha256, hash1[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2[20], 64 ); - sph_sha256_close( &ctx.sha256, hash2[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3[20], 64 ); - sph_sha256_close( &ctx.sha256, hash3[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash4[20], 64 ); - sph_sha256_close( &ctx.sha256, hash4[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash5[20], 64 ); - sph_sha256_close( &ctx.sha256, hash5[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash6[20], 64 ); - sph_sha256_close( &ctx.sha256, hash6[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash7[20], 64 ); - sph_sha256_close( &ctx.sha256, hash7[21] ); - + sha256_full( hash0[21], hash0[20], 64 ); + 
sha256_full( hash1[21], hash1[20], 64 ); + sha256_full( hash2[21], hash2[20], 64 ); + sha256_full( hash3[21], hash3[20], 64 ); + sha256_full( hash4[21], hash4[20], 64 ); + sha256_full( hash5[21], hash5[20], 64 ); + sha256_full( hash6[21], hash6[20], 64 ); + sha256_full( hash7[21], hash7[20], 64 ); + intrlv_8x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21], hash4[21], hash5[21], hash6[21], hash7[21] ); @@ -646,7 +630,7 @@ union _x25x_4way_ctx_overlay sph_tiger_context tiger; sph_gost512_context gost; #if defined(X25X_4WAY_SHA) - sph_sha256_context sha256; + sha256_context sha256; #else sha256_4way_context sha256; #endif @@ -848,18 +832,10 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) #if defined(X25X_4WAY_SHA) - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash0[20], 64 ); - sph_sha256_close( &ctx.sha256, hash0[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash1[20], 64 ); - sph_sha256_close( &ctx.sha256, hash1[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash2[20], 64 ); - sph_sha256_close( &ctx.sha256, hash2[21] ); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, hash3[20], 64 ); - sph_sha256_close( &ctx.sha256, hash3[21] ); + sha256_full( hash0[21], hash0[20], 64 ); + sha256_full( hash1[21], hash1[20], 64 ); + sha256_full( hash2[21], hash2[20], 64 ); + sha256_full( hash3[21], hash3[20], 64 ); intrlv_4x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21] ); diff --git a/algo/x22/x25x.c b/algo/x22/x25x.c index 42e7eda..aade6e2 100644 --- a/algo/x22/x25x.c +++ b/algo/x22/x25x.c @@ -23,7 +23,7 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -60,7 +60,7 @@ union _x25x_context_overlay sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - sph_sha256_context sha256; + sha256_context sha256; sph_panama_context panama; blake2s_state blake2s; }; @@ -174,9 +174,7 @@ int x25x_hash( void *output, const void *input, int thrid ) sph_gost512 (&ctx.gost, (const void*) &hash[19], 64); sph_gost512_close(&ctx.gost, (void*) &hash[20]); - sph_sha256_init( &ctx.sha256 ); - sph_sha256( &ctx.sha256, &hash[20], 64 ); - sph_sha256_close( &ctx.sha256, &hash[21] ); + sha256_full( &hash[21], &hash[20], 64 ); sph_panama_init(&ctx.panama); sph_panama (&ctx.panama, (const void*) &hash[21], 64 ); diff --git a/algo/yespower/crypto/blake2b-yp.c b/algo/yespower/crypto/blake2b-yp.c index 407d2dd..dc6eee6 100644 --- a/algo/yespower/crypto/blake2b-yp.c +++ b/algo/yespower/crypto/blake2b-yp.c @@ -35,9 +35,11 @@ #include "blake2b-yp.h" // Cyclic right rotation. -#ifndef ROTR64 -#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) -#endif +//#ifndef ROTR64 +//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) +//#endif + +#define ROTR64(x, y) ror64( x, y ) // Little-endian byte access. 
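Note on the renames running through the SHAvite hunks above (mm256_ror128_32 -> mm256_shuflr128_32, mm128_ror_1x32 -> mm128_shuflr_32) and the ROTR64 -> ror64 change in blake2b-yp.c: they separate element shuffles from true bit rotations. A minimal sketch of the two kinds of helper, assuming the usual pshufd / shift-or definitions; the real macros live in simd-utils and may differ in detail:

   #include <stdint.h>
   #include <immintrin.h>

   // Bit rotation: rotate a 64-bit word right by c bits (0 < c < 64).
   static inline uint64_t ror64( uint64_t x, unsigned c )
   {
      return ( x >> c ) | ( x << ( 64 - c ) );
   }

   // Element "rotation": move the four 32-bit lanes of a 128-bit vector one
   // position toward lower indexes; no bits inside any lane change.
   static inline __m128i mm128_shuflr_32( __m128i v )
   {
      return _mm_shuffle_epi32( v, 0x39 );   // result = { v[1], v[2], v[3], v[0] }
   }

   #if defined(__AVX2__)
   // The same element rotation applied independently to each 128-bit lane
   // of a 256-bit vector, as used in the 2-way SHAvite code.
   static inline __m256i mm256_shuflr128_32( __m256i v )
   {
      return _mm256_shuffle_epi32( v, 0x39 );
   }
   #endif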
#define B2B_GET64(p) \ diff --git a/algo/yespower/yescrypt-r8g.c b/algo/yespower/yescrypt-r8g.c index 27d1fd8..b278c36 100644 --- a/algo/yespower/yescrypt-r8g.c +++ b/algo/yespower/yescrypt-r8g.c @@ -52,8 +52,8 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce, endiandata[19] = n; // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); + sha256_ctx_init( &sha256_prehash_ctx ); + sha256_update( &sha256_prehash_ctx, endiandata, 64 ); do { yespower_tls( (unsigned char *)endiandata, params.perslen, diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index 8c9a944..8968037 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -27,14 +27,11 @@ * coin. */ #include "yespower.h" - #include "algo-gate-api.h" yespower_params_t yespower_params; -//SHA256_CTX sha256_prehash_ctx; -__thread sph_sha256_context sha256_prehash_ctx; -//__thread SHA256_CTX sha256_prehash_ctx; +__thread sha256_context sha256_prehash_ctx; // YESPOWER @@ -61,8 +58,8 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce, endiandata[19] = n; // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); + sha256_ctx_init( &sha256_prehash_ctx ); + sha256_update( &sha256_prehash_ctx, endiandata, 64 ); do { if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) ) @@ -101,10 +98,6 @@ int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce, be32enc( &endiandata[k], pdata[k] ); endiandata[19] = n; - // do sha256 prehash - sph_sha256_init( &sha256_prehash_ctx ); - sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); - do { if (yespower_b2b_hash( (char*) endiandata, (char*) vhash, 80, thr_id ) ) if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark ) diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index fd16c24..5e725af 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -203,17 +203,17 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, ARX(X0, X3, X2, 18) \ /* Rearrange data */ \ X1 = _mm_shuffle_epi32(X1, 0x93); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x39); \ /* Operate on "rows" */ \ ARX(X3, X0, X1, 7) \ ARX(X2, X3, X0, 9) \ ARX(X1, X2, X3, 13) \ ARX(X0, X1, X2, 18) \ /* Rearrange data */ \ + X3 = _mm_shuffle_epi32(X3, 0x93); \ X1 = _mm_shuffle_epi32(X1, 0x39); \ - X2 = _mm_shuffle_epi32(X2, 0x4E); \ - X3 = _mm_shuffle_epi32(X3, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); /** * Apply the Salsa20 core to the block provided in (X0 ... X3). 
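All of the sph_sha256 init/update/close triples removed in the skein, verthash, x21s, x22i and x25x hunks above collapse into a single sha256_full() call, while the yespower prehash keeps a streaming context. A rough sketch of the interface this implies from sha256-hash.h; the context layout and exact prototypes are assumptions, only the call pattern is taken from the hunks:

   #include <stddef.h>
   #include <stdint.h>

   typedef struct
   {
      uint8_t  buf[64];      // partial block
      uint32_t state[8];     // chaining value
      uint64_t count;        // total bytes processed
   } sha256_context;

   void sha256_ctx_init( sha256_context *ctx );
   void sha256_update( sha256_context *ctx, const void *data, size_t len );
   void sha256_final( sha256_context *ctx, void *hash );

   // One-shot convenience: equivalent to init + update + final, replacing the
   // old sph_sha256_init / sph_sha256 / sph_sha256_close sequence.
   static inline void sha256_full( void *hash, const void *data, size_t len )
   {
      sha256_context ctx;
      sha256_ctx_init( &ctx );
      sha256_update( &ctx, data, len );
      sha256_final( &ctx, hash );
   }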
@@ -1095,7 +1095,7 @@ int yespower(yespower_local_t *local, salsa20_blk_t *V, *XY; pwxform_ctx_t ctx; uint8_t sha256[32]; - sph_sha256_context sha256_ctx; + sha256_context sha256_ctx; /* Sanity-check parameters */ if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0) @@ -1138,10 +1138,9 @@ int yespower(yespower_local_t *local, // copy prehash, do tail memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx ); - - sph_sha256( &sha256_ctx, src+64, srclen-64 ); - sph_sha256_close( &sha256_ctx, sha256 ); - + sha256_update( &sha256_ctx, src+64, srclen-64 ); + sha256_final( &sha256_ctx, sha256 ); + if ( version == YESPOWER_0_5 ) { PBKDF2_SHA256( sha256, sizeof(sha256), src, srclen, 1, B, B_size ); @@ -1186,7 +1185,9 @@ int yespower(yespower_local_t *local, if ( work_restart[thrid].restart ) return 0; smix_1_0( B, r, N, V, XY, &ctx ); - + + if ( work_restart[thrid].restart ) return 0; + HMAC_SHA256_Buf( B + B_size - 64, 64, sha256, sizeof(sha256), (uint8_t *)dst ); } diff --git a/algo/yespower/yespower.h b/algo/yespower/yespower.h index 260322a..aa19004 100644 --- a/algo/yespower/yespower.h +++ b/algo/yespower/yespower.h @@ -34,7 +34,7 @@ #include /* for size_t */ #include "miner.h" #include "simd-utils.h" -#include "algo/sha/sph_sha2.h" +#include "algo/sha/sha256-hash.h" #ifdef __cplusplus extern "C" { @@ -78,9 +78,7 @@ typedef struct { extern yespower_params_t yespower_params; -//SHA256_CTX sha256_prehash_ctx; -extern __thread sph_sha256_context sha256_prehash_ctx; -//extern __thread SHA256_CTX sha256_prehash_ctx; +extern __thread sha256_context sha256_prehash_ctx; /** * yespower_init_local(local): diff --git a/build-allarch.sh b/build-allarch.sh index c4d9ffd..5fa38f6 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,7 +4,7 @@ # during develpment. However the information contained may provide compilation # tips to users. -rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null # Icelake AVX512 SHA VAES make distclean || echo clean diff --git a/configure b/configure index 7430186..db3efc9 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.0. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.17.1' -PACKAGE_STRING='cpuminer-opt 3.17.1' +PACKAGE_VERSION='3.18.0' +PACKAGE_STRING='cpuminer-opt 3.18.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.18.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
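The yespower hunks above reuse a per-thread SHA-256 midstate: scanhash absorbs the first 64 header bytes once, yespower() copies that context for every nonce and hashes only the 16-byte tail, and the added work_restart check abandons a stale job between smix passes instead of finishing it. A hedged sketch of the per-nonce part, built on the streaming API sketched above; names follow the hunks, the wrapper itself is illustrative:

   #include <stdint.h>
   #include <string.h>

   extern __thread sha256_context sha256_prehash_ctx;  // first 64 bytes absorbed

   // Finish the SHA-256 of an 80-byte header whose constant prefix is already
   // in sha256_prehash_ctx; only the nonce-bearing tail is hashed per attempt.
   static void sha256_from_prehash( uint8_t hash[32], const uint8_t *src,
                                    size_t srclen )
   {
      sha256_context ctx;
      memcpy( &ctx, &sha256_prehash_ctx, sizeof ctx );  // resume the midstate
      sha256_update( &ctx, src + 64, srclen - 64 );     // 16 bytes for an 80 byte header
      sha256_final( &ctx, hash );
   }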
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.18.0:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.17.1 +cpuminer-opt configure 3.18.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.17.1, which was +It was created by cpuminer-opt $as_me 3.18.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.17.1' + VERSION='3.18.0' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.17.1, which was +This file was extended by cpuminer-opt $as_me 3.18.0, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.17.1 +cpuminer-opt config.status 3.18.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 332d1e6..fbe5a9b 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.17.1]) +AC_INIT([cpuminer-opt], [3.18.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index e9c01fe..c889538 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -38,6 +38,7 @@ #include #include #include "sysinfos.c" +#include "algo/sha/sha256d.h" #ifdef WIN32 #include @@ -94,6 +95,7 @@ bool have_gbt = true; bool allow_getwork = true; bool want_stratum = true; // pretty useless bool have_stratum = false; +bool stratum_down = true; bool allow_mininginfo = true; bool use_syslog = false; bool use_colors = true; @@ -166,6 +168,8 @@ uint32_t stale_share_count = 0; uint32_t solved_block_count = 0; double *thr_hashrates; double global_hashrate = 0.; +double total_hashes = 0.; +struct timeval total_hashes_time = {0,0}; double stratum_diff = 0.; double net_diff = 0.; double net_hashrate = 0.; @@ -1001,6 +1005,7 @@ struct share_stats_t double share_diff; double stratum_diff; double target_diff; + uint32_t height; char job_id[32]; }; @@ -1080,13 +1085,14 @@ void report_summary_log( bool force ) pthread_mutex_unlock( &stats_lock ); timeval_subtract( &et, &now, &start_time ); - timeval_subtract( &uptime, &now, &session_start ); + timeval_subtract( &uptime, &total_hashes_time, &session_start ); double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6; - double ghrate = global_hashrate; + double ghrate = safe_div( total_hashes, (double)uptime.tv_sec, 0. ); double target_diff = exp32 * last_targetdiff; double shrate = safe_div( target_diff * (double)(accepts), share_time, 0. ); +// global_hashrate = ghrate; double sess_hrate = safe_div( exp32 * norm_diff_sum, (double)uptime.tv_sec, 0. 
); double submit_rate = safe_div( (double)submits * 60., share_time, 0. ); @@ -1134,29 +1140,38 @@ void report_summary_log( bool force ) 100. * safe_div( (double)accepted_share_count, (double)submitted_share_count, 0. ) ); if ( stale_share_count ) - applog2( LOG_INFO, "Stale %7d %7d %5.1f%%", + { + int prio = stales ? LOG_MINR : LOG_INFO; + applog2( prio, "Stale %7d %7d %5.1f%%", stales, stale_share_count, 100. * safe_div( (double)stale_share_count, (double)submitted_share_count, 0. ) ); + } if ( rejected_share_count ) - applog2( LOG_INFO, "Rejected %7d %7d %5.1f%%", + { + int prio = rejects ? LOG_ERR : LOG_INFO; + applog2( prio, "Rejected %7d %7d %5.1f%%", rejects, rejected_share_count, 100. * safe_div( (double)rejected_share_count, (double)submitted_share_count, 0. ) ); + } if ( solved_block_count ) - applog2( LOG_INFO,"Blocks Solved %7d %7d", + { + int prio = solved ? LOG_PINK : LOG_INFO; + applog2( prio, "Blocks Solved %7d %7d", solved, solved_block_count ); + } applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g", - highest_share, lowest_share ); + highest_share, lowest_share ); int mismatch = submitted_share_count - ( accepted_share_count + stale_share_count + rejected_share_count ); if ( mismatch ) { if ( mismatch != 1 ) - applog(LOG_WARNING,"Share count mismatch: %d, stats may be inaccurate", mismatch ); + applog2(LOG_MINR, "Count mismatch: %d, stats may be inaccurate", mismatch ); else - applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" ); + applog2(LOG_INFO, CL_LBL "Count mismatch, submitted share may still be pending" CL_N ); } } @@ -1278,17 +1293,17 @@ static int share_result( int result, struct work *work, if ( use_colors ) { - bcol = acol = scol = rcol = CL_WHT; + bcol = acol = scol = rcol = CL_N; if ( likely( result ) ) { - acol = CL_WHT CL_GRN; - if ( unlikely( solved ) ) bcol = CL_WHT CL_MAG; + acol = CL_LGR; + if ( unlikely( solved ) ) bcol = CL_LMA; } - else if ( stale ) scol = CL_WHT CL_YL2; - else rcol = CL_WHT CL_RED; + else if ( stale ) scol = CL_YL2; + else rcol = CL_LRD; } - applog( LOG_NOTICE, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)", + applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)", my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol, bres, share_time, latency ); @@ -1296,8 +1311,7 @@ static int share_result( int result, struct work *work, { if ( have_stratum ) applog2( LOG_INFO, "Diff %.5g, Block %d, Job %s", - my_stats.share_diff, stratum.block_height, - my_stats.job_id ); + my_stats.share_diff, my_stats.height, my_stats.job_id ); else applog2( LOG_INFO, "Diff %.5g, Block %d", my_stats.share_diff, work ? 
work->height : last_block_height ); @@ -1308,7 +1322,7 @@ static int share_result( int result, struct work *work, uint32_t str[8]; uint32_t *targ; - if ( reason ) applog( LOG_WARNING, "Reject reason: %s", reason ); + if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason ); diff_to_hash( str, my_stats.share_diff ); applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6], @@ -1861,6 +1875,7 @@ static void update_submit_stats( struct work *work, const void *hash ) share_stats[ s_put_ptr ].net_diff = net_diff; share_stats[ s_put_ptr ].stratum_diff = stratum_diff; share_stats[ s_put_ptr ].target_diff = work->targetdiff; + share_stats[ s_put_ptr ].height = work->height; if ( have_stratum ) strncpy( share_stats[ s_put_ptr ].job_id, work->job_id, 30 ); s_put_ptr = stats_ptr_incr( s_put_ptr ); @@ -1871,6 +1886,10 @@ static void update_submit_stats( struct work *work, const void *hash ) bool submit_solution( struct work *work, const void *hash, struct thr_info *thr ) { + // Job went stale during hashing of a valid share. + if ( !opt_quiet && work_restart[ thr->id ].restart ) + applog( LOG_INFO, CL_LBL "Share may be stale, submitting anyway..." CL_N ); + work->sharediff = hash_to_diff( hash ); if ( likely( submit_work( thr, work ) ) ) { @@ -1887,11 +1906,11 @@ bool submit_solution( struct work *work, const void *hash, if ( !opt_quiet ) { if ( have_stratum ) - applog( LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Job %s", + applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Job %s", submitted_share_count, work->sharediff, work->height, work->job_id ); else - applog( LOG_NOTICE, "%d Submitted Diff %.5g, Block %d, Ntime %08x", + applog( LOG_INFO, "%d Submitted Diff %.5g, Block %d, Ntime %08x", submitted_share_count, work->sharediff, work->height, work->data[ algo_gate.ntime_index ] ); } @@ -2048,7 +2067,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_rwlock_wrlock( &g_work_lock ); pthread_mutex_lock( &sctx->work_lock ); - new_job = sctx->new_job; + new_job = sctx->new_job; // otherwise just increment extranonce2 sctx->new_job = false; free( g_work->job_id ); @@ -2084,6 +2103,14 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_mutex_unlock( &stats_lock ); + if ( !opt_quiet ) + { + int mismatch = submitted_share_count + - ( accepted_share_count + stale_share_count + rejected_share_count ); + if ( mismatch ) + applog(LOG_INFO, CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count ); + } + if ( stratum_diff != sctx->job.diff ) applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s", sctx->job.diff, sctx->block_height, g_work->job_id ); @@ -2264,19 +2291,29 @@ static void *miner_thread( void *userdata ) } // wait for stratum to send first job - if ( have_stratum ) while ( unlikely( !g_work.job_id ) ) sleep(1); + if ( have_stratum ) while ( unlikely( stratum_down ) ) + { + if ( opt_debug ) + applog( LOG_INFO, "Thread %d waiting for first job", thr_id ); + sleep(1); + } + // nominal startng values + int64_t max64 = 20; + thr_hashrates[thr_id] = 20; while (1) { uint64_t hashes_done; struct timeval tv_start, tv_end, diff; - int64_t max64 = 1000; +// int64_t max64 = 1000; int nonce_found = 0; if ( likely( algo_gate.do_this_thread( thr_id ) ) ) { - if ( have_stratum ) + if ( have_stratum ) { + while ( unlikely( stratum_down ) ) + sleep( 1 ); if ( *nonceptr >= end_nonce ) stratum_gen_work( &stratum, &g_work ); } @@ -2383,6 +2420,8 @@ static void *miner_thread( void *userdata ) if ( 
diff.tv_usec || diff.tv_sec ) { pthread_mutex_lock( &stats_lock ); + total_hashes += hashes_done; + total_hashes_time = tv_end; thr_hashrates[thr_id] = hashes_done / ( diff.tv_sec + diff.tv_usec * 1e-6 ); pthread_mutex_unlock( &stats_lock ); @@ -2439,7 +2478,6 @@ static void *miner_thread( void *userdata ) && thr_id == opt_n_threads - 1 ) ) { double hashrate = 0.; - pthread_mutex_lock( &stats_lock ); for ( i = 0; i < opt_n_threads; i++ ) hashrate += thr_hashrates[i]; @@ -2448,8 +2486,12 @@ static void *miner_thread( void *userdata ) if ( opt_benchmark ) { + struct timeval uptime; char hr[16]; char hr_units[2] = {0,0}; + timeval_subtract( &uptime, &total_hashes_time, &session_start ); + double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. ); + scale_hash_for_display( &hashrate, hr_units ); sprintf( hr, "%.2f", hashrate ); #if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32)) @@ -2745,6 +2787,7 @@ static void *stratum_thread(void *userdata ) if ( unlikely( stratum_need_reset ) ) { stratum_need_reset = false; + stratum_down = true; stratum_disconnect( &stratum ); if ( strcmp( stratum.url, rpc_url ) ) { @@ -2755,11 +2798,13 @@ static void *stratum_thread(void *userdata ) else applog(LOG_WARNING, "Stratum connection reset"); // reset stats queue as well + restart_threads(); if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0; } while ( !stratum.curl ) { + stratum_down = true; pthread_rwlock_wrlock( &g_work_lock ); g_work_time = 0; pthread_rwlock_unlock( &g_work_lock ); @@ -2780,6 +2825,7 @@ static void *stratum_thread(void *userdata ) } else { + stratum_down = false; restart_threads(); applog(LOG_BLUE,"Stratum connection established" ); } @@ -2801,7 +2847,7 @@ static void *stratum_thread(void *userdata ) } else { - applog(LOG_WARNING, "Stratum connection interrupted"); +// applog(LOG_WARNING, "Stratum connection interrupted"); // stratum_disconnect( &stratum ); stratum_need_reset = true; } @@ -3629,6 +3675,10 @@ int main(int argc, char *argv[]) show_usage_and_exit(1); } + // need to register to get algo optimizations for cpu capabilities + // but that causes register logs before cpu capabilities is output. + // Would need to split register into 2 parts. First part sets algo + // optimizations but no logging, second part does any logging. 
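The hashrate changes above replace the instantaneous estimate with a session mean: each miner thread adds its hashes_done to a shared total under stats_lock and timestamps the update, and the summary and benchmark logs divide by time elapsed since session_start. A minimal sketch of that bookkeeping, assuming the globals named in the hunks; record_hashes() and session_hashrate() are illustrative helpers, not functions from the patch:

   #include <pthread.h>
   #include <sys/time.h>

   extern double          total_hashes;       // hashes completed by all threads
   extern struct timeval  total_hashes_time;  // time of the most recent update
   extern struct timeval  session_start;
   extern pthread_mutex_t stats_lock;

   // Called by a miner thread after each scanhash pass.
   static void record_hashes( double hashes_done, const struct timeval *now )
   {
      pthread_mutex_lock( &stats_lock );
      total_hashes += hashes_done;
      total_hashes_time = *now;
      pthread_mutex_unlock( &stats_lock );
   }

   // Session mean hashrate used for the summary reference rate and the
   // benchmark total.
   static double session_hashrate( void )
   {
      double secs = (double)( total_hashes_time.tv_sec - session_start.tv_sec );
      return secs > 0. ? total_hashes / secs : 0.;
   }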
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); if ( !check_cpu_capability() ) exit(1); @@ -3685,12 +3735,6 @@ int main(int argc, char *argv[]) } } - // Initialize stats times and counters - memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) ); - gettimeofday( &last_submit_time, NULL ); - memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); - memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); - // if ( !check_cpu_capability() ) exit(1); pthread_mutex_init( &stats_lock, NULL ); @@ -3854,7 +3898,8 @@ int main(int argc, char *argv[]) return 1; } } - if ( have_stratum ) + + if ( have_stratum ) { if ( opt_debug ) applog(LOG_INFO,"Creating stratum thread"); @@ -3900,24 +3945,35 @@ int main(int argc, char *argv[]) opt_api_listen ); } + // hold the stats lock while starting miner threads + pthread_mutex_lock( &stats_lock ); + /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) + for ( i = 0; i < opt_n_threads; i++ ) { - usleep( 5000 ); +// usleep( 5000 ); thr = &thr_info[i]; thr->id = i; thr->q = tq_new(); - if (!thr->q) + if ( !thr->q ) return 1; - err = thread_create(thr, miner_thread); - if (err) { - applog(LOG_ERR, "Miner thread %d create failed", i); + err = thread_create( thr, miner_thread ); + if ( err ) + { + applog( LOG_ERR, "Miner thread %d create failed", i ); return 1; } } - applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm", - opt_n_threads, num_cpus, algo_names[opt_algo] ); + // Initialize stats times and counters + memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) ); + gettimeofday( &last_submit_time, NULL ); + memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); + memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); + pthread_mutex_unlock( &stats_lock ); + + applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm", + opt_n_threads, num_cpus, algo_names[opt_algo] ); /* main loop - simply wait for workio thread to exit */ pthread_join( thr_info[work_thr_id].pth, NULL ); diff --git a/miner.h b/miner.h index 9ca56b8..5592d4a 100644 --- a/miner.h +++ b/miner.h @@ -70,17 +70,25 @@ void *alloca (size_t); #ifdef HAVE_SYSLOG_H #include -#define LOG_BLUE 0x10 /* unique value */ +#define LOG_BLUE 0x10 /* unique value */ +#define LOG_MAJR 0x11 /* unique value */ +#define LOG_MINR 0x12 /* unique value */ +#define LOG_GREEN 0x13 /* unique value */ +#define LOG_PINK 0x14 /* unique value */ #else enum { - LOG_ERR, + LOG_CRIT, + LOG_ERR, LOG_WARNING, LOG_NOTICE, LOG_INFO, LOG_DEBUG, - /* custom notices */ - LOG_BLUE = 0x10, -}; + /* custom notices */ + LOG_BLUE = 0x10, + LOG_MAJR = 0x11, + LOG_MINR = 0x12, + LOG_GREEN = 0x13, + LOG_PINK = 0x14 }; #endif extern bool is_power_of_2( int n ); @@ -216,7 +224,7 @@ json_t* json_load_url(char* cfg_url, json_error_t *err); void sha256_init(uint32_t *state); void sha256_transform(uint32_t *state, const uint32_t *block, int swap); -void sha256d(unsigned char *hash, const unsigned char *data, int len); +//void sha256d(unsigned char *hash, const unsigned char *data, int len); #ifdef USE_ASM #if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) @@ -225,7 +233,8 @@ int sha256_use_4way(); void sha256_init_4way(uint32_t *state); void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); #endif -#if defined(__x86_64__) && defined(USE_AVX2) +//#if defined(__x86_64__) && defined(USE_AVX2) +#if defined(__x86_64__) && defined(__AVX2__) #define HAVE_SHA256_8WAY 1 int 
sha256_use_8way(); void sha256_init_8way(uint32_t *state); @@ -271,9 +280,9 @@ struct thr_api { #define CL_N "\x1B[0m" #define CL_RED "\x1B[31m" #define CL_GRN "\x1B[32m" -#define CL_YLW "\x1B[33m" +#define CL_YLW "\x1B[33m" // dark yellow #define CL_BLU "\x1B[34m" -#define CL_MAG "\x1B[35m" +#define CL_MAG "\x1B[35m" // purple #define CL_CYN "\x1B[36m" #define CL_BLK "\x1B[22;30m" /* black */ @@ -281,7 +290,7 @@ struct thr_api { #define CL_GR2 "\x1B[22;32m" /* green */ #define CL_BRW "\x1B[22;33m" /* brown */ #define CL_BL2 "\x1B[22;34m" /* blue */ -#define CL_MA2 "\x1B[22;35m" /* magenta */ +#define CL_MA2 "\x1B[22;35m" /* purple */ #define CL_CY2 "\x1B[22;36m" /* cyan */ #define CL_SIL "\x1B[22;37m" /* gray */ @@ -290,9 +299,9 @@ struct thr_api { #else #define CL_GRY "\x1B[90m" /* dark gray selectable in putty */ #endif -#define CL_LRD "\x1B[01;31m" /* light red */ -#define CL_LGR "\x1B[01;32m" /* light green */ -#define CL_YL2 "\x1B[01;33m" /* yellow */ +#define CL_LRD "\x1B[01;31m" /* bright red */ +#define CL_LGR "\x1B[01;32m" /* bright green */ +#define CL_YL2 "\x1B[01;33m" /* bright yellow */ #define CL_LBL "\x1B[01;34m" /* light blue */ #define CL_LMA "\x1B[01;35m" /* light magenta */ #define CL_LCY "\x1B[01;36m" /* light cyan */ @@ -481,7 +490,7 @@ void format_hashrate(double hashrate, char *output); void print_hash_tests(void); void scale_hash_for_display ( double* hashrate, char* units ); - +void format_number_si( double* hashrate, char* si_units ); void report_summary_log( bool force ); /* diff --git a/simd-utils.h b/simd-utils.h index 55cc552..f2e201d 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -78,6 +78,8 @@ // - specialized shift and rotate functions that move elements around // use the notation "1x32" to indicate the distance moved as units of // the element size. +// Vector shuffle rotations are being renamed to "vrol" and "vror" +// to avoid confusion with bit rotations. // - there is a subset of some functions for scalar data. They may have // no prefix nor vec-size, just one size, the size of the data. // - Some integer functions are also defined which use a similar notation. diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index cedcae3..956f3e3 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -65,7 +65,7 @@ static inline void dintrlv_2x32( void *dst0, void *dst1, d0[24] = s[48]; d1[24] = s[49]; d0[25] = s[50]; d1[25] = s[51]; d0[26] = s[52]; d1[26] = s[53]; d0[27] = s[54]; d1[27] = s[55]; d0[28] = s[56]; d1[28] = s[57]; d0[29] = s[58]; d1[29] = s[59]; - d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[61]; d1[31] = s[63]; + d0[30] = s[60]; d1[30] = s[61]; d0[31] = s[62]; d1[31] = s[63]; } static inline void extr_lane_2x32( void *dst, const void *src, diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 90066f0..765d847 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -35,6 +35,13 @@ /////////////////////////////////////////////////////////////////////////// +// Used instead if casting. +typedef union +{ + __m128i m128; + uint32_t u32[4]; +} __attribute__ ((aligned (16))) m128_ovly; + // Efficient and convenient moving between GP & low bits of XMM. // Use VEX when available to give access to xmm8-15 and zero extend for // larger vectors. 
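A short sketch of the two access patterns described above, assuming the m128_ovly overlay from this patch; the function names are illustrative. The low element moves through a GP register directly, while the overlay reads any lane without pointer casting:

   #include <immintrin.h>
   #include <stdint.h>

   static inline uint32_t low32( const __m128i v )
   {
      return (uint32_t)_mm_cvtsi128_si32( v );   // GP <- low 32 bits of XMM
   }

   static inline uint32_t lane32( const __m128i v, const int n )
   {
      m128_ovly o;
      o.m128 = v;          // store the vector once into the overlay
      return o.u32[ n ];   // read any 32 bit lane, no pointer cast needed
   }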
@@ -61,7 +68,10 @@ static inline __m128i mm128_mov32_128( const uint32_t n ) return a; } -static inline uint64_t mm128_mov128_64( const __m128i a ) +// Inconsistent naming, prefix should reflect return value: +// u64_mov128_64 + +static inline uint64_t u64_mov128_64( const __m128i a ) { uint64_t n; #if defined(__AVX__) @@ -72,7 +82,7 @@ static inline uint64_t mm128_mov128_64( const __m128i a ) return n; } -static inline uint32_t mm128_mov128_32( const __m128i a ) +static inline uint32_t u32_mov128_32( const __m128i a ) { uint32_t n; #if defined(__AVX__) @@ -166,12 +176,17 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i, // Extract 32 bit element c from v and return as integer. static inline uint32_t mm128_extract_32( const __m128i v, const int c ) -{ return mm128_mov128_32( mm128_xim_32( v, v, c<<6 ) ); } +{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); } // Clear (zero) 32 bit elements based on bits set in 4 bit mask. static inline __m128i mm128_mask_32( const __m128i v, const int m ) { return mm128_xim_32( v, v, m ); } +// Move element i2 of v2 to element i1 of v1. For reference and convenience, +// it's faster to precalculate the index. +#define mm128_shuflmov_32( v1, i1, v2, i2 ) \ + mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) ) + #endif // SSE4_1 // @@ -257,12 +272,37 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #endif + + +// Diagonal blend: d = { v3[3], v2[2], v1[1], v0[0] } + +// Blend 4 32 bit elements from 4 vectors + +#if defined (__AVX2__) + +#define mm128_diagonal_32( v3, v2, v1, v0 ) \ + _mm_blend_epi32( _mm_blend_epi32( v3, v2, 0x4 ), \ + _mm_blend_epi32( v1, v0, 0x1 ), 0x3 ) + +#elif defined(__SSE4_1__) + +#define mm128_diagonal_32( v3, v2, v1, v0 ) \ + _mm_blend_epi16( _mm_blend_epi16( v3, v2, 0x30 ), \ + _mm_blend_epi16( v1, v0, 0x03 ), 0x0f ) + +#endif + + // // Bit rotations // AVX512VL has implemented bit rotation for 128 bit vectors with // 64 and 32 bit elements. +// x2 rotates elements in 2 individual vectors in a double buffered +// optimization for SSE2, does nothing for AVX512 but is there for +// transparency. // compiler doesn't like when a variable is used for the last arg of // _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same // specification but works with a variable.
Therefore use rol_var where @@ -290,6 +330,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_ror_32 _mm_ror_epi32 #define mm128_rol_32 _mm_rol_epi32 +#define mm128_rorx2_64( v1, v0, c ) \ + _mm_ror_epi64( v0, c ); \ + _mm_ror_epi64( v1, c ) + +#define mm128_rolx2_64( v1, v0, c ) \ + _mm_rol_epi64( v0, c ); \ + _mm_rol_epi64( v1, c ) + +#define mm128_rorx2_32( v1, v0, c ) \ + _mm_ror_epi32( v0, c ); \ + _mm_ror_epi32( v1, c ) + +#define mm128_rolx2_32( v1, v0, c ) \ + _mm_rol_epi32( v0, c ); \ + _mm_rol_epi32( v1, c ) + #else // SSE2 #define mm128_ror_64 mm128_ror_var_64 @@ -297,6 +353,46 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_ror_32 mm128_ror_var_32 #define mm128_rol_32 mm128_rol_var_32 +#define mm128_rorx2_64( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_srli_epi64( v0, c ); \ + __m128i t1 = _mm_srli_epi64( v1, c ); \ + v0 = _mm_slli_epi64( v0, 64-(c) ); \ + v1 = _mm_slli_epi64( v1, 64-(c) ); \ + v0 = _mm_or_si128( v0, t0 ); \ + v1 = _mm_or_si128( v1, t1 ); \ +} + +#define mm128_rolx2_64( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_slli_epi64( v0, c ); \ + __m128i t1 = _mm_slli_epi64( v1, c ); \ + v0 = _mm_srli_epi64( v0, 64-(c) ); \ + v1 = _mm_srli_epi64( v1, 64-(c) ); \ + v0 = _mm_or_si128( v0, t0 ); \ + v1 = _mm_or_si128( v1, t1 ); \ +} + +#define mm128_rorx2_32( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_srli_epi32( v0, c ); \ + __m128i t1 = _mm_srli_epi32( v1, c ); \ + v0 = _mm_slli_epi32( v0, 32-(c) ); \ + v1 = _mm_slli_epi32( v1, 32-(c) ); \ + v0 = _mm_or_si128( v0, t0 ); \ + v1 = _mm_or_si128( v1, t1 ); \ +} + +#define mm128_rolx2_32( v1, v0, c ) \ +{ \ + __m128i t0 = _mm_slli_epi32( v0, c ); \ + __m128i t1 = _mm_slli_epi32( v1, c ); \ + v0 = _mm_srli_epi32( v0, 32-(c) ); \ + v1 = _mm_srli_epi32( v1, 32-(c) ); \ + v0 = _mm_or_si128( v0, t0 ); \ + v1 = _mm_or_si128( v1, t1 ); \ +} + #endif // AVX512 else SSE2 #define mm128_ror_16( v, c ) \ @@ -309,16 +405,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) // Rotate vector elements across all lanes #define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) -#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) -#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) +#define mm128_shuflr_64 mm128_swap_64 +#define mm128_shufll_64 mm128_swap_64 + +#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 ) +#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 ) + // Swap 32 bit elements in 64 bit lanes #define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) +#define mm128_shuflr64_32 mm128_swap64_32 +#define mm128_shufll64_32 mm128_swap64_32 #if defined(__SSSE3__) // Rotate right by c bytes, no SSE2 equivalent. -static inline __m128i mm128_ror_x8( const __m128i v, const int c ) +static inline __m128i mm128_shuflr_x8( const __m128i v, const int c ) { return _mm_alignr_epi8( v, v, c ); } // @@ -422,59 +524,88 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) v1 = _mm_xor_si128( v1, v2 ); +// Two input shuffle-rotate. // Concatenate v1 & v2 and rotate as one 256 bit vector. -#if defined(__SSE4_1__) +// Continue to use vror/vrol for now to avoid confusion with +// shufl2r/shufl2l function macros available with AVX512. -#define mm128_ror256_64( v1, v2 ) \ +#if defined(__SSSE3__) + +// Function macro with two inputs and one output, inputs are preserved. +// Returns modified first arg. +// Two input functions are not available without SSSE3. Use procedure +// below instead.
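For reference, _mm_alignr_epi8( hi, lo, n ) treats { hi, lo } as one 256 bit value, shifts it right n bytes and returns the low 128 bits. A sketch of a two input 64 bit shuffle-rotate built on it, assuming SSSE3; the function name is illustrative:

   #include <tmmintrin.h>

   static inline __m128i shufl2r_64_sketch( const __m128i v1, const __m128i v2 )
   {
      // { v2, v1 } >> 64 bits: result = { v2[0], v1[1] }, v1 & v2 unchanged
      return _mm_alignr_epi8( v2, v1, 8 );
   }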
+ +#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) +#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) + +#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 ) +#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 ) + +#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 ) +#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 ) + +#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 1 ) +#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 1 ) + +// Procedure macros with 2 inputs and 2 outputs, inputs are destroyed. +// Returns both modified args in place. + +// These macros retain the vrol/vror name for now to avoid +// confusion with the shufl2r/shufl2l function macros above. +// These may be renamed to something like shufl2r2 for 2 inputs and +// 2 outputs, i.e. SHUFfLe 2 inputs Right with 2 outputs. + +#define mm128_vror256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v1 = _mm_alignr_epi8( v2, v1, 8 ); \ v2 = t; \ } while(0) -#define mm128_rol256_64( v1, v2 ) \ +#define mm128_vrol256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v2 = _mm_alignr_epi8( v2, v1, 8 ); \ v1 = t; \ } while(0) -#define mm128_ror256_32( v1, v2 ) \ +#define mm128_vror256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \ v1 = _mm_alignr_epi8( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm128_rol256_32( v1, v2 ) \ +#define mm128_vrol256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 12 ); \ v2 = _mm_alignr_epi8( v2, v1, 12 ); \ v1 = t; \ } while(0) -#define mm128_ror256_16( v1, v2 ) \ +#define mm128_vror256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \ v1 = _mm_alignr_epi8( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm128_rol256_16( v1, v2 ) \ +#define mm128_vrol256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 14 ); \ v2 = _mm_alignr_epi8( v2, v1, 14 ); \ v1 = t; \ } while(0) -#define mm128_ror256_8( v1, v2 ) \ +#define mm128_vror256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 1 ); \ v1 = _mm_alignr_epi8( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm128_rol256_8( v1, v2 ) \ +#define mm128_vrol256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 15 ); \ v2 = _mm_alignr_epi8( v2, v1, 15 ); \ @@ -483,7 +614,7 @@ do { \ #else // SSE2 -#define mm128_ror256_64( v1, v2 ) \ +#define mm128_vror256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \ _mm_slli_si128( v2, 8 ) ); \ @@ -492,7 +623,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_64( v1, v2 ) \ +#define mm128_vrol256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \ _mm_srli_si128( v2, 8 ) ); \ @@ -501,7 +632,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror256_32( v1, v2 ) \ +#define mm128_vror256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \ _mm_slli_si128( v2, 12 ) ); \ @@ -510,7 +641,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_32( v1, v2 ) \ +#define mm128_vrol256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \ _mm_srli_si128( v2, 12 ) ); \ @@ -519,7 +650,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror256_16( v1, v2 ) \ +#define mm128_vror256_16( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \ _mm_slli_si128( v2, 14 ) ); \ @@ -528,7 +659,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_16( v1, v2 ) \ +#define mm128_vrol256_16( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) ); \ @@ -537,7 +668,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror256_8( v1, v2 ) \ +#define mm128_vror256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \ _mm_slli_si128( v2, 15 ) ); \ @@ -546,7 +677,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol256_8( v1, v2 ) \ +#define mm128_vrol256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \ _mm_srli_si128( v2, 15 ) ); \ diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 125e2c8..3d84010 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -14,13 +14,28 @@ // is limited because 256 bit vectors are less likely to be used when 512 // is available. +// Used instead of casting. +typedef union +{ + __m256i m256; + __m128i m128[2]; + uint64_t u64[4]; + uint32_t u32[8]; +} __attribute__ ((aligned (32))) m256_ovly; + + // Move integer to low element of vector, other elements are set to zero. #define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) ) #define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) ) // Move low element of vector to integer. -#define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) ) -#define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) ) +#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) ) +#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) ) + +// deprecated +//#define mm256_mov256_64 u64_mov256_64 +//#define mm256_mov256_32 u32_mov256_32 + // concatenate two 128 bit vectors into one 256 bit vector: { hi, lo } #define mm256_concat_128( hi, lo ) \ @@ -214,12 +229,41 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #endif +// Diagonal blending + +// Blend 4 64 bit elements from 4 vectors +#define mm256_diagonal_64( v3, v2, v1, v0 ) \ + _mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \ + _mm256_blend_epi32( v1, v0, 0x03 ), 0x0f ) + +// Blend 8 32 bit elements from 8 vectors +#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v7, v6, 0x40 ), \ + _mm256_blend_epi32( v5, v4, 0x10 ), 0x30 ), \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v3, v2, 0x04 ), \ + _mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f ) + + +// Blend 4 32 bit elements from each 128 bit lane. +#define mm256_diagonal128_32( v3, v2, v1, v0 ) \ + _mm256_blend_epi32( \ + _mm256_blend_epi32( v3, v2, 0x44 ), \ + _mm256_blend_epi32( v1, v0, 0x11 ), 0x33 ) + + // // Bit rotations. // -// The only bit shift for more than 64 bits is with __int128. +// The only bit shift for more than 64 bits is with __int128 which is slow. // // AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements +// +// x2 rotates elements in 2 individual vectors in a double buffered +// optimization for SSE2, does nothing for AVX512 but is there for +// transparency.
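A sketch of the double buffered idea mentioned above: rotating two independent vectors back to back lets the shift/or sequences interleave and hide instruction latency on CPUs without AVX512 rotate instructions. AVX2 assumed; the name and rotate count are illustrative:

   #include <immintrin.h>

   static inline void ror64x2_by25( __m256i *v1, __m256i *v0 )
   {
      __m256i t0 = _mm256_srli_epi64( *v0, 25 );
      __m256i t1 = _mm256_srli_epi64( *v1, 25 );               // independent of t0
      *v0 = _mm256_or_si256( _mm256_slli_epi64( *v0, 39 ), t0 );
      *v1 = _mm256_or_si256( _mm256_slli_epi64( *v1, 39 ), t1 );
   }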
// compiler doesn't like when a variable is used for the last arg of @@ -255,6 +299,22 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_ror_32 _mm256_ror_epi32 #define mm256_rol_32 _mm256_rol_epi32 +#define mm256_rorx2_64( v1, v0, c ) \ + _mm256_ror_epi64( v0, c ); \ + _mm256_ror_epi64( v1, c ) + +#define mm256_rolx2_64( v1, v0, c ) \ + _mm256_rol_epi64( v0, c ); \ + _mm256_rol_epi64( v1, c ) + +#define mm256_rorx2_32( v1, v0, c ) \ + _mm256_ror_epi32( v0, c ); \ + _mm256_ror_epi32( v1, c ) + +#define mm256_rolx2_32( v1, v0, c ) \ + _mm256_rol_epi32( v0, c ); \ + _mm256_rol_epi32( v1, c ) + #else // AVX2 #define mm256_ror_64 mm256_ror_var_64 @@ -262,6 +322,46 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #define mm256_ror_32 mm256_ror_var_32 #define mm256_rol_32 mm256_rol_var_32 +#define mm256_rorx2_64( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_srli_epi64( v0, c ); \ + __m256i t1 = _mm256_srli_epi64( v1, c ); \ + v0 = _mm256_slli_epi64( v0, 64-(c) ); \ + v1 = _mm256_slli_epi64( v1, 64-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rolx2_64( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi64( v0, c ); \ + __m256i t1 = _mm256_slli_epi64( v1, c ); \ + v0 = _mm256_srli_epi64( v0, 64-(c) ); \ + v1 = _mm256_srli_epi64( v1, 64-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rorx2_32( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_srli_epi32( v0, c ); \ + __m256i t1 = _mm256_srli_epi32( v1, c ); \ + v0 = _mm256_slli_epi32( v0, 32-(c) ); \ + v1 = _mm256_slli_epi32( v1, 32-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + +#define mm256_rolx2_32( v1, v0, c ) \ +{ \ + __m256i t0 = _mm256_slli_epi32( v0, c ); \ + __m256i t1 = _mm256_slli_epi32( v1, c ); \ + v0 = _mm256_srli_epi32( v0, 32-(c) ); \ + v1 = _mm256_srli_epi32( v1, 32-(c) ); \ + v0 = _mm256_or_si256( v0, t0 ); \ + v1 = _mm256_or_si256( v1, t1 ); \ +} + #endif // AVX512 else AVX2 #define mm256_ror_16( v, c ) \ @@ -276,58 +376,45 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Rotate elements accross all lanes. -#if defined(__AVX512VL__) - -static inline __m256i mm256_swap_128( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 2 ); } - -static inline __m256i mm256_ror_1x64( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 1 ); } - -static inline __m256i mm256_rol_1x64( const __m256i v ) -{ return _mm256_alignr_epi64( v, v, 3 ); } - -static inline __m256i mm256_ror_1x32( const __m256i v ) -{ return _mm256_alignr_epi32( v, v, 1 ); } - -static inline __m256i mm256_rol_1x32( const __m256i v ) -{ return _mm256_alignr_epi32( v, v, 7 ); } - -#else // AVX2 - // Swap 128 bit elements in 256 bit vector. #define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) +#define mm256_shuflr_128 mm256_swap_128 +#define mm256_shufll_128 mm256_swap_128 // Rotate 256 bit vector by one 64 bit element -#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) -#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) +#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 ) + +#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 ) // Rotate 256 bit vector by one 32 bit element. 
-#define mm256_ror_1x32( v ) \ +#define mm256_shuflr_32( v ) \ _mm256_permutevar8x32_epi32( v, \ m256_const_64( 0x0000000000000007, 0x0000000600000005, \ - 0x0000000400000003, 0x0000000200000001 ) + 0x0000000400000003, 0x0000000200000001 ) ) -#define mm256_rol_1x32( v ) \ +#define mm256_shufll_32( v ) \ _mm256_permutevar8x32_epi32( v, \ m256_const_64( 0x0000000600000005, 0x0000000400000003, \ - 0x0000000200000001, 0x0000000000000007 ) + 0x0000000200000001, 0x0000000000000007 ) ) -#endif // AVX512 else AVX2 - // // Rotate elements within each 128 bit lane of 256 bit vector. #define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) -#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) -#define mm256_rol128_32( v ) _mm256_shuffle_epi32( v, 0x93 ) +#define mm256_shuflr128_64 mm256_swap128_64 +#define mm256_shufll128_64 mm256_swap128_64 -static inline __m256i mm256_ror128_x8( const __m256i v, const int c ) +#define mm256_shuflr128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) +#define mm256_shufll128_32( v ) _mm256_shuffle_epi32( v, 0x93 ) + +static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) { return _mm256_alignr_epi8( v, v, c ); } // Swap 32 bit elements in each 64 bit lane. #define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) +#define mm256_shuflr64_32 mm256_swap64_32 +#define mm256_shufll64_32 mm256_swap64_32 // // Swap bytes in vector elements, endian bswap. @@ -387,19 +474,21 @@ static inline __m256i mm256_ror128_x8( const __m256i v, const int c ) // _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also // makes these macros unnecessary. +// continue using vror/vrol notation for now to avoid confusion with +// shufl2r/shufl2l macro functions available with AVX512. #define mm256_swap512_256( v1, v2 ) \ v1 = _mm256_xor_si256( v1, v2 ); \ v2 = _mm256_xor_si256( v1, v2 ); \ v1 = _mm256_xor_si256( v1, v2 ); -#define mm256_ror512_128( v1, v2 ) \ +#define mm256_vror512_128( v1, v2 ) \ do { \ __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ v1 = _mm256_permute2x128( v2, v1, 0x21 ); \ v2 = t; \ } while(0) -#define mm256_rol512_128( v1, v2 ) \ +#define mm256_vrol512_128( v1, v2 ) \ do { \ __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ v2 = _mm256_permute2x128( v2, v1, 0x21 ); \ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index e6b7ac2..de948cc 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -74,13 +74,22 @@ // __AVX512VBMI__ __AVX512VAES__ // +// Used instead if casting. +typedef union +{ + __m512i m512; + __m128i m128[4]; + uint32_t u32[16]; + uint64_t u64[8]; +} __attribute__ ((aligned (64))) m512_ovly; + // Move integer to/from element 0 of vector. #define mm512_mov64_512( n ) _mm512_castsi128_si512( mm128_mov64_128( n ) ) #define mm512_mov32_512( n ) _mm512_castsi128_si512( mm128_mov32_128( n ) ) -#define mm512_mov256_64( a ) mm128_mov128_64( _mm256_castsi512_si128( a ) ) -#define mm512_mov256_32( a ) mm128_mov128_32( _mm256_castsi512_si128( a ) ) +#define u64_mov512_64( a ) u64_mov128_64( _mm256_castsi512_si128( a ) ) +#define u32_mov512_32( a ) u32_mov128_32( _mm256_castsi512_si128( a ) ) // A simple 128 bit permute, using function instead of macro avoids // problems if the v arg passed as an expression. 
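A short illustration of the point above: a macro expands its v argument once per operand, so an expression argument would be evaluated twice, whereas the inline function evaluates it once. The _mm512_shuffle_i64x2 call is an assumption about the underlying permute, used here only to show the pattern:

   #include <immintrin.h>

   // Unsafe if v has side effects: v appears twice in the expansion.
   #define PERM128_MACRO( v, c )   _mm512_shuffle_i64x2( v, v, c )

   // Safe: the argument expression is evaluated once at the call site.
   static inline __m512i perm128_func( const __m512i v, const int c )
   {
      return _mm512_shuffle_i64x2( v, v, c );
   }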
@@ -91,6 +100,10 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c ) #define mm512_concat_256( hi, lo ) \ _mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 ) +#define m512_const_128( v3, v2, v1, v0 ) \ + mm512_concat_256( mm256_concat_128( v3, v2 ), \ + mm256_concat_128( v1, v0 ) ) + // Equivalent of set, assign 64 bit integers to respective 64 bit elements. // Use stack memory overlay static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, @@ -225,7 +238,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Ternary logic uses 8 bit truth table to define any 3 input logical -// operation using any number or combinations of AND, OR XOR, NOT. +// expression using any number or combinations of AND, OR, XOR, NOT. // a ^ b ^ c #define mm512_xor3( a, b, c ) \ @@ -251,11 +264,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) #define mm512_andxor( a, b, c ) \ _mm512_ternarylogic_epi64( a, b, c, 0x60 ) -// a ^ ( b & c ) +// a ^ ( b | c ) #define mm512_xoror( a, b, c ) \ _mm512_ternarylogic_epi64( a, b, c, 0x1e ) -// a ^ ( ~b & c ) [ xor( a, andnot( b, c ) ] +// a ^ ( ~b & c ) xor( a, andnot( b, c ) ) #define mm512_xorandnot( a, b, c ) \ _mm512_ternarylogic_epi64( a, b, c, 0xd2 ) @@ -265,11 +278,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // Some 2 input operations that don't have their own instruction mnemonic. -// ~( a | b ) +// ~( a | b ), (~a) & (~b) #define mm512_nor( a, b ) \ _mm512_ternarylogic_epi64( a, b, b, 0x01 ) -// ~( a ^ b ), same as (~a) ^ b +// ~( a ^ b ), (~a) ^ b #define mm512_xnor( a, b ) \ _mm512_ternarylogic_epi64( a, b, b, 0x81 ) @@ -278,6 +291,27 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) _mm512_ternarylogic_epi64( a, b, b, 0xef ) +// Diagonal blending +// Blend 8 64 bit elements from 8 vectors +#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \ + _mm512_mask_blend_epi64( 0x0f, \ + _mm512_mask_blend_epi64( 0x30, \ + _mm512_mask_blend_epi64( 0x40, v7, v6 ), \ + _mm512_mask_blend_epi64( 0x40, v5, v4 ) ), \ + _mm512_mask_blend_epi64( 0x03, \ + _mm512_mask_blend_epi64( 0x04, v3, v2 ) \ + _mm512_mask_blend_epi64( 0x01, v1, v0 ) ) ) + + +// Blend 4 32 bit elements from each 128 bit lane. +#define mm512_diagonal128_32( v3, v2, v1, v0 ) \ + _mm512_mask_blend_epi32( 0x3333, \ + _mm512_mask_blend_epi32( 0x4444, v3, v2 ), \ + _mm512_mask_blend_epi32( 0x1111, v1, v0 ) ) + + + + // Bit rotations. // AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit @@ -395,59 +429,95 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c ) casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \ } while(0) - // -// Rotate elements in 512 bit vector. +// Shift with zero fill & shuffle-rotate elements in 512 bit vector. +// + +// rename plan change ror to vror for Vector ROtate Right, +// and vrol for Vector ROtate Left, not to be confused with +//variable rotate rorv, rolv, +// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate +// operation. 1xNN notaion ia also removed and replaced with simpler NN. +// Swap will still have its own mnemonic and will be aliased as both +// left and right shuffles. + +// Shift elements right or left in 512 bit vector, filling with zeros. +// Multiple element shifts can be combined into a single larger +// element shift. 
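A minimal illustration of the zero fill idea the following macros implement, assuming AVX512F: aligning against a zero vector shifts whole elements while filling the vacated lanes with zero (the function name is illustrative).

   #include <immintrin.h>

   static inline __m512i shiftr_64_sketch( const __m512i v )
   {
      // { zero, v } shifted right one 64 bit element:
      // result lanes 0..6 = v lanes 1..7, lane 7 = 0.
      return _mm512_alignr_epi64( _mm512_setzero_si512(), v, 1 );
   }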
+ +#define mm512_shiftr_256( v ) \ + _mm512_alignr_epi64( _mm512_setzero_si512(), v, 4 ) +#define mm512_shiftl_256( v ) \ + _mm512_alignr_epi64( v, _mm512_setzero_si512(), 4 ) + +#define mm512_shiftr_128( v ) \ + _mm512_alignr_epi64( _mm512_setzero_si512(), v, 2 ) +#define mm512_shiftl_128( v ) \ + _mm512_alignr_epi64( v, _mm512_setzero_si512(), 6 ) + +#define mm512_shiftr_64( v ) \ + _mm512_alignr_epi64( _mm512_setzero_si512(), v, 1 ) +#define mm512_shiftl_64( v ) \ + _mm512_alignr_epi64( v, _mm512_setzero_si512(), 7 ) + +#define mm512_shiftr_32( v ) \ + _mm512_alignr_epi32( _mm512_setzero_si512(), v, 1 ) +#define mm512_shiftl_32( v ) \ + _mm512_alignr_epi32( v, _mm512_setzero_si512(), 15 ) + +// Shuffle-rotate elements left or right in 512 bit vector. static inline __m512i mm512_swap_256( const __m512i v ) { return _mm512_alignr_epi64( v, v, 4 ); } +#define mm512_shuflr_256( v ) mm512_swap_256( v ) +#define mm512_shufll_256( v ) mm512_swap_256( v ) -static inline __m512i mm512_ror_1x128( const __m512i v ) +static inline __m512i mm512_shuflr_128( const __m512i v ) { return _mm512_alignr_epi64( v, v, 2 ); } -static inline __m512i mm512_rol_1x128( const __m512i v ) +static inline __m512i mm512_shufll_128( const __m512i v ) { return _mm512_alignr_epi64( v, v, 6 ); } -static inline __m512i mm512_ror_1x64( const __m512i v ) +static inline __m512i mm512_shuflr_64( const __m512i v ) { return _mm512_alignr_epi64( v, v, 1 ); } -static inline __m512i mm512_rol_1x64( const __m512i v ) +static inline __m512i mm512_shufll_64( const __m512i v ) { return _mm512_alignr_epi64( v, v, 7 ); } -static inline __m512i mm512_ror_1x32( const __m512i v ) +static inline __m512i mm512_shuflr_32( const __m512i v ) { return _mm512_alignr_epi32( v, v, 1 ); } -static inline __m512i mm512_rol_1x32( const __m512i v ) +static inline __m512i mm512_shufll_32( const __m512i v ) { return _mm512_alignr_epi32( v, v, 15 ); } -static inline __m512i mm512_ror_x64( const __m512i v, const int n ) +// Generic +static inline __m512i mm512_shuflr_x64( const __m512i v, const int n ) { return _mm512_alignr_epi64( v, v, n ); } -static inline __m512i mm512_ror_x32( const __m512i v, const int n ) +static inline __m512i mm512_shufll_x32( const __m512i v, const int n ) { return _mm512_alignr_epi32( v, v, n ); } -#define mm512_ror_1x16( v ) \ +#define mm512_shuflr_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x0000001F001E001D, 0x001C001B001A0019, \ 0X0018001700160015, 0X0014001300120011, \ 0X0010000F000E000D, 0X000C000B000A0009, \ 0X0008000700060005, 0X0004000300020001 ), v ) -#define mm512_rol_1x16( v ) \ +#define mm512_shufll_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x001E001D001C001B, 0x001A001900180017, \ 0X0016001500140013, 0X001200110010000F, \ 0X000E000D000C000B, 0X000A000900080007, \ 0X0006000500040003, 0X000200010000001F ), v ) -#define mm512_ror_1x8( v ) \ +#define mm512_shuflr_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x003F3E3D3C3B3A39, 0x3837363534333231, \ 0x302F2E2D2C2B2A29, 0x2827262524232221, \ 0x201F1E1D1C1B1A19, 0x1817161514131211, \ 0x100F0E0D0C0B0A09, 0x0807060504030201 ) ) -#define mm512_rol_1x8( v ) \ +#define mm512_shufll_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x3E3D3C3B3A393837, 0x363534333231302F, \ 0x2E2D2C2B2A292827, 0x262524232221201F, \ @@ -456,51 +526,55 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n ) // // Rotate elements within 256 bit lanes of 512 bit vector. +// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane #define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e ) +#define mm512_shuflr256_128 mm512_swap256_128 +#define mm512_shufll256_128 mm512_swap256_128 // Rotate 256 bit lanes by one 64 bit element -#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 ) -#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 ) +#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 ) + +#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 ) // Rotate 256 bit lanes by one 32 bit element -#define mm512_ror256_32( v ) \ +#define mm512_shuflr256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x000000080000000f, 0x0000000e0000000d, \ 0x0000000c0000000b, 0x0000000a00000009, \ 0x0000000000000007, 0x0000000600000005, \ 0x0000000400000003, 0x0000000200000001 ), v ) -#define mm512_rol256_32( v ) \ +#define mm512_shufll256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x0000000e0000000d, 0x0000000c0000000b, \ 0x0000000a00000009, 0x000000080000000f, \ 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ), v ) -#define mm512_ror256_16( v ) \ +#define mm512_shuflr256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x00100001001e001d, 0x001c001b001a0019, \ 0x0018001700160015, 0x0014001300120011, \ 0x0000000f000e000d, 0x000c000b000a0009, \ 0x0008000700060005, 0x0004000300020001 ), v ) -#define mm512_rol256_16( v ) \ +#define mm512_shufll256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x001e001d001c001b, 0x001a001900180017, \ 0x0016001500140013, 0x001200110010001f, \ 0x000e000d000c000b, 0x000a000900080007, \ 0x0006000500040003, 0x000200010000000f ), v ) -#define mm512_ror256_8( v ) \ +#define mm512_shuflr256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x203f3e3d3c3b3a39, 0x3837363534333231, \ 0x302f2e2d2c2b2a29, 0x2827262524232221, \ 0x001f1e1d1c1b1a19, 0x1817161514131211, \ 0x100f0e0d0c0b0a09, 0x0807060504030201 ) ) -#define mm512_rol256_8( v ) \ +#define mm512_shufll256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x3e3d3c3b3a393837, 0x363534333231302f, \ 0x2e2d2c2b2a292827, 0x262524232221203f, \ @@ -508,82 +582,120 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n ) 0x0e0d0c0b0a090807, 0x060504030201001f ) ) // -// Rotate elements within 128 bit lanes of 512 bit vector. - +// Shuffle-roate elements within 128 bit lanes of 512 bit vector. + // Swap 64 bits in each 128 bit lane #define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) +#define mm512_shuflr128_64 mm512_swap128_64 +#define mm512_shufll128_64 mm512_swap128_64 // Rotate 128 bit lanes by one 32 bit element -#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) -#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) +#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) +#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) -// Rotate right 128 bit lanes by c bytes -static inline __m512i mm512_ror128_x8( const __m512i v, const int c ) +// Rotate right 128 bit lanes by c bytes, versatile and just as fast +static inline __m512i mm512_shuflr128_8( const __m512i v, const int c ) { return _mm512_alignr_epi8( v, v, c ); } -// Swap 32 bits in each 64 bit lane. +// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction +// but only with AVX512. Shuffle is just as fast and availble with AVX2 +// & SSE2. 
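To illustrate the comment above: swapping the 32 bit halves of every 64 bit lane gives the same result as rotating each 64 bit lane by 32 bits, so the SSE2/AVX2-era shuffle matches the AVX512 rotate. A sketch with an illustrative name:

   #include <immintrin.h>

   static inline __m512i swap64_32_sketch( const __m512i v )
   {
      // 0xb1 exchanges the two 32 bit halves of every 64 bit lane,
      // equivalent to a 64 bit rotate by 32.
      return _mm512_shuffle_epi32( v, 0xb1 );
   }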
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) +#define mm512_shuflr64_32 mm512_swap64_32 +#define mm512_shufll64_32 mm512_swap64_32 - +// Need good way to distinguish 1 input shuffles, 2 input shuffle functions, +// and 2 input 2 output shuffle macros. // -// Rotate elements from 2 512 bit vectors in place, source arguments +// shuflr is 1 input +// shufl2r is 2 input ... +// Drop macros? They can easily be rebuilt using shufl2 functions + +// add shuflr shufll functions performing rotate, returning first arg +// They're faster than doing both, when both not needed. + +// Shuffle concatenated { v1, v2 } right or left by 256 bits and return +// rotated v1 +// visually confusing for shufl2r because of arg order. First arg is always +// the target for modification, either update by reference or by function +// return. +#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 ) +#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 ) + +#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 ) +#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 ) + +#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 ) +#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 ) + +#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 ) +#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 ) + +// Rotate elements from 2 512 bit vectors in place, source arguments // are overwritten. #define mm512_swap1024_512( v1, v2 ) \ v1 = _mm512_xor_si512( v1, v2 ); \ v2 = _mm512_xor_si512( v1, v2 ); \ v1 = _mm512_xor_si512( v1, v2 ); +#define mm512_shufl2l_512 mm512_swap1024_512 +#define mm512_shufl2r_512 mm512_swap1024_512 -#define mm512_ror1024_256( v1, v2 ) \ +// Deprecated, will be removed. Use shufl2 functions instead. Leave them as is +// for now. +// Rotate elements from 2 512 bit vectors in place, both source arguments +// are updated.
+ +#define mm512_vror1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v1 = _mm512_alignr_epi64( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_256( v1, v2 ) \ +#define mm512_vrol1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v2 = _mm512_alignr_epi64( v2, v1, 4 ); \ v1 = t; \ } while(0) -#define mm512_ror1024_128( v1, v2 ) \ +#define mm512_vror1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \ v1 = _mm512_alignr_epi64( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_128( v1, v2 ) \ +#define mm512_vrol1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \ v2 = _mm512_alignr_epi64( v2, v1, 6 ); \ v1 = t; \ } while(0) -#define mm512_ror1024_64( v1, v2 ) \ +#define mm512_vror1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \ v1 = _mm512_alignr_epi64( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_64( v1, v2 ) \ +#define mm512_vrol1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \ v2 = _mm512_alignr_epi64( v2, v1, 7 ); \ v1 = t; \ } while(0) -#define mm512_ror1024_32( v1, v2 ) \ +#define mm512_vror1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \ v1 = _mm512_alignr_epi32( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1024_32( v1, v2 ) \ +#define mm512_vrol1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \ v2 = _mm512_alignr_epi32( v2, v1, 15 ); \ diff --git a/simd-utils/simd-64.h b/simd-utils/simd-64.h index e74066b..31b0b89 100644 --- a/simd-utils/simd-64.h +++ b/simd-utils/simd-64.h @@ -68,13 +68,13 @@ // rotation. // Swap hi & lo 32 bits. -#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e ) +#define mm64_swap_32( a ) _mm_shuffle_pi16( a, 0x4e ) -#define mm64_ror64_1x16( a ) _mm_shuffle_pi16( a, 0x39 ) -#define mm64_rol64_1x16( a ) _mm_shuffle_pi16( a, 0x93 ) +#define mm64_shulfr_16( a ) _mm_shuffle_pi16( a, 0x39 ) +#define mm64_shufll_16( a ) _mm_shuffle_pi16( a, 0x93 ) // Swap hi & lo 16 bits of each 32 bit element -#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 ) +#define mm64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 ) #if defined(__SSSE3__) @@ -86,7 +86,7 @@ _mm_shuffle_pi8( v, (__m64)0x0607040502030001 ); // Rotate right by c bytes -static inline __m64 mm64_ror_x8( __m64 v, const int c ) +static inline __m64 mm64_vror_x8( __m64 v, const int c ) { return _mm_alignr_pi8( v, v, c ); } #else diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 4a7188e..601c750 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -5,10 +5,19 @@ #define bswap_64( a ) __builtin_bswap64( a ) #define bswap_32( a ) __builtin_bswap32( a ) -// safe division, integer or floating point +// Safe division, integer or floating point. For floating point it's as +// safe as 0. is precisely zero. +// Returns safe_result if division by zero. #define safe_div( dividend, divisor, safe_result ) \ ( (divisor) == 0 ? 
safe_result : ( (dividend) / (divisor) ) ) +// Aliases with familiar names for built in bit rotate instructions +#define rol64( a, n ) _lrotl( a, n ) +#define ror64( a, n ) _lrotr( a, n ) +#define rol32( a, n ) _rotl( a, n ) +#define ror32( a, n ) _rotr( a, n ) +#define rol16( a, n ) _rotwl( a, n ) +#define ror16( a, n ) _rotwr( a, n ) /////////////////////////////////////// // @@ -29,12 +38,14 @@ // __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 ); // my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 ); +// obsolete test // Compiler check for __int128 support // Configure also has a test for int128. #if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) ) #define GCC_INT128 1 #endif +// obsolte test #if !defined(GCC_INT128) #warning "__int128 not supported, requires GCC-4.8 or newer." #endif diff --git a/sysinfos.c b/sysinfos.c index 010c78f..ed453e2 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -218,7 +218,7 @@ static inline void cpu_getname(char *outbuf, size_t maxsz) for (int i = 2; i <= (ext & 0xF); i++) { cpuid(0x80000000+i, output); - memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int)); + memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int)); } snprintf(outbuf, maxsz, "%s", brand); } diff --git a/util.c b/util.c index b96c4fe..31b9270 100644 --- a/util.c +++ b/util.c @@ -47,6 +47,7 @@ //#include "miner.h" #include "elist.h" #include "algo-gate-api.h" +#include "algo/sha/sha256d.h" //extern pthread_mutex_t stats_lock; @@ -129,17 +130,19 @@ void applog2( int prio, const char *fmt, ... ) // localtime_r(&now, &tm); - switch (prio) { + switch ( prio ) + { + case LOG_CRIT: color = CL_LRD; break; case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; + case LOG_WARNING: color = CL_YL2; break; + case LOG_MAJR: color = CL_YL2; break; case LOG_NOTICE: color = CL_WHT; break; case LOG_INFO: color = ""; break; case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; + case LOG_MINR: color = CL_YLW; break; + case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break; + case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break; + case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break; } if (!use_colors) color = ""; @@ -206,17 +209,19 @@ void applog(int prio, const char *fmt, ...) 
localtime_r(&now, &tm); - switch (prio) { - case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; + switch ( prio ) + { + case LOG_CRIT: color = CL_LRD; break; + case LOG_ERR: color = CL_RED; break; + case LOG_WARNING: color = CL_YL2; break; + case LOG_MAJR: color = CL_YL2; break; case LOG_NOTICE: color = CL_WHT; break; - case LOG_INFO: color = ""; break; + case LOG_INFO: color = ""; break; case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; + case LOG_MINR: color = CL_YLW; break; + case LOG_GREEN: color = CL_GRN; prio = LOG_INFO; break; + case LOG_BLUE: color = CL_CYN; prio = LOG_NOTICE; break; + case LOG_PINK: color = CL_LMA; prio = LOG_NOTICE; break; } if (!use_colors) color = ""; @@ -303,6 +308,29 @@ void format_hashrate(double hashrate, char *output) ); } +// For use with MiB etc +void format_number_si( double* n, char* si_units ) +{ + if ( *n < 1024*10 ) { *si_units = 0; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'k'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'M'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'G'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'T'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'P'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'E'; return; } + *n /= 1024; + if ( *n < 1024*10 ) { *si_units = 'Z'; return; } + *n /= 1024; + *si_units = 'Y'; +} + + /* Modify the representation of integer numbers which would cause an overflow * so that they are treated as floating-point numbers. * This is a hack to overcome the limitations of some versions of Jansson. */
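A usage sketch for format_number_si above, e.g. for the memory usage line printed at startup; the function and values here are illustrative only, assuming the prototype declared in miner.h:

   #include <stdio.h>

   static void report_mem_usage( double bytes )
   {
      char unit = 0;
      format_number_si( &bytes, &unit );          // scales in steps of 1024
      if ( unit )
         printf( "Memory usage %.1f %ciB\n", bytes, unit );   // e.g. "6144.0 MiB"
      else
         printf( "Memory usage %.0f B\n", bytes );
   }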