diff --git a/INSTALL_WINDOWS b/INSTALL_WINDOWS
index 02a829e..b61f091 100644
--- a/INSTALL_WINDOWS
+++ b/INSTALL_WINDOWS
@@ -40,7 +40,7 @@ $ mkdir $HOME/usr/lib
    version available in the repositories.
 
 Download the following source code packages from their respective and
-respected download locations, copy them to ~/usr/lib/ and uncompress them. 
+respected download locations, copy them to $HOME/usr/lib/ and uncompress them. 
 
 openssl: https://github.com/openssl/openssl/releases
 
@@ -149,85 +149,10 @@ Copy cpuminer.exe to the release directory, compress and copy the release direct
 
 Run cpuminer
 
-In a command windows change directories to the unzipped release folder. to get a list of all options:
+In a command windows change directories to the unzipped release folder. To get a list of all options:
 
 cpuminer.exe --help
 
 Command options are specific to where you mine. Refer to the pool's instructions on how to set them.
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Create a link to the locally compiled version of gmp.h
-
-$ ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h
-
-Edit configure.ac to fix lipthread package name.
-
-sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
-
-
-7. Compile
-
-you can use the default compile if you intend to use cpuminer-opt on the
-same CPU and the virtual machine supports that architecture.
-
-./build.sh
-
-Otherwise you can compile manually while setting options in CFLAGS.
-
-Some common options:
-
-To compile for a specific CPU architecture:
-
-CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
-
-This will compile for AMD Ryzen.
-
-You can compile more generically for a set of specific CPU features
-if you know what features you want:
-
-CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
-
-This will compile for an older CPU that does not have AVX.
-
-You can find several examples in build-allarch.sh
-
-If you have a CPU with more than 64 threads and Windows 7 or higher you
-can enable the CPU Groups feature:
-
--D_WIN32_WINNT==0x0601
-
-Once you have run configure successfully run make with n CPU threads:
-
-make -j n
-
-Copy cpuminer.exe to the release directory, compress and copy the release
-directory to a Windows system and run cpuminer.exe from the command line.
-
-Run cpuminer
-
-In a command windows change directories to the unzipped release folder.
-to get a list of all options:
-
-cpuminer.exe --help
-
-Command options are specific to where you mine. Refer to the pool's
-instructions on how to set them.
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index b3b4878..ce7752b 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,7 +65,22 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
-v3.8.2
+v3.19.0
+
+Windows binaries now built with support for CPU groups, requires Windows 7.
+
+Changes to cpu-affinity:
+  - PR#346: Fixed incorrect CPU affinity on Windows built for CPU groups,
+  - added support for CPU affinity for up to 256 threads or CPUs,
+  - streamlined code for more efficient initialization of miner threads,
+  - precise affining of each miner thread to a specific CPU,
+  - added an option to disable CPU affinity with "--cpu-affinity 0"
+
+Faster sha256t with AVX512 & AVX2.
+
+Added stratum error count to stats log, reported only when non-zero.
+
+v3.18.2
 
 Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
 
diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h
index 63a8f92..6428e2b 100644
--- a/algo/sha/sha-hash-4way.h
+++ b/algo/sha/sha-hash-4way.h
@@ -62,6 +62,12 @@ void sha256_4way_transform_le( __m128i *state_out,  const __m128i *data,
                             const __m128i *state_in );
 void sha256_4way_transform_be( __m128i *state_out,  const __m128i *data,
                             const __m128i *state_in );
+void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
+                                   const __m128i *W, const __m128i *state_in );
+void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
+        const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
+int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
+                                     const __m128i *state_in );
 
 #endif  // SSE2
 
@@ -84,10 +90,12 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
 void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
                                const __m256i *state_in );
 
-void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
-                             const __m256i *state_in );
+void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
+                                 const __m256i *W, const __m256i *state_in );
 void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
-                          const __m256i *state_in, const __m256i *state_mid );
+        const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
+int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
+                                     const __m256i *state_in );
 
 #endif  // AVX2
 
@@ -109,10 +117,13 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
                              const __m512i *state_in );
 void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
                              const __m512i *state_in );
-void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
-                             const __m512i *state_in );
+void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
+                                  const __m512i *W, const __m512i *state_in );
 void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
-                          const __m512i *state_in, const __m512i *state_mid );
+        const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
+
+int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
+                                     const __m512i *state_in );
 
 #endif // AVX512
 
diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c
index 63651c3..ef15273 100644
--- a/algo/sha/sha2.c
+++ b/algo/sha/sha2.c
@@ -611,11 +611,11 @@ static inline int scanhash_sha256d_8way_pooler( struct work *work,
 
 #endif /* HAVE_SHA256_8WAY */
 
-int scanhash_sha256d_pooler( struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
+int scanhash_sha256d_pooler( struct work *work,	uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	uint32_t _ALIGN(128) data[64];
 	uint32_t _ALIGN(32) hash[8];
 	uint32_t _ALIGN(32) midstate[8];
@@ -626,12 +626,12 @@ int scanhash_sha256d_pooler( struct work *work,
    int thr_id = mythr->id;  // thr_id arg is deprecated
 
 #ifdef HAVE_SHA256_8WAY
-	if (sha256_use_8way())
-		return scanhash_sha256d_8way_pooler( work,	max_nonce, hashes_done, mythr );
+	if ( sha256_use_8way() )
+		return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
 #endif
 #ifdef HAVE_SHA256_4WAY
-	if (sha256_use_4way())
-		return scanhash_sha256d_4way_pooler( work,	max_nonce, hashes_done, mythr );
+	if ( sha256_use_4way() )
+		return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
 #endif
 	
 	memcpy(data, pdata + 16, 64);
@@ -695,8 +695,11 @@ bool register_sha256d_algo( algo_gate_t* gate )
    gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
 #if defined(SHA256D_16WAY)
    gate->scanhash = (void*)&scanhash_sha256d_16way;
+//#elif defined(SHA256D_8WAY)
+//   gate->scanhash = (void*)&scanhash_sha256d_8way;
 #else
    gate->scanhash = (void*)&scanhash_sha256d_pooler;
+//   gate->scanhash = (void*)&scanhash_sha256d_4way;
 #endif
    //   gate->hash     = (void*)&sha256d;
    return true;
diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c
index b520746..dd96d79 100644
--- a/algo/sha/sha256-hash-4way.c
+++ b/algo/sha/sha256-hash-4way.c
@@ -1,34 +1,3 @@
-/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
-/*
- * SHA-384 / SHA-512 implementation.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
 
 #if defined(__SSE2__)
 
@@ -66,10 +35,7 @@ static const uint32_t K256[64] =
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
 };
 
-// SHA-256 4 way
-
-#define SHA2s_MEXP( a, b, c, d ) \
-  mm128_add4_32( SSG2_1( W[a] ), W[b], SSG2_0( W[c] ), W[d] );
+// SHA-256 4 way SSE2
 
 #define CHs(X, Y, Z) \
    _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) 
@@ -94,6 +60,27 @@ static const uint32_t K256[64] =
    _mm_xor_si128( _mm_xor_si128( \
         mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
 
+#define SHA2s_MEXP( a, b, c, d ) \
+  mm128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d );
+
+#define SHA256x4_MSG_EXPANSION( W ) \
+   W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \
+   W[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); \
+   W[ 2] = SHA2s_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \
+   W[ 3] = SHA2s_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \
+   W[ 4] = SHA2s_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \
+   W[ 5] = SHA2s_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \
+   W[ 6] = SHA2s_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \
+   W[ 7] = SHA2s_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \
+   W[ 8] = SHA2s_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \
+   W[ 9] = SHA2s_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \
+   W[10] = SHA2s_MEXP( W[ 8], W[ 3], W[11], W[10] ); \
+   W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] ); \
+   W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] ); \
+   W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] ); \
+   W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] ); \
+   W[15] = SHA2s_MEXP( W[13], W[ 8], W[ 0], W[15] );
+
 #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
 do { \
   __m128i T1, T2; \
@@ -106,11 +93,32 @@ do { \
   H  = _mm_add_epi32( T1, T2 ); \
 } while (0)
 
+#define SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
+{ \
+   __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); \
+   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  0, j ); \
+   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  1, j ); \
+   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F,  2, j ); \
+   SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E,  3, j ); \
+   SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D,  4, j ); \
+   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C,  5, j ); \
+   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B,  6, j ); \
+   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A,  7, j ); \
+   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  8, j ); \
+   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  9, j ); \
+   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \
+   SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \
+   SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \
+   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \
+   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \
+   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); \
+}
+
 // LE data, no need to byte swap
 static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W,
                                           const __m128i *in )
 {
-   __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
+   __m128i A, B, C, D, E, F, G, H;
 
    A = in[0];
    B = in[1];
@@ -120,61 +128,14 @@ static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W,
    F = in[5];
    G = in[6];
    H = in[7];
-   Y_xor_Z = _mm_xor_si128( B, C );
 
-   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
-   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
-   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
-   SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
-   SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
-   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
-   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B,  6, 0 );
-   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A,  7, 0 );
-   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  8, 0 );
-   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  9, 0 );
-   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
-   SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
-   SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
-   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
-   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
-   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
-
-   for ( int j = 16; j < 64; j += 16 )
-   {
-      W[ 0] = SHA2s_MEXP( 14,  9,  1,  0 );
-      W[ 1] = SHA2s_MEXP( 15, 10,  2,  1 );
-      W[ 2] = SHA2s_MEXP(  0, 11,  3,  2 );
-      W[ 3] = SHA2s_MEXP(  1, 12,  4,  3 );
-      W[ 4] = SHA2s_MEXP(  2, 13,  5,  4 );
-      W[ 5] = SHA2s_MEXP(  3, 14,  6,  5 );
-      W[ 6] = SHA2s_MEXP(  4, 15,  7,  6 );
-      W[ 7] = SHA2s_MEXP(  5,  0,  8,  7 );
-      W[ 8] = SHA2s_MEXP(  6,  1,  9,  8 );
-      W[ 9] = SHA2s_MEXP(  7,  2, 10,  9 );
-      W[10] = SHA2s_MEXP(  8,  3, 11, 10 );
-      W[11] = SHA2s_MEXP(  9,  4, 12, 11 );
-      W[12] = SHA2s_MEXP( 10,  5, 13, 12 );
-      W[13] = SHA2s_MEXP( 11,  6, 14, 13 );
-      W[14] = SHA2s_MEXP( 12,  7, 15, 14 );
-      W[15] = SHA2s_MEXP( 13,  8,  0, 15 );
-
-      SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
-      SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
-      SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
-      SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
-      SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
-      SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
-      SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
-      SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
-      SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
-      SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
-      SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
-      SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
-      SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
-      SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
-      SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
-      SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
-   }
+   SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
+   SHA256x4_MSG_EXPANSION( W );
+   SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
+   SHA256x4_MSG_EXPANSION( W );
+   SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
+   SHA256x4_MSG_EXPANSION( W );
+   SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
    
    out[0] = _mm_add_epi32( in[0], A );
    out[1] = _mm_add_epi32( in[1], B );
@@ -205,6 +166,245 @@ void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
    SHA256_4WAY_TRANSFORM( state_out, W, state_in );
 }
 
+void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
+                                   const __m128i *W, const __m128i *state_in )
+{
+   __m128i A, B, C, D, E, F, G, H;
+
+   // precalculate constant part msg expansion for second iteration.
+   X[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
+   X[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] );
+   X[ 2] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 0] ), W[11] ),
+                          W[ 2] );
+   X[ 3] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 1] ), W[12] ),
+                          SSG2_0( W[ 4] ) );
+   X[ 4] = _mm_add_epi32( _mm_add_epi32( W[13], SSG2_0( W[ 5] ) ),
+                          W[ 4] );
+   X[ 5] = _mm_add_epi32( _mm_add_epi32( W[14], SSG2_0( W[ 6] ) ),
+                          W[ 5] );
+   X [6] = _mm_add_epi32( _mm_add_epi32( W[15], SSG2_0( W[ 7] ) ),
+                          W[ 6] );
+   X[ 7] = _mm_add_epi32( _mm_add_epi32( X[ 0], SSG2_0( W[ 8] ) ),
+                          W[ 7] );
+   X[ 8] = _mm_add_epi32( _mm_add_epi32( X[ 1], SSG2_0( W[ 9] ) ),
+                          W[ 8] );
+   X[ 9] = _mm_add_epi32( SSG2_0( W[10] ), W[ 9] );
+   X[10] = _mm_add_epi32( SSG2_0( W[11] ), W[10] );
+   X[11] = _mm_add_epi32( SSG2_0( W[12] ), W[11] );
+   X[12] = _mm_add_epi32( SSG2_0( W[13] ), W[12] );
+   X[13] = _mm_add_epi32( SSG2_0( W[14] ), W[13] );
+   X[14] = _mm_add_epi32( SSG2_0( W[15] ), W[14] );
+   X[15] = _mm_add_epi32( SSG2_0( X[ 0] ), W[15] );
+
+   A = _mm_load_si128( state_in     );
+   B = _mm_load_si128( state_in + 1 );
+   C = _mm_load_si128( state_in + 2 );
+   D = _mm_load_si128( state_in + 3 );
+   E = _mm_load_si128( state_in + 4 );
+   F = _mm_load_si128( state_in + 5 );
+   G = _mm_load_si128( state_in + 6 );
+   H = _mm_load_si128( state_in + 7 );
+
+   __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C );
+   
+   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
+   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
+   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
+   
+   _mm_store_si128( state_mid    , A );
+   _mm_store_si128( state_mid + 1, B );
+   _mm_store_si128( state_mid + 2, C );
+   _mm_store_si128( state_mid + 3, D );
+   _mm_store_si128( state_mid + 4, E );
+   _mm_store_si128( state_mid + 5, F );
+   _mm_store_si128( state_mid + 6, G );
+   _mm_store_si128( state_mid + 7, H );
+}
+
+void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
+          const __m128i *state_in, const __m128i *state_mid, const __m128i *X )
+{
+   __m128i A, B, C, D, E, F, G, H;
+   __m128i W[16];
+
+   memcpy_128( W, data, 16 );
+
+   A = _mm_load_si128( state_mid     );
+   B = _mm_load_si128( state_mid + 1 );
+   C = _mm_load_si128( state_mid + 2 );
+   D = _mm_load_si128( state_mid + 3 );
+   E = _mm_load_si128( state_mid + 4 );
+   F = _mm_load_si128( state_mid + 5 );
+   G = _mm_load_si128( state_mid + 6 );
+   H = _mm_load_si128( state_mid + 7 );
+
+   __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( G, H );
+
+   SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
+   SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
+   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
+   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B,  6, 0 );
+   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A,  7, 0 );
+   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  8, 0 );
+   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  9, 0 );
+   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
+   SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
+   SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
+   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
+   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
+   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
+
+   // update precalculated msg expansion with new nonce: W[3].
+   W[ 0] = X[ 0];
+   W[ 1] = X[ 1];
+   W[ 2] = _mm_add_epi32( X[ 2], SSG2_0( W[ 3] ) );
+   W[ 3] = _mm_add_epi32( X[ 3], W[ 3] );
+   W[ 4] = _mm_add_epi32( X[ 4], SSG2_1( W[ 2] ) );
+   W[ 5] = _mm_add_epi32( X[ 5], SSG2_1( W[ 3] ) );
+   W[ 6] = _mm_add_epi32( X[ 6], SSG2_1( W[ 4] ) );
+   W[ 7] = _mm_add_epi32( X[ 7], SSG2_1( W[ 5] ) );
+   W[ 8] = _mm_add_epi32( X[ 8], SSG2_1( W[ 6] ) );
+   W[ 9] = _mm_add_epi32( X[ 9], _mm_add_epi32( SSG2_1( W[ 7] ),
+                                                W[ 2] ) );
+   W[10] = _mm_add_epi32( X[10], _mm_add_epi32( SSG2_1( W[ 8] ),
+                                                W[ 3] ) );
+   W[11] = _mm_add_epi32( X[11], _mm_add_epi32( SSG2_1( W[ 9] ),
+                                                W[ 4] ) );
+   W[12] = _mm_add_epi32( X[12], _mm_add_epi32( SSG2_1( W[10] ),
+                                                W[ 5] ) );
+   W[13] = _mm_add_epi32( X[13], _mm_add_epi32( SSG2_1( W[11] ),
+                                                W[ 6] ) );
+   W[14] = _mm_add_epi32( X[14], _mm_add_epi32( SSG2_1( W[12] ),
+                                                W[ 7] ) );
+   W[15] = _mm_add_epi32( X[15], _mm_add_epi32( SSG2_1( W[13] ),
+                                                W[ 8] ) );
+
+   SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
+   SHA256x4_MSG_EXPANSION( W );
+   SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
+   SHA256x4_MSG_EXPANSION( W );
+   SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
+
+   A = _mm_add_epi32( A, _mm_load_si128( state_in     ) );
+   B = _mm_add_epi32( B, _mm_load_si128( state_in + 1 ) );
+   C = _mm_add_epi32( C, _mm_load_si128( state_in + 2 ) );
+   D = _mm_add_epi32( D, _mm_load_si128( state_in + 3 ) );
+   E = _mm_add_epi32( E, _mm_load_si128( state_in + 4 ) );
+   F = _mm_add_epi32( F, _mm_load_si128( state_in + 5 ) );
+   G = _mm_add_epi32( G, _mm_load_si128( state_in + 6 ) );
+   H = _mm_add_epi32( H, _mm_load_si128( state_in + 7 ) );
+
+   _mm_store_si128( state_out    ,  A );
+   _mm_store_si128( state_out + 1,  B );
+   _mm_store_si128( state_out + 2,  C );
+   _mm_store_si128( state_out + 3,  D );
+   _mm_store_si128( state_out + 4,  E );
+   _mm_store_si128( state_out + 5,  F );
+   _mm_store_si128( state_out + 6,  G );
+   _mm_store_si128( state_out + 7,  H );
+}
+
+// returns 0 if hash aborted early and invalid.
+int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
+                                     const __m128i *state_in )
+{
+   __m128i A, B, C, D, E, F, G, H;
+   __m128i W[16];      memcpy_128( W, data, 16 );
+   // Value required by H after round 60 to produce valid final hash
+   const __m128i H_ = m128_const1_32( 0x136032ED );
+
+   A = _mm_load_si128( state_in   );
+   B = _mm_load_si128( state_in+1 );
+   C = _mm_load_si128( state_in+2 );
+   D = _mm_load_si128( state_in+3 );
+   E = _mm_load_si128( state_in+4 );
+   F = _mm_load_si128( state_in+5 );
+   G = _mm_load_si128( state_in+6 );
+   H = _mm_load_si128( state_in+7 );
+
+   SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
+   SHA256x4_MSG_EXPANSION( W );
+   SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
+   SHA256x4_MSG_EXPANSION( W );
+   SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
+
+   W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
+   W[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] );
+   W[ 2] = SHA2s_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
+   W[ 3] = SHA2s_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
+   W[ 4] = SHA2s_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
+   W[ 5] = SHA2s_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
+   W[ 6] = SHA2s_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
+   W[ 7] = SHA2s_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
+   W[ 8] = SHA2s_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
+   W[ 9] = SHA2s_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
+   W[10] = SHA2s_MEXP( W[ 8], W[ 3], W[11], W[10] );
+   W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] );
+   W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] );
+
+   __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C );
+   
+   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  0, 48 );
+   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  1, 48 );
+   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F,  2, 48 );
+   SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E,  3, 48 );
+   SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D,  4, 48 );
+   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C,  5, 48 );
+   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B,  6, 48 );
+   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A,  7, 48 );
+   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  8, 48 );
+
+   __m128i T1_57 = _mm_add_epi32( G,
+                          mm128_add4_32( BSG2_1( D ), CHs( D, E, F ),
+                          _mm_set1_epi32( K256[57] ), W[ 9] ) );
+   C = _mm_add_epi32( C, T1_57 );
+
+   __m128i T1_58 = _mm_add_epi32( F,
+                          mm128_add4_32( BSG2_1( C ), CHs( C, D, E ),
+                          _mm_set1_epi32( K256[58] ), W[10] ) );
+   B = _mm_add_epi32( B, T1_58 );
+
+   __m128i T1_59 = _mm_add_epi32( E,
+                          mm128_add4_32( BSG2_1( B ), CHs( B, C, D ),
+                          _mm_set1_epi32( K256[59] ), W[11] ) );
+   A = _mm_add_epi32( A, T1_59 );
+
+   __m128i T1_60 = mm128_add4_32( D, BSG2_1( A ), CHs( A, B, C ), W[12] );
+   H = _mm_add_epi32( H, T1_60 );
+
+   if ( _mm_movemask_ps( (__m128)_mm_cmpeq_epi32( H, H_ ) ) == 0 )
+      return 0;
+
+   __m128i K60 = _mm_set1_epi32( K256[60] );
+   H = _mm_add_epi32( H, K60 );
+   
+   G = _mm_add_epi32( T1_57, _mm_add_epi32( BSG2_0( H ),
+                                            MAJs( H, A, B ) ) );
+   F = _mm_add_epi32( T1_58, _mm_add_epi32( BSG2_0( G ),
+                                            MAJs( G, H, A ) ) );
+   E = _mm_add_epi32( T1_59, _mm_add_epi32( BSG2_0( F ),
+                                            MAJs( F, G, H ) ) );
+   D = mm128_add4_32( T1_60, BSG2_0( E ), MAJs( E, F, G ), K60 );
+
+   W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] );
+   W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] );
+   W[15] = SHA2s_MEXP( W[13], W[ 8], W[ 0], W[15] );
+
+   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 );
+   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 );
+   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 );
+
+   state_out[0] = _mm_add_epi32( state_in[0], A );
+   state_out[1] = _mm_add_epi32( state_in[1], B );
+   state_out[2] = _mm_add_epi32( state_in[2], C );
+   state_out[3] = _mm_add_epi32( state_in[3], D );
+   state_out[4] = _mm_add_epi32( state_in[4], E );
+   state_out[5] = _mm_add_epi32( state_in[5], F );
+   state_out[6] = _mm_add_epi32( state_in[6], G );
+   state_out[7] = _mm_add_epi32( state_in[7], H );
+   return 1;
+}
+   
 void sha256_4way_init( sha256_4way_context *sc )
 {
    sc->count_high = sc->count_low = 0;
@@ -314,7 +514,26 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
                                        _mm256_srli_epi32( x, 10 ) )
 
 #define SHA2x_MEXP( a, b, c, d ) \
-     mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] );
+     mm256_add4_32( SSG2_1x( a ), b, SSG2_0x( c ), d );
+
+#define SHA256x8_MSG_EXPANSION( W ) \
+      W[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \
+      W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); \
+      W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \
+      W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \
+      W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \
+      W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \
+      W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \
+      W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \
+      W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \
+      W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \
+      W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] ); \
+      W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] ); \
+      W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] ); \
+      W[13] = SHA2x_MEXP( W[11], W[ 6], W[14], W[13] ); \
+      W[14] = SHA2x_MEXP( W[12], W[ 7], W[15], W[14] ); \
+      W[15] = SHA2x_MEXP( W[13], W[ 8], W[ 0], W[15] ); 
+
 
 // With AVX512VL ternary logic optimizations are available.
 // If not optimize by forwarding the result of X^Y in MAJ to the next round
@@ -341,6 +560,24 @@ do { \
   H  = _mm256_add_epi32( T1, T2 ); \
 } while (0)
 
+#define SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
+   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  0, j ); \
+   SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  1, j ); \
+   SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F,  2, j ); \
+   SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E,  3, j ); \
+   SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D,  4, j ); \
+   SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C,  5, j ); \
+   SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B,  6, j ); \
+   SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A,  7, j ); \
+   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  8, j ); \
+   SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  9, j ); \
+   SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \
+   SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \
+   SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \
+   SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \
+   SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \
+   SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
+
 #else  // AVX2
 
 #define CHx(X, Y, Z) \
@@ -352,6 +589,7 @@ do { \
   _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
                                          Y_xor_Z ) )
 
+
 #define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
 do { \
   __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
@@ -366,10 +604,7 @@ do { \
   H  = _mm256_add_epi32( T1, T2 ); \
 } while (0)
 
-//  the X_xor_y technique can be extended to eliminate the mov instruction.
-//  Perform double rounds and alternate each round. Doesn't apply to AVX512
-//  and isn't suitable for running 3 round prehash.
-//
+
 // read Y_xor_Z, update X_xor_Y
 #define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \
   _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
@@ -404,6 +639,19 @@ do { \
   G  = _mm256_add_epi32( T1, T2 ); \
 } while (0)
 
+#define SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
+{ \
+   __m256i tic, toc = _mm256_xor_si256( B, C ); \
+   SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H,  0,  1, j ); \
+   SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F,  2,  3, j ); \
+   SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D,  4,  5, j ); \
+   SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B,  6,  7, j ); \
+   SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H,  8,  9, j ); \
+   SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, j ); \
+   SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, j ); \
+   SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, j ); \
+}
+
 #endif   // AVX512VL else AVX2
 
 static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
@@ -420,90 +668,12 @@ static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
    G = _mm256_load_si256( in+6 );
    H = _mm256_load_si256( in+7 );
 
-#if !defined(__AVX512VL__)
-
-   __m256i tic, toc = _mm256_xor_si256( B, C );
-
-   SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H,  0,  1, 0 );
-   SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F,  2,  3, 0 );
-   SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D,  4,  5, 0 );
-   SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B,  6,  7, 0 );
-   SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H,  8,  9, 0 );
-   SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, 0 );
-   SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, 0 );
-   SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, 0 );
-
-#else
-
-   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
-   SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
-   SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
-   SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
-   SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
-   SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
-   SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B,  6, 0 );
-   SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A,  7, 0 );
-   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  8, 0 );
-   SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  9, 0 );
-   SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
-   SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
-   SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
-   SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
-   SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
-   SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
-
-#endif
+   SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
 
    for ( int j = 16; j < 64; j += 16 )
    {
-      W[ 0] = SHA2x_MEXP( 14,  9,  1,  0 );
-      W[ 1] = SHA2x_MEXP( 15, 10,  2,  1 );
-      W[ 2] = SHA2x_MEXP(  0, 11,  3,  2 );
-      W[ 3] = SHA2x_MEXP(  1, 12,  4,  3 );
-      W[ 4] = SHA2x_MEXP(  2, 13,  5,  4 );
-      W[ 5] = SHA2x_MEXP(  3, 14,  6,  5 );
-      W[ 6] = SHA2x_MEXP(  4, 15,  7,  6 );
-      W[ 7] = SHA2x_MEXP(  5,  0,  8,  7 );
-      W[ 8] = SHA2x_MEXP(  6,  1,  9,  8 );
-      W[ 9] = SHA2x_MEXP(  7,  2, 10,  9 );
-      W[10] = SHA2x_MEXP(  8,  3, 11, 10 );
-      W[11] = SHA2x_MEXP(  9,  4, 12, 11 );
-      W[12] = SHA2x_MEXP( 10,  5, 13, 12 );
-      W[13] = SHA2x_MEXP( 11,  6, 14, 13 );
-      W[14] = SHA2x_MEXP( 12,  7, 15, 14 );
-      W[15] = SHA2x_MEXP( 13,  8,  0, 15 );
-
-#if !defined(__AVX512VL__)
-
-      SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H,  0,  1, j );
-      SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F,  2,  3, j );
-      SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D,  4,  5, j );
-      SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B,  6,  7, j );
-      SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H,  8,  9, j );
-      SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, j );
-      SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, j );
-      SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, j );
-
-#else
-      
-      SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
-      SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
-      SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
-      SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
-      SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
-      SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
-      SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
-      SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
-      SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
-      SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
-      SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
-      SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
-      SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
-      SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
-      SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
-      SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
-
-#endif      
+      SHA256x8_MSG_EXPANSION( W );
+      SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j );
    }
 
    out[0] = _mm256_add_epi32( in[0], A );
@@ -535,25 +705,36 @@ void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
    SHA256_8WAY_TRANSFORM( state_out, W, state_in );
 }
 
-void sha256_8way_init( sha256_8way_context *sc )
-{
-   sc->count_high = sc->count_low = 0;
-   sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 );
-   sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
-   sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
-   sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A );
-   sc->val[4] = m256_const1_64( 0x510E527F510E527F );
-   sc->val[5] = m256_const1_64( 0x9B05688C9B05688C );
-   sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
-   sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
-}
-
-// Aggresive prehashing, LE byte order
-void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
-                             const __m256i *state_in )
+// Aggressive prehashing, LE byte order
+void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
+                                  const __m256i *W, const __m256i *state_in )
 {
    __m256i A, B, C, D, E, F, G, H;
 
+   X[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
+   X[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
+   X[ 2] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 0] ), W[11] ),
+                             W[ 2] );
+   X[ 3] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 1] ), W[12] ),
+                             SSG2_0x( W[ 4] ) );
+   X[ 4] = _mm256_add_epi32( _mm256_add_epi32( W[13], SSG2_0x( W[ 5] ) ),
+                             W[ 4] );
+   X[ 5] = _mm256_add_epi32( _mm256_add_epi32( W[14], SSG2_0x( W[ 6] ) ),
+                             W[ 5] );
+   X [6] = _mm256_add_epi32( _mm256_add_epi32( W[15], SSG2_0x( W[ 7] ) ),
+                             W[ 6] );
+   X[ 7] = _mm256_add_epi32( _mm256_add_epi32( X[ 0], SSG2_0x( W[ 8] ) ),
+                             W[ 7] );
+   X[ 8] = _mm256_add_epi32( _mm256_add_epi32( X[ 1], SSG2_0x( W[ 9] ) ),
+                             W[ 8] );
+   X[ 9] = _mm256_add_epi32( SSG2_0x( W[10] ), W[ 9] );
+   X[10] = _mm256_add_epi32( SSG2_0x( W[11] ), W[10] );
+   X[11] = _mm256_add_epi32( SSG2_0x( W[12] ), W[11] );
+   X[12] = _mm256_add_epi32( SSG2_0x( W[13] ), W[12] );
+   X[13] = _mm256_add_epi32( SSG2_0x( W[14] ), W[13] );
+   X[14] = _mm256_add_epi32( SSG2_0x( W[15] ), W[14] );
+   X[15] = _mm256_add_epi32( SSG2_0x( X[ 0] ), W[15] );
+
    A = _mm256_load_si256( state_in     );
    B = _mm256_load_si256( state_in + 1 );
    C = _mm256_load_si256( state_in + 2 );
@@ -582,7 +763,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
 }
 
 void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
-                          const __m256i *state_in, const __m256i *state_mid )
+          const __m256i *state_in, const __m256i *state_mid, const __m256i *X )
 {
    __m256i A, B, C, D, E, F, G, H;
    __m256i W[16];
@@ -620,43 +801,36 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
    SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
    SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
 
-   for ( int j = 16; j < 64; j += 16 )
-   {
-      W[ 0] = SHA2x_MEXP( 14,  9,  1,  0 );
-      W[ 1] = SHA2x_MEXP( 15, 10,  2,  1 );
-      W[ 2] = SHA2x_MEXP(  0, 11,  3,  2 );
-      W[ 3] = SHA2x_MEXP(  1, 12,  4,  3 );
-      W[ 4] = SHA2x_MEXP(  2, 13,  5,  4 );
-      W[ 5] = SHA2x_MEXP(  3, 14,  6,  5 );
-      W[ 6] = SHA2x_MEXP(  4, 15,  7,  6 );
-      W[ 7] = SHA2x_MEXP(  5,  0,  8,  7 );
-      W[ 8] = SHA2x_MEXP(  6,  1,  9,  8 );
-      W[ 9] = SHA2x_MEXP(  7,  2, 10,  9 );
-      W[10] = SHA2x_MEXP(  8,  3, 11, 10 );
-      W[11] = SHA2x_MEXP(  9,  4, 12, 11 );
-      W[12] = SHA2x_MEXP( 10,  5, 13, 12 );
-      W[13] = SHA2x_MEXP( 11,  6, 14, 13 );
-      W[14] = SHA2x_MEXP( 12,  7, 15, 14 );
-      W[15] = SHA2x_MEXP( 13,  8,  0, 15 );
-
-      SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
-      SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
-      SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
-      SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
-      SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
-      SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
-      SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
-      SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
-      SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
-      SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
-      SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
-      SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
-      SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
-      SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
-      SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
-      SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
-   }
+   W[ 0] = X[ 0];
+   W[ 1] = X[ 1];
+   W[ 2] = _mm256_add_epi32( X[ 2], SSG2_0x( W[ 3] ) );
+   W[ 3] = _mm256_add_epi32( X[ 3], W[ 3] );
+   W[ 4] = _mm256_add_epi32( X[ 4], SSG2_1x( W[ 2] ) );
+   W[ 5] = _mm256_add_epi32( X[ 5], SSG2_1x( W[ 3] ) );
+   W[ 6] = _mm256_add_epi32( X[ 6], SSG2_1x( W[ 4] ) );
+   W[ 7] = _mm256_add_epi32( X[ 7], SSG2_1x( W[ 5] ) );
+   W[ 8] = _mm256_add_epi32( X[ 8], SSG2_1x( W[ 6] ) );
+   W[ 9] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[ 7] ),
+                                                      W[ 2] ) );
+   W[10] = _mm256_add_epi32( X[10], _mm256_add_epi32( SSG2_1x( W[ 8] ),
+                                                      W[ 3] ) );
+   W[11] = _mm256_add_epi32( X[11], _mm256_add_epi32( SSG2_1x( W[ 9] ),
+                                                      W[ 4] ) );
+   W[12] = _mm256_add_epi32( X[12], _mm256_add_epi32( SSG2_1x( W[10] ),
+                                                      W[ 5] ) );
+   W[13] = _mm256_add_epi32( X[13], _mm256_add_epi32( SSG2_1x( W[11] ),
+                                                      W[ 6] ) );
+   W[14] = _mm256_add_epi32( X[14], _mm256_add_epi32( SSG2_1x( W[12] ),
+                                                      W[ 7] ) );
+   W[15] = _mm256_add_epi32( X[15], _mm256_add_epi32( SSG2_1x( W[13] ),
+                                                      W[ 8] ) );
 
+   SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
+   SHA256x8_MSG_EXPANSION( W );
+   SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
+   SHA256x8_MSG_EXPANSION( W );
+   SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
+   
    A = _mm256_add_epi32( A, _mm256_load_si256( state_in     ) );
    B = _mm256_add_epi32( B, _mm256_load_si256( state_in + 1 ) );
    C = _mm256_add_epi32( C, _mm256_load_si256( state_in + 2 ) );
@@ -676,7 +850,136 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
    _mm256_store_si256( state_out + 7,  H );
 }
 
+int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
+                                     const __m256i *state_in )
+{
+   __m256i A, B, C, D, E, F, G, H;
+   __m256i W[16];  memcpy_256( W, data, 16 );
+   const __m256i H_ = m256_const1_32( 0x136032ED );
 
+   A = _mm256_load_si256( state_in   );
+   B = _mm256_load_si256( state_in+1 );
+   C = _mm256_load_si256( state_in+2 );
+   D = _mm256_load_si256( state_in+3 );
+   E = _mm256_load_si256( state_in+4 );
+   F = _mm256_load_si256( state_in+5 );
+   G = _mm256_load_si256( state_in+6 );
+   H = _mm256_load_si256( state_in+7 );
+
+   SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
+
+   for ( int j = 16; j < 48; j += 16 )
+   {
+      SHA256x8_MSG_EXPANSION( W );
+      SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j );
+   }
+
+   W[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
+   W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
+   W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
+   W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
+   W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
+   W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
+   W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
+   W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
+   W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
+   W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
+   W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] );
+   W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] );
+   W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] );
+
+#if !defined(__AVX512VL__)
+   __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
+#endif
+
+   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  0, 48 );
+   SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  1, 48 );
+   SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F,  2, 48 );
+   SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E,  3, 48 );
+   SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D,  4, 48 );
+   SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C,  5, 48 );
+   SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B,  6, 48 );
+   SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A,  7, 48 );
+   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  8, 48 );
+
+   __m256i T1_57 = _mm256_add_epi32( G,
+                          mm256_add4_32( BSG2_1x( D ), CHx( D, E, F ),
+                          _mm256_set1_epi32( K256[57] ), W[ 9] ) );
+   C = _mm256_add_epi32( C, T1_57 );
+
+   __m256i T1_58 = _mm256_add_epi32( F,  
+                          mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ),
+                          _mm256_set1_epi32( K256[58] ), W[10] ) );
+   B = _mm256_add_epi32( B, T1_58 );
+   
+   __m256i T1_59 = _mm256_add_epi32( E,  
+                          mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ),
+                          _mm256_set1_epi32( K256[59] ), W[11] ) );
+   A = _mm256_add_epi32( A, T1_59 );
+
+   __m256i T1_60 = mm256_add4_32( D, BSG2_1x( A ), CHx( A, B, C ), W[12] );
+   H = _mm256_add_epi32( H, T1_60 );
+
+   if ( _mm256_movemask_ps( (__m256)_mm256_cmpeq_epi32( H, H_ ) ) == 0 )
+      return 0;
+
+   __m256i K60 = _mm256_set1_epi32( K256[60] );
+   H = _mm256_add_epi32( H, K60 );
+
+   G = _mm256_add_epi32( T1_57, _mm256_add_epi32( BSG2_0x( H ),
+                                                  MAJx( H, A, B ) ) );
+#if !defined(__AVX512VL__)
+   Y_xor_Z = X_xor_Y;
+#endif
+
+   F = _mm256_add_epi32( T1_58, _mm256_add_epi32( BSG2_0x( G ),
+                                                  MAJx( G, H, A ) ) );
+#if !defined(__AVX512VL__)
+   Y_xor_Z = X_xor_Y;
+#endif
+
+   E = _mm256_add_epi32( T1_59, _mm256_add_epi32( BSG2_0x( F ),
+                                                  MAJx( F, G, H ) ) );
+#if !defined(__AVX512VL__)
+   Y_xor_Z = X_xor_Y;
+#endif
+
+   D = mm256_add4_32( T1_60, BSG2_0x( E ), MAJx( E, F, G ), K60 );
+#if !defined(__AVX512VL__)
+   Y_xor_Z = X_xor_Y;
+#endif
+
+   W[13] = SHA2x_MEXP( W[11],  W[6], W[14], W[13] );
+   W[14] = SHA2x_MEXP( W[12],  W[7], W[15], W[14] );
+   W[15] = SHA2x_MEXP( W[13],  W[8], W[ 0], W[15] );
+
+   SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 );
+   SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 );
+   SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 );
+
+   state_out[0] = _mm256_add_epi32( state_in[0], A );
+   state_out[1] = _mm256_add_epi32( state_in[1], B );
+   state_out[2] = _mm256_add_epi32( state_in[2], C );
+   state_out[3] = _mm256_add_epi32( state_in[3], D );
+   state_out[4] = _mm256_add_epi32( state_in[4], E );
+   state_out[5] = _mm256_add_epi32( state_in[5], F );
+   state_out[6] = _mm256_add_epi32( state_in[6], G );
+   state_out[7] = _mm256_add_epi32( state_in[7], H );
+   return 1;
+}
+
+void sha256_8way_init( sha256_8way_context *sc )
+{
+   sc->count_high = sc->count_low = 0;
+   sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 );
+   sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
+   sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
+   sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A );
+   sc->val[4] = m256_const1_64( 0x510E527F510E527F );
+   sc->val[5] = m256_const1_64( 0x9B05688C9B05688C );
+   sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
+   sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
+}
 
 // need to handle odd byte length for yespower.
 // Assume only last update is odd.
@@ -778,7 +1081,25 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
                                       _mm512_srli_epi32( x, 10 ) )
 
 #define SHA2x16_MEXP( a, b, c, d ) \
-     mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] );
+     mm512_add4_32( SSG2_1x16( a ), b, SSG2_0x16( c ), d );
+
+#define SHA256x16_MSG_EXPANSION( W ) \
+   W[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \
+   W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] ); \
+   W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \
+   W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \
+   W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \
+   W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \
+   W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \
+   W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \
+   W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \
+   W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \
+   W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] ); \
+   W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] ); \
+   W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] ); \
+   W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] ); \
+   W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] ); \
+   W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] );
 
 #define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
 do { \
@@ -806,6 +1127,23 @@ do { \
 } while (0)
 */
 
+#define SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
+   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, j ); \
+   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, j ); \
+   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, j ); \
+   SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, j ); \
+   SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, j ); \
+   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, j ); \
+   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B,  6, j ); \
+   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A,  7, j ); \
+   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  8, j ); \
+   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  9, j ); \
+   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \
+   SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \
+   SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \
+   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \
+   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \
+   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
 
 static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W,
                                            const  __m512i *in ) \
@@ -820,59 +1158,13 @@ static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W,
    G = _mm512_load_si512( in+6 );
    H = _mm512_load_si512( in+7 );
 
-   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
-   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
-   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
-   SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
-   SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
-   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
-   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B,  6, 0 );
-   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A,  7, 0 );
-   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  8, 0 );
-   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  9, 0 );
-   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
-   SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
-   SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
-   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
-   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
-   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
-
-   for ( int j = 16; j < 64; j += 16 )
-   {
-      W[ 0] = SHA2x16_MEXP( 14,  9,  1,  0 );
-      W[ 1] = SHA2x16_MEXP( 15, 10,  2,  1 );
-      W[ 2] = SHA2x16_MEXP(  0, 11,  3,  2 );
-      W[ 3] = SHA2x16_MEXP(  1, 12,  4,  3 );
-      W[ 4] = SHA2x16_MEXP(  2, 13,  5,  4 );
-      W[ 5] = SHA2x16_MEXP(  3, 14,  6,  5 );
-      W[ 6] = SHA2x16_MEXP(  4, 15,  7,  6 );
-      W[ 7] = SHA2x16_MEXP(  5,  0,  8,  7 );
-      W[ 8] = SHA2x16_MEXP(  6,  1,  9,  8 );
-      W[ 9] = SHA2x16_MEXP(  7,  2, 10,  9 );
-      W[10] = SHA2x16_MEXP(  8,  3, 11, 10 );
-      W[11] = SHA2x16_MEXP(  9,  4, 12, 11 );
-      W[12] = SHA2x16_MEXP( 10,  5, 13, 12 );
-      W[13] = SHA2x16_MEXP( 11,  6, 14, 13 );
-      W[14] = SHA2x16_MEXP( 12,  7, 15, 14 );
-      W[15] = SHA2x16_MEXP( 13,  8,  0, 15 );
-
-      SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
-      SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
-      SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
-      SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
-      SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
-      SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
-      SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
-      SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
-      SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
-      SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
-      SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
-      SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
-      SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
-      SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
-      SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
-      SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
-   }
+   SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H,  0 );   
+   SHA256x16_MSG_EXPANSION( W );
+   SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
+   SHA256x16_MSG_EXPANSION( W );
+   SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
+   SHA256x16_MSG_EXPANSION( W );
+   SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
 
    out[0] = _mm512_add_epi32( in[0], A );
    out[1] = _mm512_add_epi32( in[1], B );
@@ -903,11 +1195,36 @@ void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
    SHA256_16WAY_TRANSFORM( state_out, W, state_in );
 }
  
-// Aggresive prehashing, LE byte order
-void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
-                             const __m512i *state_in )
+// Aggressive prehashing, LE byte order
+void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, 
+                                   const __m512i *W, const __m512i *state_in )
 {
    __m512i A, B, C, D, E, F, G, H;
+   
+   // precalculate constant part msg expansion for second iteration.
+   X[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
+   X[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
+   X[ 2] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 0] ), W[11] ),
+                             W[ 2] );
+   X[ 3] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 1] ), W[12] ),
+                             SSG2_0x16( W[ 4] ) );         
+   X[ 4] = _mm512_add_epi32( _mm512_add_epi32( W[13], SSG2_0x16( W[ 5] ) ),
+                             W[ 4] );
+   X[ 5] = _mm512_add_epi32( _mm512_add_epi32( W[14], SSG2_0x16( W[ 6] ) ),
+                             W[ 5] );
+   X [6] = _mm512_add_epi32( _mm512_add_epi32( W[15], SSG2_0x16( W[ 7] ) ),
+                             W[ 6] ); 
+   X[ 7] = _mm512_add_epi32( _mm512_add_epi32( X[ 0], SSG2_0x16( W[ 8] ) ),
+                             W[ 7] );
+   X[ 8] = _mm512_add_epi32( _mm512_add_epi32( X[ 1], SSG2_0x16( W[ 9] ) ),
+                             W[ 8] );
+   X[ 9] = _mm512_add_epi32( SSG2_0x16( W[10] ), W[ 9] );
+   X[10] = _mm512_add_epi32( SSG2_0x16( W[11] ), W[10] );
+   X[11] = _mm512_add_epi32( SSG2_0x16( W[12] ), W[11] );
+   X[12] = _mm512_add_epi32( SSG2_0x16( W[13] ), W[12] );
+   X[13] = _mm512_add_epi32( SSG2_0x16( W[14] ), W[13] );
+   X[14] = _mm512_add_epi32( SSG2_0x16( W[15] ), W[14] );
+   X[15] = _mm512_add_epi32( SSG2_0x16( X[ 0] ), W[15] );
 
    A = _mm512_load_si512( state_in     );
    B = _mm512_load_si512( state_in + 1 );
@@ -933,7 +1250,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
 }   
 
 void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
-                          const __m512i *state_in, const __m512i *state_mid )
+          const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
 {
    __m512i A, B, C, D, E, F, G, H;
    __m512i W[16];
@@ -949,9 +1266,6 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
    G = _mm512_load_si512( state_mid + 6 );
    H = _mm512_load_si512( state_mid + 7 );
 
-//   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
-//   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
-//   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
    SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
    SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
    SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
@@ -966,42 +1280,36 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
    SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
    SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
 
-   for ( int j = 16; j < 64; j += 16 )
-   {
-      W[ 0] = SHA2x16_MEXP( 14,  9,  1,  0 );
-      W[ 1] = SHA2x16_MEXP( 15, 10,  2,  1 );
-      W[ 2] = SHA2x16_MEXP(  0, 11,  3,  2 );
-      W[ 3] = SHA2x16_MEXP(  1, 12,  4,  3 );
-      W[ 4] = SHA2x16_MEXP(  2, 13,  5,  4 );
-      W[ 5] = SHA2x16_MEXP(  3, 14,  6,  5 );
-      W[ 6] = SHA2x16_MEXP(  4, 15,  7,  6 );
-      W[ 7] = SHA2x16_MEXP(  5,  0,  8,  7 );
-      W[ 8] = SHA2x16_MEXP(  6,  1,  9,  8 );
-      W[ 9] = SHA2x16_MEXP(  7,  2, 10,  9 );
-      W[10] = SHA2x16_MEXP(  8,  3, 11, 10 );
-      W[11] = SHA2x16_MEXP(  9,  4, 12, 11 );
-      W[12] = SHA2x16_MEXP( 10,  5, 13, 12 );
-      W[13] = SHA2x16_MEXP( 11,  6, 14, 13 );
-      W[14] = SHA2x16_MEXP( 12,  7, 15, 14 );
-      W[15] = SHA2x16_MEXP( 13,  8,  0, 15 );
+   // update precalculated msg expansion with new nonce: W[3].
+   W[ 0] = X[ 0];
+   W[ 1] = X[ 1];
+   W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
+   W[ 3] = _mm512_add_epi32( X[ 3], W[ 3] );
+   W[ 4] = _mm512_add_epi32( X[ 4], SSG2_1x16( W[ 2] ) );
+   W[ 5] = _mm512_add_epi32( X[ 5], SSG2_1x16( W[ 3] ) );
+   W[ 6] = _mm512_add_epi32( X[ 6], SSG2_1x16( W[ 4] ) );
+   W[ 7] = _mm512_add_epi32( X[ 7], SSG2_1x16( W[ 5] ) );
+   W[ 8] = _mm512_add_epi32( X[ 8], SSG2_1x16( W[ 6] ) );
+   W[ 9] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[ 7] ),
+                                                      W[ 2] ) );
+   W[10] = _mm512_add_epi32( X[10], _mm512_add_epi32( SSG2_1x16( W[ 8] ),
+                                                      W[ 3] ) );
+   W[11] = _mm512_add_epi32( X[11], _mm512_add_epi32( SSG2_1x16( W[ 9] ),
+                                                      W[ 4] ) );
+   W[12] = _mm512_add_epi32( X[12], _mm512_add_epi32( SSG2_1x16( W[10] ),
+                                                      W[ 5] ) );
+   W[13] = _mm512_add_epi32( X[13], _mm512_add_epi32( SSG2_1x16( W[11] ),
+                                                      W[ 6] ) );
+   W[14] = _mm512_add_epi32( X[14], _mm512_add_epi32( SSG2_1x16( W[12] ),
+                                                      W[ 7] ) );
+   W[15] = _mm512_add_epi32( X[15], _mm512_add_epi32( SSG2_1x16( W[13] ),
+                                                      W[ 8] ) );
 
-      SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
-      SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
-      SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
-      SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
-      SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
-      SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
-      SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
-      SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
-      SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
-      SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
-      SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
-      SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
-      SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
-      SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
-      SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
-      SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
-   }
+   SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
+   SHA256x16_MSG_EXPANSION( W );
+   SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
+   SHA256x16_MSG_EXPANSION( W );
+   SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
 
    A = _mm512_add_epi32( A, _mm512_load_si512( state_in     ) );
    B = _mm512_add_epi32( B, _mm512_load_si512( state_in + 1 ) );
@@ -1022,6 +1330,105 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
    _mm512_store_si512( state_out + 7,  H );
 }
 
+// returns 0 if hash aborted early and invalid.
+int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
+                                     const __m512i *state_in )
+{
+   __m512i A, B, C, D, E, F, G, H;
+   __m512i W[16];      memcpy_512( W, data, 16 );
+   // Value for H at round 60, before adding K, to produce valid final hash
+   //where H == 0.
+   // H_ =  -( H256[7] + K256[60] );
+   const __m512i H_ = m512_const1_32( 0x136032ED );
+
+   A = _mm512_load_si512( state_in   );
+   B = _mm512_load_si512( state_in+1 );
+   C = _mm512_load_si512( state_in+2 );
+   D = _mm512_load_si512( state_in+3 );
+   E = _mm512_load_si512( state_in+4 );
+   F = _mm512_load_si512( state_in+5 );
+   G = _mm512_load_si512( state_in+6 );
+   H = _mm512_load_si512( state_in+7 );
+
+   SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
+   SHA256x16_MSG_EXPANSION( W );
+   SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
+   SHA256x16_MSG_EXPANSION( W );
+   SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
+
+   W[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
+   W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
+   W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
+   W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
+   W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
+   W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
+   W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
+   W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
+   W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
+   W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
+   W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] );
+   W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] );
+   W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] );
+   
+   // Rounds 48 to 56
+   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, 48 );
+   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, 48 );
+   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, 48 );
+   SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, 48 );
+   SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, 48 );
+   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, 48 );
+   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B,  6, 48 );
+   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A,  7, 48 );
+   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  8, 48 );
+
+   // Rounds 57 to 60 part 1
+   __m512i T1_57 = _mm512_add_epi32( _mm512_set1_epi32( K256[57] ),
+                  mm512_add4_32( BSG2_1x16( D ), CHx16( D, E, F ), W[ 9], G ) );
+   C = _mm512_add_epi32( C, T1_57 );
+   __m512i T1_58 = _mm512_add_epi32( _mm512_set1_epi32( K256[58] ), 
+                  mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
+   B = _mm512_add_epi32( B, T1_58 );
+   __m512i T1_59 = _mm512_add_epi32( _mm512_set1_epi32( K256[59] ), 
+                  mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
+   A = _mm512_add_epi32( A, T1_59 );
+   __m512i T1_60 = mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D );
+   H = _mm512_add_epi32( H, T1_60 );
+
+   // give up?
+   if ( _mm512_cmpeq_epi32_mask( H, H_ ) == 0 ) return 0;   
+
+   // Rounds 57 to 60 part 2
+   __m512i K60 = _mm512_set1_epi32( K256[60] );
+   H = _mm512_add_epi32( H, K60 );
+
+   G = _mm512_add_epi32( T1_57, _mm512_add_epi32( BSG2_0x16( H ),
+                                                  MAJx16( H, A, B ) ) );
+   F = _mm512_add_epi32( T1_58, _mm512_add_epi32( BSG2_0x16( G ),
+                                                  MAJx16( G, H, A ) ) );
+   E = _mm512_add_epi32( T1_59, _mm512_add_epi32( BSG2_0x16( F ),
+                                                  MAJx16( F, G, H ) ) );
+   D = mm512_add4_32( T1_60, BSG2_0x16( E ), MAJx16( E, F, G ), K60 );
+
+   // Rounds 61 to 63
+   W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] );
+   W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] );
+   W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] );
+   
+   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 );
+   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 );
+   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 );
+   
+   state_out[0] = _mm512_add_epi32( state_in[0], A );
+   state_out[1] = _mm512_add_epi32( state_in[1], B );
+   state_out[2] = _mm512_add_epi32( state_in[2], C );
+   state_out[3] = _mm512_add_epi32( state_in[3], D );
+   state_out[4] = _mm512_add_epi32( state_in[4], E );
+   state_out[5] = _mm512_add_epi32( state_in[5], F );
+   state_out[6] = _mm512_add_epi32( state_in[6], G );
+   state_out[7] = _mm512_add_epi32( state_in[7], H );
+   return 1;
+}
+  
 void sha256_16way_init( sha256_16way_context *sc )
 {
    sc->count_high = sc->count_low = 0;
diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c
index c69ad58..18eceff 100644
--- a/algo/sha/sha256d-4way.c
+++ b/algo/sha/sha256d-4way.c
@@ -10,13 +10,14 @@
 int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr )
 {
+   __m512i  vdata[32]    __attribute__ ((aligned (128)));
    __m512i  block[16]    __attribute__ ((aligned (64)));
-   __m512i  hash32[8]    __attribute__ ((aligned (32)));
-   __m512i  initstate[8] __attribute__ ((aligned (32)));
-   __m512i  midstate1[8] __attribute__ ((aligned (32)));
-   __m512i  midstate2[8] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   __m512i  vdata[20]    __attribute__ ((aligned (32)));
+   __m512i  hash32[8]    __attribute__ ((aligned (64)));
+   __m512i  initstate[8] __attribute__ ((aligned (64)));
+   __m512i  midstate1[8] __attribute__ ((aligned (64)));
+   __m512i  midstate2[8] __attribute__ ((aligned (64)));
+   __m512i  mexp_pre[16] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
    uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
    uint32_t *pdata = work->data;
    const uint32_t *ptarget = work->target;
@@ -36,6 +37,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
    *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
                                n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
 
+   vdata[16+4] = last_byte;
+   memset_zero_512( vdata+16 + 5, 10 );
+   vdata[16+15] = m512_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_512( block + 9, 6 );
+   block[15] = m512_const1_32( 32*8 ); // bit count
+   
    // initialize state
    initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
    initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
@@ -49,39 +58,33 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
    sha256_16way_transform_le( midstate1, vdata, initstate );
 
    // Do 3 rounds on the first 12 bytes of the next block
-   sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
+   sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
 
    do
    {
       // 1. final 16 bytes of data, with padding
-      memcpy_512( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_512( block + 5, 10 );
-      block[15] = m512_const1_32( 80*8 ); // bit count
-      sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
+      sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                 mexp_pre );
 
       // 2. 32 byte hash from 1.
-      memcpy_512( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_512( block + 9, 6 );
-      block[15] = m512_const1_32( 32*8 ); // bit count
-      sha256_16way_transform_le( hash32, block, initstate );
-
-      // byte swap final hash for testing
-      mm512_block_bswap_32( hash32, hash32 );
-
-      for ( int lane = 0; lane < 16; lane++ )
-      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      if ( sha256_16way_transform_le_short( hash32, block, initstate ) )
       {
-         extr_lane_16x32( lane_hash, hash32, lane, 256 );
-         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         // byte swap final hash for testing
+         mm512_block_bswap_32( hash32, hash32 );
+
+         for ( int lane = 0; lane < 16; lane++ )
+         if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
          {
-            pdata[19] = n + lane;
-            submit_solution( work, lane_hash, mythr );
+            extr_lane_16x32( lane_hash, hash32, lane, 256 );
+            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+            {
+               pdata[19] = n + lane;
+               submit_solution( work, lane_hash, mythr );
+            }
          }
-       }
-       *noncev = _mm512_add_epi32( *noncev, sixteen );
-       n += 16;
+      }
+      *noncev = _mm512_add_epi32( *noncev, sixteen );
+      n += 16;
    } while ( (n < last_nonce) && !work_restart[thr_id].restart );
    pdata[19] = n;
    *hashes_done = n - first_nonce;
@@ -95,13 +98,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
 int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr )
 {
-   __m256i  block[16]    __attribute__ ((aligned (64)));
+   __m256i  vdata[32]    __attribute__ ((aligned (64)));
+   __m256i  block[16]    __attribute__ ((aligned (32)));
    __m256i  hash32[8]    __attribute__ ((aligned (32)));
    __m256i  initstate[8] __attribute__ ((aligned (32)));
    __m256i  midstate1[8] __attribute__ ((aligned (32)));
    __m256i  midstate2[8] __attribute__ ((aligned (32)));
+   __m256i  mexp_pre[16] __attribute__ ((aligned (32)));
    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   __m256i  vdata[20]    __attribute__ ((aligned (32)));
    uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
    uint32_t *pdata = work->data;
    const uint32_t *ptarget = work->target;
@@ -120,6 +124,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
 
    *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
 
+   vdata[16+4] = last_byte;
+   memset_zero_256( vdata+16 + 5, 10 );
+   vdata[16+15] = m256_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_256( block + 9, 6 );
+   block[15] = m256_const1_32( 32*8 ); // bit count
+   
    // initialize state
    initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
    initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
@@ -133,35 +145,30 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
    sha256_8way_transform_le( midstate1, vdata, initstate );
    
    // Do 3 rounds on the first 12 bytes of the next block
-   sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
+   sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
 
    do
    {
       // 1. final 16 bytes of data, with padding
-      memcpy_256( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_256( block + 5, 10 );
-      block[15] = m256_const1_32( 80*8 ); // bit count
-      sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
+      sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                mexp_pre );
 
       // 2. 32 byte hash from 1.
-      memcpy_256( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_256( block + 9, 6 );
-      block[15] = m256_const1_32( 32*8 ); // bit count
-      sha256_8way_transform_le( hash32, block, initstate );
-
-      // byte swap final hash for testing
-      mm256_block_bswap_32( hash32, hash32 );
-
-      for ( int lane = 0; lane < 8; lane++ )
-      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      if ( unlikely(
+               sha256_8way_transform_le_short( hash32, block, initstate ) ) )
       {
-         extr_lane_8x32( lane_hash, hash32, lane, 256 );
-         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         // byte swap final hash for testing
+         mm256_block_bswap_32( hash32, hash32 );
+
+         for ( int lane = 0; lane < 8; lane++ )
+         if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
          {
-            pdata[19] = n + lane;
-            submit_solution( work, lane_hash, mythr );
+            extr_lane_8x32( lane_hash, hash32, lane, 256 );
+            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+            {
+               pdata[19] = n + lane;
+               submit_solution( work, lane_hash, mythr );
+            }
          }
        }
        *noncev = _mm256_add_epi32( *noncev, eight );
@@ -179,12 +186,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
 int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr )
 {
-   __m128i  block[16]    __attribute__ ((aligned (64)));
-   __m128i  hash32[8]    __attribute__ ((aligned (32)));
-   __m128i  initstate[8] __attribute__ ((aligned (32)));
-   __m128i  midstate[8]  __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   __m128i  vdata[20]    __attribute__ ((aligned (32)));
+   __m128i  vdata[32]     __attribute__ ((aligned (64)));
+   __m128i  block[16]     __attribute__ ((aligned (32)));
+   __m128i  hash32[8]     __attribute__ ((aligned (32)));
+   __m128i  initstate[8]  __attribute__ ((aligned (32)));
+   __m128i  midstate1[8]   __attribute__ ((aligned (32)));
+   __m128i  midstate2[8]  __attribute__ ((aligned (32)));
+   __m128i  mexp_pre[16]  __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8]  __attribute__ ((aligned (32)));
    uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
    uint32_t *pdata = work->data;
    const uint32_t *ptarget = work->target;
@@ -203,6 +212,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
 
    *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
 
+   vdata[16+4] = last_byte;
+   memset_zero_128( vdata+16 + 5, 10 );
+   vdata[16+15] = m128_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_128( block + 9, 6 );
+   block[15] = m128_const1_32( 32*8 ); // bit count
+
    // initialize state
    initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
    initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
@@ -214,39 +231,36 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
    initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
 
    // hash first 64 bytes of data
-   sha256_4way_transform_le( midstate, vdata, initstate );
+   sha256_4way_transform_le( midstate1, vdata, initstate );
+   // Do 3 rounds on the first 12 bytes of the next block
+   sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
 
    do
    {
       // 1. final 16 bytes of data, with padding
-      memcpy_128( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_128( block + 5, 10 );
-      block[15] = m128_const1_32( 80*8 ); // bit count
-      sha256_4way_transform_le( hash32, block, midstate );
+      sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                mexp_pre );
 
       // 2. 32 byte hash from 1.
-      memcpy_128( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_128( block + 9, 6 );
-      block[15] = m128_const1_32( 32*8 ); // bit count
-      sha256_4way_transform_le( hash32, block, initstate );
-
-      // byte swap final hash for testing
-      mm128_block_bswap_32( hash32, hash32 );
-
-      for ( int lane = 0; lane < 4; lane++ )
-      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      if ( unlikely(
+              sha256_4way_transform_le_short( hash32, block, initstate ) ) )
       {
-         extr_lane_4x32( lane_hash, hash32, lane, 256 );
-         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         // byte swap final hash for testing
+         mm128_block_bswap_32( hash32, hash32 );
+
+         for ( int lane = 0; lane < 4; lane++ )
+         if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
          {
-            pdata[19] = n + lane;
-            submit_solution( work, lane_hash, mythr );
+            extr_lane_4x32( lane_hash, hash32, lane, 256 );
+            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+            {
+               pdata[19] = n + lane;
+               submit_solution( work, lane_hash, mythr );
+            }
          }
-       }
-       *noncev = _mm_add_epi32( *noncev, four );
-       n += 4;
+      }
+      *noncev = _mm_add_epi32( *noncev, four );
+      n += 4;
    } while ( (n < last_nonce) && !work_restart[thr_id].restart );
    pdata[19] = n;
    *hashes_done = n - first_nonce;
diff --git a/algo/sha/sha256d-4way.h b/algo/sha/sha256d-4way.h
index 9051ec4..bae0214 100644
--- a/algo/sha/sha256d-4way.h
+++ b/algo/sha/sha256d-4way.h
@@ -6,12 +6,10 @@
 
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
   #define SHA256D_16WAY 1
-/*
 #elif defined(__AVX2__)
   #define SHA256D_8WAY 1
 #else
   #define SHA256D_4WAY 1
-*/
 #endif
 
 bool register_sha256d_algo( algo_gate_t* gate );
@@ -21,7 +19,7 @@ bool register_sha256d_algo( algo_gate_t* gate );
 int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );
 #endif
-/*
+
 #if defined(SHA256D_8WAY)
 
 int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
@@ -33,7 +31,7 @@ int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
 int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );
 #endif
-*/
+
 
 /*
 #if defined(__SHA__)
diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c
index 9cd3a22..9c1677b 100644
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -10,13 +10,14 @@
 int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr )
 {
+   __m512i  vdata[32]    __attribute__ ((aligned (128)));
    __m512i  block[16]    __attribute__ ((aligned (64)));
-   __m512i  hash32[8]    __attribute__ ((aligned (32)));
-   __m512i  initstate[8] __attribute__ ((aligned (32)));
-   __m512i  midstate1[8] __attribute__ ((aligned (32)));
-   __m512i  midstate2[8] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   __m512i  vdata[20]    __attribute__ ((aligned (32)));
+   __m512i  hash32[8]    __attribute__ ((aligned (64)));
+   __m512i  initstate[8] __attribute__ ((aligned (64)));
+   __m512i  midstate1[8] __attribute__ ((aligned (64)));
+   __m512i  midstate2[8] __attribute__ ((aligned (64)));
+   __m512i  mexp_pre[16] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
    uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
    uint32_t *pdata = work->data;
    const uint32_t *ptarget = work->target;
@@ -36,7 +37,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
    *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
                                n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
 
-   // initialize state
+   vdata[16+4] = last_byte;
+   memset_zero_512( vdata+16 + 5, 10 );
+   vdata[16+15] = m512_const1_32( 80*8 ); // bit count
+   
+   block[ 8] = last_byte;
+   memset_zero_512( block + 9, 6 );
+   block[15] = m512_const1_32( 32*8 ); // bit count
+   
    initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
    initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
    initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
@@ -49,43 +57,37 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
    sha256_16way_transform_le( midstate1, vdata, initstate );
    
    // Do 3 rounds on the first 12 bytes of the next block
-   sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
+   sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
 
    do
    {
-      // 1. final 16 bytes of data, with padding
-      memcpy_512( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_512( block + 5, 10 );  
-      block[15] = m512_const1_32( 80*8 ); // bit count
-      sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
+      // 1. final 16 bytes of data, pre-padded
+      sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                 mexp_pre );
 
       // 2. 32 byte hash from 1.
-      memcpy_512( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_512( block + 9, 6 );
-      block[15] = m512_const1_32( 32*8 ); // bit count
-      sha256_16way_transform_le( hash32, block, initstate );
+      sha256_16way_transform_le( block, block, initstate );
 
       // 3. 32 byte hash from 2.
-      memcpy_512( block, hash32, 8 );
-      sha256_16way_transform_le( hash32, block, initstate );
-
-      // byte swap final hash for testing
-      mm512_block_bswap_32( hash32, hash32 );    
-
-      for ( int lane = 0; lane < 16; lane++ )
-      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      if ( unlikely(
+               sha256_16way_transform_le_short( hash32, block, initstate ) ) )
       {
-         extr_lane_16x32( lane_hash, hash32, lane, 256 );
-         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         // byte swap final hash for testing
+         mm512_block_bswap_32( hash32, hash32 );    
+
+         for ( int lane = 0; lane < 16; lane++ )
+         if ( hash32_d7[ lane ] <= targ32_d7 )
          {
-            pdata[19] = n + lane;
-            submit_solution( work, lane_hash, mythr );
+            extr_lane_16x32( lane_hash, hash32, lane, 256 );
+            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+            {
+               pdata[19] = n + lane;
+               submit_solution( work, lane_hash, mythr );
+            }
          }
-       }
-       *noncev = _mm512_add_epi32( *noncev, sixteen );
-       n += 16;
+      }
+      *noncev = _mm512_add_epi32( *noncev, sixteen );
+      n += 16;
    } while ( (n < last_nonce) && !work_restart[thr_id].restart );
    pdata[19] = n;
    *hashes_done = n - first_nonce;
@@ -100,13 +102,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
 int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr )
 {
-   __m256i  block[16]    __attribute__ ((aligned (64)));
+   __m256i  vdata[32]    __attribute__ ((aligned (64)));
+   __m256i  block[16]    __attribute__ ((aligned (32)));
    __m256i  hash32[8]    __attribute__ ((aligned (32)));
    __m256i  initstate[8] __attribute__ ((aligned (32)));
    __m256i  midstate1[8] __attribute__ ((aligned (32)));
    __m256i  midstate2[8] __attribute__ ((aligned (32)));
+   __m256i  mexp_pre[16] __attribute__ ((aligned (32)));
    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   __m256i  vdata[20]    __attribute__ ((aligned (32)));
    uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
    uint32_t *pdata = work->data;
    const uint32_t *ptarget = work->target;
@@ -125,6 +128,14 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
 
    *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
 
+   vdata[16+4] = last_byte;
+   memset_zero_256( vdata+16 + 5, 10 );
+   vdata[16+15] = m256_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_256( block + 9, 6 );
+   block[15] = m256_const1_32( 32*8 ); // bit count
+   
    // initialize state
    initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
    initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
@@ -138,43 +149,37 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
    sha256_8way_transform_le( midstate1, vdata, initstate );
 
    // Do 3 rounds on the first 12 bytes of the next block
-   sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
+   sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
    
    do
    {
       // 1. final 16 bytes of data, with padding
-      memcpy_256( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_256( block + 5, 10 );
-      block[15] = m256_const1_32( 80*8 ); // bit count
-      sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
+      sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                mexp_pre );
 
       // 2. 32 byte hash from 1.
-      memcpy_256( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_256( block + 9, 6 );
-      block[15] = m256_const1_32( 32*8 ); // bit count
-      sha256_8way_transform_le( hash32, block, initstate );
+      sha256_8way_transform_le( block, block, initstate );
 
       // 3. 32 byte hash from 2.
-      memcpy_256( block, hash32, 8 );
-      sha256_8way_transform_le( hash32, block, initstate );
-
-      // byte swap final hash for testing
-      mm256_block_bswap_32( hash32, hash32 );
-
-      for ( int lane = 0; lane < 8; lane++ )
-      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      if ( unlikely(
+               sha256_8way_transform_le_short( hash32, block, initstate ) ) )
       {
-         extr_lane_8x32( lane_hash, hash32, lane, 256 );
-         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         // byte swap final hash for testing
+         mm256_block_bswap_32( hash32, hash32 );
+
+         for ( int lane = 0; lane < 8; lane++ )
+         if ( hash32_d7[ lane ] <= targ32_d7 )
          {
-            pdata[19] = n + lane;
-            submit_solution( work, lane_hash, mythr );
+            extr_lane_8x32( lane_hash, hash32, lane, 256 );
+            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+            {
+               pdata[19] = n + lane;
+               submit_solution( work, lane_hash, mythr );
+            }
          }
-       }
-       *noncev = _mm256_add_epi32( *noncev, eight );
-       n += 8;
+      }
+      *noncev = _mm256_add_epi32( *noncev, eight );
+      n += 8;
    } while ( (n < last_nonce) && !work_restart[thr_id].restart );
    pdata[19] = n;
    *hashes_done = n - first_nonce;
@@ -183,17 +188,110 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
 
 #endif
 
+
 #if defined(SHA256T_4WAY)
 
+// Optimizations are slower with AVX/SSE2
+// https://github.com/JayDDee/cpuminer-opt/issues/344
+/*
+int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{
+   __m128i  vdata[32]     __attribute__ ((aligned (64)));
+   __m128i  block[16]     __attribute__ ((aligned (32)));
+   __m128i  hash32[8]     __attribute__ ((aligned (32)));
+   __m128i  initstate[8]  __attribute__ ((aligned (32)));
+   __m128i  midstate1[8]  __attribute__ ((aligned (32)));
+   __m128i  midstate2[8]  __attribute__ ((aligned (32)));
+   __m128i  mexp_pre[16]  __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8]  __attribute__ ((aligned (32)));
+   uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t targ32_d7 = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
+   uint32_t n = first_nonce;
+   __m128i *noncev = vdata + 19;
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m128i last_byte = m128_const1_32( 0x80000000 );
+   const __m128i four = m128_const1_32( 4 );
+
+   for ( int i = 0; i < 19; i++ )
+       vdata[i] = m128_const1_32( pdata[i] );
+
+   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
+
+   vdata[16+4] = last_byte;
+   memset_zero_128( vdata+16 + 5, 10 );
+   vdata[16+15] = m128_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_128( block + 9, 6 );
+   block[15] = m128_const1_32( 32*8 ); // bit count
+   
+   // initialize state
+   initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
+   initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
+   initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
+   initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
+   initstate[4] = m128_const1_64( 0x510E527F510E527F );
+   initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
+   initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
+   initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
+
+   // hash first 64 bytes of data
+   sha256_4way_transform_le( midstate1, vdata, initstate );
+
+   // Do 3 rounds on the first 12 bytes of the next block
+   sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
+
+   do
+   {
+      // 1. final 16 bytes of data, with padding
+      sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                mexp_pre );
+
+      // 2. 32 byte hash from 1.
+      sha256_4way_transform_le( block, block, initstate );
+
+      // 3. 32 byte hash from 2.
+      if ( unlikely(
+              sha256_4way_transform_le_short( hash32, block, initstate ) ) )
+      {   
+         // byte swap final hash for testing
+         mm128_block_bswap_32( hash32, hash32 );
+
+         for ( int lane = 0; lane < 4; lane++ )
+         if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+         {
+            extr_lane_4x32( lane_hash, hash32, lane, 256 );
+            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+            {
+               pdata[19] = n + lane;
+               submit_solution( work, lane_hash, mythr );
+            }
+         }
+      }
+      *noncev = _mm_add_epi32( *noncev, four );
+      n += 4;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+*/
+
 int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr )
 {
-   __m128i  block[16]    __attribute__ ((aligned (64)));
+   __m128i  vdata[32]    __attribute__ ((aligned (64)));
+   __m128i  block[16]    __attribute__ ((aligned (32)));
    __m128i  hash32[8]    __attribute__ ((aligned (32)));
    __m128i  initstate[8] __attribute__ ((aligned (32)));
    __m128i  midstate[8]  __attribute__ ((aligned (32)));
    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   __m128i  vdata[20]    __attribute__ ((aligned (32)));
    uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
    uint32_t *pdata = work->data;
    const uint32_t *ptarget = work->target;
@@ -212,6 +310,14 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
 
    *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
 
+   vdata[16+4] = last_byte;
+   memset_zero_128( vdata+16 + 5, 10 );
+   vdata[16+15] = m128_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_128( block + 9, 6 );
+   block[15] = m128_const1_32( 32*8 ); // bit count
+   
    // initialize state
    initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
    initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
@@ -227,25 +333,9 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
 
    do
    {
-      // 1. final 16 bytes of data, with padding
-      memcpy_128( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_128( block + 5, 10 );
-      block[15] = m128_const1_32( 80*8 ); // bit count
-      sha256_4way_transform_le( hash32, block, midstate );
-
-      // 2. 32 byte hash from 1.
-      memcpy_128( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_128( block + 9, 6 );
-      block[15] = m128_const1_32( 32*8 ); // bit count
-      sha256_4way_transform_le( hash32, block, initstate );
-
-      // 3. 32 byte hash from 2.
-      memcpy_128( block, hash32, 8 );
-      sha256_4way_transform_le( hash32, block, initstate );
-
-      // byte swap final hash for testing
+      sha256_4way_transform_le( block,  vdata+16, midstate  );
+      sha256_4way_transform_le( block,  block,    initstate );
+      sha256_4way_transform_le( hash32, block,    initstate );
       mm128_block_bswap_32( hash32, hash32 );
 
       for ( int lane = 0; lane < 4; lane++ )
@@ -266,5 +356,6 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
    return 0;
 }
 
+
 #endif
 
diff --git a/compat.h b/compat.h
index 124bc40..bd23f9c 100644
--- a/compat.h
+++ b/compat.h
@@ -3,6 +3,10 @@
 
 #ifdef WIN32
 
+#if _WIN32_WINNT==0x0601    // Windows 7
+ #define WINDOWS_CPU_GROUPS_ENABLED 1
+#endif
+
 #include <windows.h>
 #include <time.h>
 
diff --git a/configure b/configure
index 1882597..b93191f 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.2.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.3.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.18.2'
-PACKAGE_STRING='cpuminer-opt 3.18.2'
+PACKAGE_VERSION='3.18.3'
+PACKAGE_STRING='cpuminer-opt 3.18.3'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.18.2 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.18.3 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1404,7 +1404,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.18.2:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.18.3:";;
    esac
   cat <<\_ACEOF
 
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 3.18.2
+cpuminer-opt configure 3.18.3
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by cpuminer-opt $as_me 3.18.2, which was
+It was created by cpuminer-opt $as_me 3.18.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2993,7 +2993,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='cpuminer-opt'
- VERSION='3.18.2'
+ VERSION='3.18.3'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.18.2, which was
+This file was extended by cpuminer-opt $as_me 3.18.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.18.2
+cpuminer-opt config.status 3.18.3
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index bc5329c..8b80c38 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.18.2])
+AC_INIT([cpuminer-opt], [3.19.0])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index 061bbb9..179881c 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -3,7 +3,7 @@
  * Copyright 2012-2014 pooler
  * Copyright 2014 Lucas Jones
  * Copyright 2014-2016 Tanguy Pruvot
- * Copyright 2016-2020 Jay D Dee
+ * Copyright 2016-2021 Jay D Dee
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -115,22 +115,12 @@ int opt_param_n = 0;
 int opt_param_r = 0;
 int opt_n_threads = 0;
 bool opt_sapling = false;
-
-// Windows doesn't support 128 bit affinity mask.
-// Need compile time and run time test.
-#if defined(__linux) && defined(GCC_INT128)  
-#define AFFINITY_USES_UINT128 1
-static uint128_t opt_affinity = -1;
-static bool affinity_uses_uint128 = true;
-#else
-static uint64_t opt_affinity = -1;
-static bool affinity_uses_uint128 = false;
-#endif
-
+static uint64_t opt_affinity = 0xFFFFFFFFFFFFFFFFULL;  // default, use all cores
 int opt_priority = 0;  // deprecated
 int num_cpus = 1;
-int num_cpugroups = 1;
-char *rpc_url = NULL;;
+int num_cpugroups = 1;  // For Windows
+#define max_cpus 256   // max for affinity
+char *rpc_url = NULL;
 char *rpc_userpass = NULL;
 char *rpc_user, *rpc_pass;
 char *short_url = NULL;
@@ -166,6 +156,7 @@ uint32_t accepted_share_count = 0;
 uint32_t rejected_share_count = 0;
 uint32_t stale_share_count = 0;
 uint32_t solved_block_count = 0;
+uint32_t stratum_errors = 0;
 double *thr_hashrates;
 double global_hashrate = 0.;
 double total_hashes = 0.;
@@ -227,18 +218,21 @@ char*  lp_id;
 
 static void   workio_cmd_free(struct workio_cmd *wc);
 
-static void format_affinity_map( char *map_str, uint64_t map )
+// array mapping thread to cpu
+static uint8_t thread_affinity_map[ max_cpus ];
+
+// display affinity mask graphically
+static void format_affinity_mask( char *mask_str, uint64_t mask )
 {
    int n = num_cpus < 64 ? num_cpus : 64;
    int i;
-
    for ( i = 0; i < n; i++ )
    {
-      if ( map & 1 )  map_str[i] = '!';
-      else            map_str[i] = '.';
-      map >>= 1;
+      if ( mask & 1 )  mask_str[i] = '!';
+      else             mask_str[i] = '.';
+      mask >>= 1;
    }
-   memset( &map_str[i], 0, 64 - i );
+   memset( &mask_str[i], 0, 64 - i );
 }
 
 #ifdef __linux /* Linux specific policy and affinity management */
@@ -260,93 +254,70 @@ static inline void drop_policy(void)
 #define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */
 #endif
 
-// Linux affinity can use int128.
-#if AFFINITY_USES_UINT128
-static void affine_to_cpu_mask( int id, uint128_t mask )
-#else
-static void affine_to_cpu_mask( int id, uint64_t mask )
-#endif
+static void affine_to_cpu( struct thr_info *thr )
 {
+   int thread = thr->id;
    cpu_set_t set;
    CPU_ZERO( &set );
-   uint8_t ncpus = (num_cpus > 256) ? 256 : num_cpus;       
-
-   for ( uint8_t i = 0; i < ncpus; i++ ) 
-   {
-      // cpu mask
-#if AFFINITY_USES_UINT128
-      if( ( mask & ( (uint128_t)1 << i ) ) )  CPU_SET( i, &set );
-#else
-      if( (ncpus > 64) || ( mask & (1 << i) ) )  CPU_SET( i, &set );
-#endif
-   }
-   if ( id == -1 )
-   {
-      // process affinity
-      sched_setaffinity(0, sizeof(&set), &set);
-   }
-   else
-   {
-      // thread only
-      pthread_setaffinity_np(thr_info[id].pth, sizeof(&set), &set);
-   }
+   CPU_SET( thread_affinity_map[ thread ], &set );
+   if ( opt_debug )
+      applog( LOG_INFO, "Binding thread %d to cpu %d",
+                        thread, thread_affinity_map[ thread ] );
+   pthread_setaffinity_np( thr->pth, sizeof(set), &set );
 }
 
 #elif defined(WIN32) /* Windows */
+
 static inline void drop_policy(void) { }
 
 // Windows CPU groups to manage more than 64 CPUs.
-static void affine_to_cpu_mask( int id, uint64_t mask )
+// mask arg is ignored
+static void affine_to_cpu( struct thr_info *thr )
 {
-   bool success;
+   int thread = thr->id;
    unsigned long last_error;    
-//   BOOL success;
-//   DWORD last_error;
+   bool ok;
 
-   if ( id == -1 )
-      success = SetProcessAffinityMask( GetCurrentProcess(), mask );
+#if defined(WINDOWS_CPU_GROUPS_ENABLED)
+   unsigned long group_size = GetActiveProcessorCount( 0 );
+   unsigned long group      = thread / group_size;
+   unsigned long cpu        = thread_affinity_map[ thread % group_size ];
 
-// Are Windows CPU Groups supported?
-#if _WIN32_WINNT==0x0601
-   else if ( num_cpugroups == 1 )
-	   success = SetThreadAffinityMask( GetCurrentThread(), mask );
-   else
-   {
-	   // Find the correct cpu group
-	   int cpu = id % num_cpus;
-	   int group;
-	   for( group = 0; group < num_cpugroups; group++ )
-	   {
-	      int cpus = GetActiveProcessorCount( group );
- 	      if ( cpu < cpus )  break;
-  	      cpu -= cpus;
-      }
+   GROUP_AFFINITY affinity;
+   affinity.Group = group;
+   affinity.Mask = 1ULL << cpu;
 
-	   if (opt_debug)
-         applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)",
-               id, cpu, group, (1ULL << cpu));
+   if ( opt_debug )
+      applog( LOG_INFO, "Binding thread %d to cpu %d in cpu group %d",
+                        thread, cpu, group );
+
+   ok = SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL );
 
-	   GROUP_AFFINITY affinity;
-	   affinity.Group = group;
-	   affinity.Mask = 1ULL << cpu;
-	   success = SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL );
-   }
 #else
-   else 
-      success = SetThreadAffinityMask( GetCurrentThread(), mask );
+
+   unsigned long cpu = thread_affinity_map[ thread ];
+   uint64_t mask = 1ULL << cpu;
+
+   if ( opt_debug )
+      applog( LOG_INFO, "Binding thread %d to cpu %d", thread, cpu );
+
+   ok = SetThreadAffinityMask( GetCurrentThread(), mask );
+
 #endif
 
-   if (!success)
+   if ( !ok )
    {
-	   last_error = GetLastError();
-	   applog(LOG_WARNING, "affine_to_cpu_mask for %u returned %x",
-               id, last_error);
+      last_error = GetLastError();
+      applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x",
+                           thread, last_error );
    }
-}
+}   
 
 #else
+
 static inline void drop_policy(void) { }
-static void affine_to_cpu_mask(int id, unsigned long mask) { }
+static void affine_to_cpu( struct thr_info *thr ) { }
+
 #endif
 
 // not very useful, just index the arrray directly.
@@ -1159,17 +1130,23 @@ void report_summary_log( bool force )
       applog2( prio, "Blocks Solved   %7d      %7d",
                solved, solved_block_count );
    }
+   if ( stratum_errors )
+      applog2( LOG_INFO, "Stratum errors               %7d", stratum_errors );
+
    applog2( LOG_INFO, "Hi/Lo Share Diff  %.5g /  %.5g",
             highest_share, lowest_share );
 
    int mismatch = submitted_share_count
          - ( accepted_share_count + stale_share_count + rejected_share_count );
+
    if ( mismatch )
    {
-      if ( mismatch != 1 )
-         applog2(LOG_MINR, "Count mismatch: %d, stats may be inaccurate", mismatch );
-      else
-         applog2(LOG_INFO, CL_LBL "Count mismatch, submitted share may still be pending" CL_N );
+      if ( stratum_errors )
+         applog2( LOG_MINR, "Count mismatch: %d, stats may be inaccurate",
+                            mismatch );
+      else if ( !opt_quiet )
+         applog2( LOG_INFO, CL_LBL
+                  "Count mismatch, submitted share may still be pending" CL_N );
    }
 }
 
@@ -2241,49 +2218,9 @@ static void *miner_thread( void *userdata )
 	   if ( opt_priority == 0 )
 	      drop_policy();
    }
+
    // CPU thread affinity
-   if ( num_cpus > 1 )
-   {
-#if AFFINITY_USES_UINT128
-      // Default affinity
-      if ( (opt_affinity == (uint128_t)(-1) ) && opt_n_threads > 1 )
-      {  
-         affine_to_cpu_mask( thr_id, (uint128_t)1 << (thr_id % num_cpus) );
-         if ( opt_debug )
-            applog( LOG_INFO, "Binding thread %d to cpu %d.",
-                    thr_id, thr_id % num_cpus,
-	                 u128_hi64( (uint128_t)1 << (thr_id % num_cpus) ),
-		              u128_lo64( (uint128_t)1 << (thr_id % num_cpus) ) );
-      }
-#else
-      if ( ( opt_affinity == -1 ) && ( opt_n_threads > 1 ) ) 
-      {
-         affine_to_cpu_mask( thr_id, 1 << (thr_id % num_cpus) );
-         if (opt_debug)
-            applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
-                thr_id, thr_id % num_cpus, 1 << (thr_id % num_cpus)) ;
-      }
-#endif
-      else   // Custom affinity
-      {
-         affine_to_cpu_mask( thr_id, opt_affinity );
-         if ( opt_debug )
-         {
-#if AFFINITY_USES_UINT128
-            if ( num_cpus > 64 )
-               applog( LOG_INFO, "Binding thread %d to mask %016llx %016llx",
-                                thr_id, u128_hi64( opt_affinity ), 
-                                        u128_lo64( opt_affinity ) );
-            else
-               applog( LOG_INFO, "Binding thread %d to mask %016llx",
-                                 thr_id, opt_affinity );
-#else
-            applog( LOG_INFO, "Binding thread %d to mask %016llx",
-                                 thr_id, opt_affinity );
-#endif
-         }
-      }
-   }  // num_cpus > 1
+   if ( opt_affinity && num_cpus > 1 )   affine_to_cpu( mythr );
 
    if ( !algo_gate.miner_thread_init( thr_id ) )
    {
@@ -2792,6 +2729,7 @@ static void *stratum_thread(void *userdata )
       {
           stratum_need_reset = false;
           stratum_down = true;
+          stratum_errors++;
           stratum_disconnect( &stratum );
           if ( strcmp( stratum.url, rpc_url ) )
           {
@@ -2809,6 +2747,7 @@ static void *stratum_thread(void *userdata )
       while ( !stratum.curl )
       {
          stratum_down = true;
+         restart_threads();
          pthread_rwlock_wrlock( &g_work_lock );
          g_work_time = 0;
          pthread_rwlock_unlock( &g_work_lock );
@@ -2830,7 +2769,6 @@ static void *stratum_thread(void *userdata )
          else
          {
             stratum_down = false;
-            restart_threads();
             applog(LOG_BLUE,"Stratum connection established" );
          }
       }
@@ -3137,7 +3075,7 @@ void parse_arg(int key, char *arg )
 {
 	char *p;
 	int v, i;
-	uint64_t ul;
+//	uint64_t ul;
 	double d;
 
 	switch( key )
@@ -3448,21 +3386,10 @@ void parse_arg(int key, char *arg )
 		break;
 #endif
 	case 1020:  // cpu-affinity
-		p = strstr(arg, "0x");
-		if ( p )
-			ul = strtoull( p, NULL, 16 );
-		else
-			ul = atoll( arg );
-#if AFFINITY_USES_UINT128
-// replicate the low 64 bits to make a full 128 bit mask if there are more
-// than 64 CPUs, otherwise zero extend the upper half.
-         opt_affinity = (uint128_t)ul;
-         if ( num_cpus > 64 )
-            opt_affinity |= opt_affinity << 64;
-#else
-         opt_affinity = ul;
-#endif
-		break;
+      p = strstr( arg, "0x" );
+      opt_affinity = p ? strtoull( p, NULL, 16 )
+                       : atoll( arg );
+      break;
 	case 1021:  // cpu-priority
 		v = atoi(arg);
 		if (v < 0 || v > 5)	/* sanity check */
@@ -3565,20 +3492,18 @@ static void parse_cmdline(int argc, char *argv[])
    while (1)
    {
 #if HAVE_GETOPT_LONG
-	key = getopt_long(argc, argv, short_options, options, NULL);
+      key = getopt_long(argc, argv, short_options, options, NULL);
 #else
-	key = getopt(argc, argv, short_options);
+      key = getopt(argc, argv, short_options);
 #endif
-	if (key < 0)
-		break;
-
-	parse_arg(key, optarg);
+      if ( key < 0 )   break;
+      parse_arg( key, optarg );
    }
-   if (optind < argc)
+   if ( optind < argc )
    {
-	fprintf(stderr, "%s: unsupported non-option argument -- '%s'\n",
-		argv[0], argv[optind]);
-        show_usage_and_exit(1);
+      fprintf( stderr, "%s: unsupported non-option argument -- '%s'\n",
+		                 argv[0], argv[optind]);
+      show_usage_and_exit(1);
    }
 }
 
@@ -3642,26 +3567,21 @@ int main(int argc, char *argv[])
 	rpc_user = strdup("");
 	rpc_pass = strdup("");
 
-   parse_cmdline(argc, argv);
-
 #if defined(WIN32)
-//	SYSTEM_INFO sysinfo;
-//	GetSystemInfo(&sysinfo);
-//	num_cpus = sysinfo.dwNumberOfProcessors;
-// What happens if GetActiveProcessorGroupCount called if groups not enabled?
 
 // Are Windows CPU Groups supported?
-#if _WIN32_WINNT==0x0601
+#if defined(WINDOWS_CPU_GROUPS_ENABLED)
  	num_cpus = 0;
 	num_cpugroups = GetActiveProcessorGroupCount();
-	for(  i = 0; i < num_cpugroups; i++ )
+	for( i = 0; i < num_cpugroups; i++ )
 	{
- 	   int cpus = GetActiveProcessorCount(i);
+ 	   int cpus = GetActiveProcessorCount( i );
 	   num_cpus += cpus;
 
 	   if (opt_debug)
-         applog(LOG_DEBUG, "Found %d cpus on cpu group %d", cpus, i);
+         applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
 	}
+
 #else
    SYSTEM_INFO sysinfo;
    GetSystemInfo(&sysinfo);
@@ -3677,21 +3597,20 @@ int main(int argc, char *argv[])
 #else
 	num_cpus = 1;
 #endif
-	if (num_cpus < 1)
-		num_cpus = 1;
 
-   if (!opt_n_threads)
-      opt_n_threads = num_cpus;
+   if ( num_cpus < 1 )    num_cpus = 1;
+
+   parse_cmdline( argc, argv );
 
    if ( opt_algo == ALGO_NULL )
    {
-      fprintf(stderr, "%s: no algo supplied\n", argv[0]);
+      fprintf( stderr, "%s: No algo parameter specified\n", argv[0] );
       show_usage_and_exit(1);
    }
 
    // need to register to get algo optimizations for cpu capabilities
-   // but that causes register logs before cpu capabilities is output.
-   // Would need to split register into 2 parts. First part sets algo
+   // but that causes registration logs before cpu capabilities is output.
+   // Would need to split register function into 2 parts. First part sets algo
    // optimizations but no logging, second part does any logging.   
    if ( !register_algo_gate( opt_algo, &algo_gate ) )  exit(1);
 
@@ -3735,9 +3654,6 @@ int main(int argc, char *argv[])
          return 1;
 	}
 
-   // All options must be set before starting the gate
-//   if ( !register_algo_gate( opt_algo, &algo_gate ) )  exit(1);
-
    if ( coinbase_address )
    {
       pk_script_size = address_to_script( pk_script, pk_buffer_size,
@@ -3749,8 +3665,6 @@ int main(int argc, char *argv[])
       }
    }
 
-//   if ( !check_cpu_capability() ) exit(1);
-
 	pthread_mutex_init( &stats_lock, NULL );
    pthread_rwlock_init( &g_work_lock, NULL );
 	pthread_mutex_init( &stratum.sock_lock, NULL );
@@ -3820,44 +3734,31 @@ int main(int argc, char *argv[])
 	}
 #endif
 
-// To be confirmed with more than 64 cpus
-   if ( opt_affinity != -1 )
-   {
-      if ( !affinity_uses_uint128 && num_cpus > 64 )
-      {
-          applog(LOG_WARNING,"Setting CPU affinity with more than 64 CPUs is only");
-          applog(LOG_WARNING,"available on Linux. Using default affinity.");
-          opt_affinity = -1;
-      }
-/*
-      else	
-      {
-         affine_to_cpu_mask( -1, opt_affinity );
-         if ( !opt_quiet )
-         {
-#if AFFINITY_USES_UINT128
-            if ( num_cpus > 64 )
-               applog(LOG_DEBUG, "Binding process to cpu mask %x",
-                      u128_hi64( opt_affinity ), u128_lo64( opt_affinity ) );
-            else 
-               applog(LOG_DEBUG, "Binding process to cpu mask %x",
-                      opt_affinity );
-#else
-               applog(LOG_DEBUG, "Binding process to cpu mask %x",
-                      opt_affinity );
-#endif
-         }
-      }
-*/
-   }
+   if ( ( opt_n_threads == 0 ) || ( opt_n_threads > num_cpus ) )
+      opt_n_threads = num_cpus;
 
-   if ( !opt_quiet && ( opt_n_threads < num_cpus ) )
+   if ( opt_affinity && num_cpus > max_cpus )
    {
-      char affinity_map[64];
-      format_affinity_map( affinity_map, opt_affinity );
-      applog( LOG_INFO, "CPU affinity [%s]", affinity_map );
+      applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
+                            max_cpus );
+      opt_affinity = 0ULL;
    }
    
+   if ( opt_affinity )
+   {
+      for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
+      {
+         while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++;   
+         thread_affinity_map[ thr ] = cpu % num_cpus;
+      }
+      if ( !opt_quiet )
+      {
+         char affinity_mask[64];
+         format_affinity_mask( affinity_mask, opt_affinity );
+         applog( LOG_INFO, "CPU affinity [%s]", affinity_mask );
+      }
+   }
+    
 #ifdef HAVE_SYSLOG_H
 	if (use_syslog)
 		openlog("cpuminer", LOG_PID, LOG_USER);
@@ -3955,7 +3856,7 @@ int main(int argc, char *argv[])
 			return 1;
 		}
       if ( !opt_quiet )
-         applog( LOG_INFO,"API listnening to %s:%d", opt_api_allow,
+         applog( LOG_INFO,"API listening to %s:%d", opt_api_allow,
                                                      opt_api_listen );
    }
 
diff --git a/winbuild-cross.sh b/winbuild-cross.sh
index 4953cec..71e4298 100755
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -16,13 +16,13 @@ export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
 export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
 # used by GCC
 export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
+# support for Windows CPU groups
+export DEFAULT_CFLAGS="-O3 -Wall -D_WIN32_WINNT=0x0601"
+#export DEFAULT_CFLAGS="-O3 -Wall"
 
 # make link to local gmp header file.
 ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
 
-# edit configure to fix pthread lib name for Windows.
-#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
-
 # make release directory and copy selected DLLs.
 
 rm -rf release > /dev/null
@@ -45,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
 ./clean-all.sh || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-O3 -march=icelake-client -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=icelake-client" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
@@ -53,8 +53,8 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
 # Rocketlake AVX512 SHA AES
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=cascadelake -msha -Wall" ./configure $CONFIGURE_ARGS
-#CFLAGS="-O3 -march=rocketlake -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=cascadelake -msha" ./configure $CONFIGURE_ARGS
+#CFLAGS="$DEFAULT_CFLAGS -march=rocketlake" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512-sha.exe
@@ -62,7 +62,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha.exe
 # Zen1 AVX2 AES SHA
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-zen.exe
@@ -70,8 +70,8 @@ mv cpuminer.exe release/cpuminer-zen.exe
 # Zen3 AVX2 SHA VAES
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=znver2 -mvaes -Wall" ./configure $CONFIGURE_ARGS
-# CFLAGS="-O3 -march=znver3 -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=znver2 -mvaes" ./configure $CONFIGURE_ARGS
+# CFLAGS="$DEFAULT_CFLAGS -march=znver3" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-zen3.exe
@@ -80,7 +80,7 @@ mv cpuminer.exe release/cpuminer-zen3.exe
 # mingw won't compile avx512 without -fno-asynchronous-unwind-tables
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS
 #CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-asynchronous-unwind-tables" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
@@ -90,7 +90,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
 make clean || echo clean
 rm -f config.status
 # GCC 9 doesn't include AES in -march=core-avx2
-CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=core-avx2 -maes" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2.exe
@@ -99,7 +99,7 @@ mv cpuminer.exe release/cpuminer-avx2.exe
 make clean || echo clean
 rm -f config.status
 # -march=corei7-avx still includes aes, but just in case
-CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure $CONFIGURE_ARGS 
+CFLAGS="$DEFAULT_CFLAGS -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS 
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx.exe
@@ -107,7 +107,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
 # Westmere SSE4.2 AES
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=westmere -maes -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=westmere -maes" ./configure $CONFIGURE_ARGS
 #CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
@@ -116,7 +116,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
 # Nehalem SSE4.2
 #make clean || echo clean
 #rm -f config.status
-#CFLAGS="-O3 -march=corei7 -Wall" ./configure $CONFIGURE_ARGS
+#CFLAGS="$DEFAULT_CFLAGS -march=corei7" ./configure $CONFIGURE_ARGS
 #make 
 #strip -s cpuminer.exe
 #mv cpuminer.exe release/cpuminer-sse42.exe
@@ -124,7 +124,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
 # Core2 SSSE3
 #make clean || echo clean
 #rm -f config.status
-#CFLAGS="-O3 -march=core2 -Wall" ./configure $CONFIGURE_ARGS
+#CFLAGS="$DEFAULT_CFLAGS -march=core2" ./configure $CONFIGURE_ARGS
 #make 
 #strip -s cpuminer.exe
 #mv cpuminer.exe release/cpuminer-ssse3.exe
@@ -133,7 +133,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
 # Generic SSE2
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -msse2" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-sse2.exe