Compare commits

...

1 Commit

Author SHA1 Message Date
Jay D Dee
1a234cbe53 v3.18.2 2021-10-19 22:35:36 -04:00
18 changed files with 474 additions and 189 deletions

View File

@@ -171,6 +171,7 @@ cpuminer_SOURCES = \
algo/sha/hmac-sha256-hash-4way.c \ algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \ algo/sha/sha256d.c \
algo/sha/sha2.c \ algo/sha/sha2.c \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \ algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \ algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \ algo/sha/sha256t.c \

View File

@@ -65,6 +65,20 @@ If not what makes it happen or not happen?
Change Log Change Log
---------- ----------
v3.18.2
Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
AVX512 for sha256d.
SSE42 and AVX may now be displayed as mining features at startup.
This is hard coded for each algo, and is only implemented for scrypt
at this time as it is the only algo with significant performance differences
with those features.
Fixed an issue where a high hashrate algo could cause excessive invalid hash
rate log reports when starting up in benchmark mode.
v3.18.1 v3.18.1
More speed for scrypt: More speed for scrypt:

View File

@@ -337,42 +337,42 @@ do{ \
XC2 = XOR( XC2, TC ); \ XC2 = XOR( XC2, TC ); \
\ \
TA = ADD32( XA2, XA1 ); \ TA = ADD32( XA2, XA1 ); \
XA1 = ROL_1X32( XA1 ); \
TB = ADD32( XB2, XB1 ); \ TB = ADD32( XB2, XB1 ); \
TC = ADD32( XC2, XC1 ); \ TC = ADD32( XC2, XC1 ); \
TA = ROL32( TA, 13 ); \
XA1 = ROL_1X32( XA1 ); \
XB1 = ROL_1X32( XB1 ); \ XB1 = ROL_1X32( XB1 ); \
XC1 = ROL_1X32( XC1 ); \ TA = ROL32( TA, 13 ); \
XA3 = XOR( XA3, TA ); \ XA3 = XOR( XA3, TA ); \
XC1 = ROL_1X32( XC1 ); \
TB = ROL32( TB, 13 ); \ TB = ROL32( TB, 13 ); \
XB3 = XOR( XB3, TB ); \ XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 13 ); \ TC = ROL32( TC, 13 ); \
XC3 = XOR( XC3, TC ); \ XC3 = XOR( XC3, TC ); \
\ \
TA = ADD32( XA3, XA2 ); \ TA = ADD32( XA3, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB3, XB2 ); \ TB = ADD32( XB3, XB2 ); \
TC = ADD32( XC3, XC2 ); \ TC = ADD32( XC3, XC2 ); \
TA = ROL32( TA, 18 ); \ TA = ROL32( TA, 18 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \ XB2 = SWAP_64( XB2 ); \
XC2 = SWAP_64( XC2 ); \
XA0 = XOR( XA0, TA ); \ XA0 = XOR( XA0, TA ); \
TB = ROL32( TB, 18 ); \ TB = ROL32( TB, 18 ); \
XB0 = XOR( XB0, TB ); \ XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
TC = ROL32( TC, 18 ); \ TC = ROL32( TC, 18 ); \
XC0 = XOR( XC0, TC ); \ XC0 = XOR( XC0, TC ); \
\ \
TA = ADD32( XA0, XA1 ); \ TA = ADD32( XA0, XA1 ); \
XA3 = ROR_1X32( XA3 ); \
TB = ADD32( XB0, XB1 ); \ TB = ADD32( XB0, XB1 ); \
TC = ADD32( XC0, XC1 ); \ TC = ADD32( XC0, XC1 ); \
TA = ROL32( TA, 7 ); \ TA = ROL32( TA, 7 ); \
XA3 = ROR_1X32( XA3 ); \ XB3 = ROR_1X32( XB3 ); \
XA3 = XOR( XA3, TA ); \ XA3 = XOR( XA3, TA ); \
TB = ROL32( TB, 7 ); \ TB = ROL32( TB, 7 ); \
XB3 = ROR_1X32( XB3 ); \ XC3 = ROR_1X32( XC3 ); \
XB3 = XOR( XB3, TB ); \ XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 7 ); \ TC = ROL32( TC, 7 ); \
XC3 = ROR_1X32( XC3 ); \
XC3 = XOR( XC3, TC ); \ XC3 = XOR( XC3, TC ); \
\ \
TA = ADD32( XA3, XA0 ); \ TA = ADD32( XA3, XA0 ); \
@@ -399,24 +399,24 @@ do{ \
XC1 = XOR( XC1, TC ); \ XC1 = XOR( XC1, TC ); \
\ \
TA = ADD32( XA1, XA2 ); \ TA = ADD32( XA1, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB1, XB2 ); \ TB = ADD32( XB1, XB2 ); \
XB2 = SWAP_64( XB2 ); \
TA = ROL32( TA, 18); \ TA = ROL32( TA, 18); \
TC = ADD32( XC1, XC2 ); \ TC = ADD32( XC1, XC2 ); \
XA2 = SWAP_64( XA2 ); \ XC2 = SWAP_64( XC2 ); \
TB = ROL32( TB, 18); \ TB = ROL32( TB, 18); \
XA0 = XOR( XA0, TA ); \ XA0 = XOR( XA0, TA ); \
XB2 = SWAP_64( XB2 ); \ XA1 = ROR_1X32( XA1 ); \
TC = ROL32( TC, 18); \ TC = ROL32( TC, 18); \
XB0 = XOR( XB0, TB ); \ XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
XA1 = ROR_1X32( XA1 ); \
XB1 = ROR_1X32( XB1 ); \ XB1 = ROR_1X32( XB1 ); \
XC0 = XOR( XC0, TC ); \ XC0 = XOR( XC0, TC ); \
XC1 = ROR_1X32( XC1 ); \ XC1 = ROR_1X32( XC1 ); \
} while (0); } while (0);
// slow rol, an attempt to optimze non-avx512 bit rotations // slow rot, an attempt to optimze non-avx512 bit rotations
// Contains target specific instructions, only for use with 128 bit vectors // Contains target specific instructions, only for use with 128 bit vectors
#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \ #define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \
do{ \ do{ \

View File

@@ -28,7 +28,6 @@
*/ */
#include "algo-gate-api.h" #include "algo-gate-api.h"
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <inttypes.h> #include <inttypes.h>
@@ -55,11 +54,25 @@ static const uint32_t sha256_initial_state[8] =
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
}; };
static int scrypt_throughput = 0; #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SCRYPT_THROUGHPUT 16
#elif defined(__AVX2__)
#define SCRYPT_THROUGHPUT 8
#else
#define SCRYPT_THROUGHPUT 4
#endif
// static int scrypt_throughput = 0;
static int scratchbuf_size = 0; static int scratchbuf_size = 0;
static __thread char *scratchbuf = NULL; static __thread uint32_t *scratchbuf = NULL;
// change this to a constant to be used directly as input state arg // change this to a constant to be used directly as input state arg
// vectors still need an init function. // vectors still need an init function.
@@ -709,15 +722,11 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
#endif // AVX512 #endif // AVX512
//#if defined(USE_ASM) && defined(__x86_64__)
#define SCRYPT_MAX_WAYS 12 #define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1 #define HAVE_SCRYPT_3WAY 1
//int scrypt_best_throughput();
void scrypt_core(uint32_t *X, uint32_t *V, int N); void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N); void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
//#if defined(USE_AVX2)
#if defined(__AVX2__) #if defined(__AVX2__)
#undef SCRYPT_MAX_WAYS #undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24 #define SCRYPT_MAX_WAYS 24
@@ -727,40 +736,39 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#ifndef SCRYPT_MAX_WAYS #ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1 #define SCRYPT_MAX_WAYS 1
//#define scrypt_best_throughput() 1
#endif #endif
#include "scrypt-core-4way.h" #include "scrypt-core-4way.h"
static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output, /*
uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id ) static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thr_id )
{ {
uint32_t tstate[8], ostate[8]; uint32_t tstate[8], ostate[8];
uint32_t X[32]; uint32_t X[32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy(tstate, midstate, 32); memcpy(tstate, midstate, 32);
HMAC_SHA256_80_init(input, tstate, ostate); HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X); PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core_simd128( X, V, N ); // woring scrypt_core_simd128( X, scratchbuf, N ); // woring
// scrypt_core_1way( X, V, N ); // working // scrypt_core_1way( X, V, N ); // working
// scrypt_core(X, V, N); // scrypt_core(X, V, N);
PBKDF2_SHA256_128_32(tstate, ostate, X, output); PBKDF2_SHA256_128_32(tstate, ostate, X, output);
return true; return true;
} }
*/
#if defined(__AVX2__) #if ( SCRYPT_THROUGHPUT == 8 )
static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) uint32_t *midstate, int N, int thrid )
{ {
uint32_t _ALIGN(128) tstate[ 8*8 ]; uint32_t _ALIGN(128) tstate[ 8*8 ];
uint32_t _ALIGN(128) ostate[ 8*8 ]; uint32_t _ALIGN(128) ostate[ 8*8 ];
uint32_t _ALIGN(128) W[ 8*32 ]; uint32_t _ALIGN(128) W[ 8*32 ];
uint32_t _ALIGN(128) X[ 8*32 ]; uint32_t _ALIGN(128) X[ 8*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60, intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60,
input+80, input+100, input+120, input+140, 640 ); input+80, input+100, input+120, input+140, 640 );
@@ -774,11 +782,11 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
if ( opt_param_n > 0x4000 ) if ( opt_param_n > 0x4000 )
{ {
scrypt_core_simd128_3buf( X, V, N ); scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N ); scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N ); scrypt_core_simd128_2buf( X+192, scratchbuf, N );
} }
else else
{ {
@@ -786,13 +794,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
intrlv_2x128( W+128, X+128, X+160, 1024 ); intrlv_2x128( W+128, X+128, X+160, 1024 );
intrlv_2x128( W+192, X+192, X+224, 1024 ); intrlv_2x128( W+192, X+192, X+224, 1024 );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)scratchbuf, N );
dintrlv_2x128( X, X+ 32, W, 1024 ); dintrlv_2x128( X, X+ 32, W, 1024 );
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
dintrlv_2x128( X+128, X+160, W+128, 1024 ); dintrlv_2x128( X+128, X+160, W+128, 1024 );
@@ -928,16 +936,15 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
#endif // AVX2 #endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if ( SCRYPT_THROUGHPUT == 16 )
static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) uint32_t *midstate, int N, int thrid )
{ {
uint32_t _ALIGN(128) tstate[ 16*8 ]; uint32_t _ALIGN(128) tstate[ 16*8 ];
uint32_t _ALIGN(128) ostate[ 16*8 ]; uint32_t _ALIGN(128) ostate[ 16*8 ];
uint32_t _ALIGN(128) W[ 16*32 ]; uint32_t _ALIGN(128) W[ 16*32 ];
uint32_t _ALIGN(128) X[ 16*32 ]; uint32_t _ALIGN(128) X[ 16*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60, intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60,
input+ 80, input+100, input+120, input+140, input+ 80, input+100, input+120, input+140,
@@ -956,17 +963,17 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
if ( opt_param_n > 0x4000 ) if ( opt_param_n > 0x4000 )
{ {
scrypt_core_simd128_3buf( X, V, N ); scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N ); scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N ); scrypt_core_simd128_2buf( X+192, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+256, V, N ); scrypt_core_simd128_3buf( X+256, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+352, V, N ); scrypt_core_simd128_3buf( X+352, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N ); scrypt_core_simd128_2buf( X+448, scratchbuf, N );
} }
else else
{ {
@@ -974,13 +981,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 ); intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 );
intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 ); intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 );
scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)V, N ); scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N ); scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)V, N ); scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)scratchbuf, N );
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 ); dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 );
@@ -1236,15 +1243,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
#endif // AVX512 #endif // AVX512
#if defined(__SHA__) #if 0
static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) uint32_t *midstate, int N, int thrid )
{ {
uint32_t _ALIGN(128) tstate[ 2*8 ]; uint32_t _ALIGN(128) tstate[ 2*8 ];
uint32_t _ALIGN(128) ostate[ 2*8 ]; uint32_t _ALIGN(128) ostate[ 2*8 ];
uint32_t _ALIGN(128) W[ 2*32 ]; uint32_t _ALIGN(128) W[ 2*32 ];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 ); memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 ); memcpy( tstate+ 8, midstate, 32 );
@@ -1254,7 +1259,7 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
input, input+20, W, W+32 ); input, input+20, W, W+32 );
scrypt_core_simd128_2buf( W, V, N ); scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32, PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32,
@@ -1264,12 +1269,11 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
} }
static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) uint32_t *midstate, int N, int thrid )
{ {
uint32_t _ALIGN(128) tstate[4 * 8]; uint32_t _ALIGN(128) tstate[4 * 8];
uint32_t _ALIGN(128) ostate[4 * 8]; uint32_t _ALIGN(128) ostate[4 * 8];
uint32_t _ALIGN(128) W[4 * 32]; uint32_t _ALIGN(128) W[4 * 32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 ); memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 ); memcpy( tstate+ 8, midstate, 32 );
@@ -1300,9 +1304,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
*/ */
// working, double buffered linear simd // working, double buffered linear simd
scrypt_core_simd128_2buf( W, V, N ); scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( W+64, V, N ); scrypt_core_simd128_2buf( W+64, scratchbuf, N );
/* /*
scrypt_core_simd128_3buf( W, V, N ); scrypt_core_simd128_3buf( W, V, N );
@@ -1323,17 +1327,15 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
return 1; return 1;
} }
#endif
#else #if ( SCRYPT_THROUGHPUT == 4 )
#ifdef HAVE_SHA256_4WAY
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) uint32_t *midstate, int N, int thrid )
{ {
uint32_t _ALIGN(128) tstate[ 4*8 ]; uint32_t _ALIGN(128) tstate[ 4*8 ];
uint32_t _ALIGN(128) ostate[ 4*8 ]; uint32_t _ALIGN(128) ostate[ 4*8 ];
uint32_t _ALIGN(128) W[ 4*32 ]; uint32_t _ALIGN(128) W[ 4*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); intrlv_4x32( W, input, input+20, input+40, input+60, 640 );
for ( int i = 0; i < 8; i++ ) for ( int i = 0; i < 8; i++ )
@@ -1346,13 +1348,13 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
{ {
uint32_t _ALIGN(128) X[ 4*32 ]; uint32_t _ALIGN(128) X[ 4*32 ];
dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
scrypt_core_simd128_2buf( X, V, N ); scrypt_core_simd128_2buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+64, V, N ); scrypt_core_simd128_2buf( X+64, scratchbuf, N );
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
} }
else else
scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N );
@@ -1398,65 +1400,73 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
return 1; return 1;
} }
#endif /* HAVE_SHA256_4WAY */ #endif // SCRYPT_THROUGHPUT == 4
#endif // SHA //#endif // SHA
extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ) uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t _ALIGN(64) hash[ 8*SCRYPT_THROUGHPUT ];
uint32_t _ALIGN(64) data[ 20*SCRYPT_THROUGHPUT ];
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8]; uint32_t midstate[8];
uint32_t n = pdata[19] - 1; uint32_t n = pdata[19] - 1;
int thr_id = mythr->id; int thr_id = mythr->id;
int throughput = scrypt_throughput;
int i; int i;
volatile uint8_t *restart = &(work_restart[thr_id].restart); volatile uint8_t *restart = &(work_restart[thr_id].restart);
for ( i = 0; i < throughput; i++ ) for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
memcpy( data + i * 20, pdata, 80 ); memcpy( data + i * 20, pdata, 80 );
sha256_transform_le( midstate, data, sha256_initial_state ); sha256_transform_le( midstate, data, sha256_initial_state );
do { do {
bool rc = true; bool rc = true;
for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) data[ i*20 + 19 ] = ++n;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) //#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
if ( throughput == 16 ) #if ( SCRYPT_THROUGHPUT == 16 )
rc = scrypt_N_1_1_256_16way( data, hash, midstate, scratchbuf, // if ( SCRYPT_THROUGHPUT == 16 )
opt_param_n, thr_id ); rc = scrypt_N_1_1_256_16way( data, hash, midstate, opt_param_n,
else thr_id );
#endif // else
#if defined(__AVX2__) //#endif
if ( throughput == 8 ) //#if defined(__AVX2__)
rc = scrypt_N_1_1_256_8way( data, hash, midstate, scratchbuf, #elif ( SCRYPT_THROUGHPUT == 8 )
opt_param_n, thr_id ); // if ( SCRYPT_THROUGHPUT == 8 )
else rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
#endif thr_id );
if ( throughput == 4 ) // slower on Ryzen than 8way // else
#if defined(__SHA__) //#endif
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf, #elif ( SCRYPT_THROUGHPUT == 4 )
opt_param_n, thr_id ); // if ( SCRYPT_THROUGHPUT == 4 ) // slower on Ryzen than 8way
//#if defined(__SHA__)
// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
// thr_id );
//#else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
thr_id );
#else #else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id ); #error "Invalid SCRYPT_THROUGHPUT"
#endif #endif
/*
#if defined(__SHA__) #if defined(__SHA__)
else else
if (throughput == 2 ) // slower on Ryzen than 4way_sha & 8way if ( SCRYPT_THROUGHPUT == 2 ) // slower on Ryzen than 4way_sha & 8way
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, scratchbuf, rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
opt_param_n, thr_id ); thr_id );
#endif #endif
else // should never get here else // should never get here
rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf, rc = scrypt_N_1_1_256( data, hash, midstate, opt_param_n, thr_id );
opt_param_n, thr_id ); */
// test the hash // test the hash
if ( rc ) if ( rc )
for ( i = 0; i < throughput; i++ ) for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
{ {
if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) ) if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) )
{ {
@@ -1468,7 +1478,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
} }
} while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) ); } while ( likely( ( n < ( max_nonce - SCRYPT_THROUGHPUT ) ) && !(*restart) ) );
*hashes_done = n - pdata[19]; *hashes_done = n - pdata[19];
pdata[19] = n; pdata[19] = n;
@@ -1489,7 +1499,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
//#if defined(__SHA__) //#if defined(__SHA__)
// gate->optimizations = SSE2_OPT | SHA_OPT; // gate->optimizations = SSE2_OPT | SHA_OPT;
//#else //#else
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT;
//#endif //#endif
gate->miner_thread_init =(void*)&scrypt_miner_thread_init; gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt; gate->scanhash = (void*)&scanhash_scrypt;
@@ -1497,8 +1507,11 @@ bool register_scrypt_algo( algo_gate_t* gate )
opt_param_n = opt_param_n ? opt_param_n : 1024; opt_param_n = opt_param_n ? opt_param_n : 1024;
applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n ); applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n );
// scrypt_throughput can be defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
scrypt_throughput = 16; // scrypt_throughput = 16;
if ( opt_param_n > 0x4000 ) if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else else
@@ -1511,13 +1524,13 @@ bool register_scrypt_algo( algo_gate_t* gate )
*/ */
#elif defined(__AVX2__) #elif defined(__AVX2__)
scrypt_throughput = 8; // scrypt_throughput = 8;
if ( opt_param_n > 0x4000 ) if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way scratchbuf_size = opt_param_n * 2 * 128; // 2 way
#else #else
scrypt_throughput = 4; // scrypt_throughput = 4;
if ( opt_param_n > 0x4000 ) if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else else
@@ -1533,7 +1546,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
format_number_si( &d_size, d_units ); format_number_si( &d_size, d_units );
applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n", applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n",
scrypt_throughput, t_size, t_units, d_size, d_units ); SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units );
return true; return true;
}; };

View File

@@ -84,6 +84,11 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in ); const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid );
#endif // AVX2 #endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

View File

@@ -8,7 +8,7 @@
* any later version. See COPYING for more details. * any later version. See COPYING for more details.
*/ */
#include "algo-gate-api.h" #include "sha256d-4way.h"
#include <string.h> #include <string.h>
#include <inttypes.h> #include <inttypes.h>
@@ -181,6 +181,8 @@ static const uint32_t sha256d_hash1[16] = {
}; };
// this performs the entire hash all over again, why? // this performs the entire hash all over again, why?
// because main function only does 56 rounds.
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{ {
uint32_t S[16]; uint32_t S[16];
@@ -492,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
void sha256d_ms_4way(uint32_t *hash, uint32_t *data, void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash); const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_4way( struct work *work, static inline int scanhash_sha256d_4way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
@@ -553,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work,
void sha256d_ms_8way(uint32_t *hash, uint32_t *data, void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash); const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_8way( struct work *work, static inline int scanhash_sha256d_8way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
@@ -609,7 +611,7 @@ static inline int scanhash_sha256d_8way( struct work *work,
#endif /* HAVE_SHA256_8WAY */ #endif /* HAVE_SHA256_8WAY */
int scanhash_sha256d( struct work *work, int scanhash_sha256d_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
@@ -625,11 +627,11 @@ int scanhash_sha256d( struct work *work,
#ifdef HAVE_SHA256_8WAY #ifdef HAVE_SHA256_8WAY
if (sha256_use_8way()) if (sha256_use_8way())
return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr ); return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
#endif #endif
#ifdef HAVE_SHA256_4WAY #ifdef HAVE_SHA256_4WAY
if (sha256_use_4way()) if (sha256_use_4way())
return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr ); return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
#endif #endif
memcpy(data, pdata + 16, 64); memcpy(data, pdata + 16, 64);
@@ -690,9 +692,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
bool register_sha256d_algo( algo_gate_t* gate ) bool register_sha256d_algo( algo_gate_t* gate )
{ {
gate->optimizations = SSE2_OPT | AVX2_OPT; gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_sha256d; #if defined(SHA256D_16WAY)
// gate->hash = (void*)&sha256d; gate->scanhash = (void*)&scanhash_sha256d_16way;
#else
gate->scanhash = (void*)&scanhash_sha256d_pooler;
#endif
// gate->hash = (void*)&sha256d;
return true; return true;
}; };

View File

@@ -548,6 +548,136 @@ void sha256_8way_init( sha256_8way_context *sc )
sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
} }
// Aggresive prehashing, LE byte order
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H;
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in + 1 );
C = _mm256_load_si256( state_in + 2 );
D = _mm256_load_si256( state_in + 3 );
E = _mm256_load_si256( state_in + 4 );
F = _mm256_load_si256( state_in + 5 );
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
_mm256_store_si256( state_mid , A );
_mm256_store_si256( state_mid + 1, B );
_mm256_store_si256( state_mid + 2, C );
_mm256_store_si256( state_mid + 3, D );
_mm256_store_si256( state_mid + 4, E );
_mm256_store_si256( state_mid + 5, F );
_mm256_store_si256( state_mid + 6, G );
_mm256_store_si256( state_mid + 7, H );
}
// Completes a SHA-256 transform whose first 3 rounds were already done by
// sha256_8way_prehash_3rounds(): runs rounds 3..63 and adds the original
// chaining value (feed-forward), producing the full transform result.
//   state_out : out, final 8-word state (one transform's output)
//   data      : the 16 message words of the block, LE order
//   state_in  : the chaining value BEFORE this block (for the feed-forward)
//   state_mid : working variables after 3 rounds, from the prehash
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid )
{
__m256i A, B, C, D, E, F, G, H;
__m256i W[16];
// Local copy of the message schedule; expanded in place below.
memcpy_256( W, data, 16 );
// Resume from the 3-round midstate rather than state_in.
A = _mm256_load_si256( state_mid );
B = _mm256_load_si256( state_mid + 1 );
C = _mm256_load_si256( state_mid + 2 );
D = _mm256_load_si256( state_mid + 3 );
E = _mm256_load_si256( state_mid + 4 );
F = _mm256_load_si256( state_mid + 5 );
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
// Rounds 0..2 were already performed by the prehash; kept for reference.
// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
#if !defined(__AVX512VL__)
// Re-seed the cached xor for round 3, whose MAJ operands are (F,G,H):
// presumably Y_xor_Z must hold G^H here -- confirm against the macro.
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H );
#endif
// Remaining rounds 3..15 use the raw message words directly.
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
// Rounds 16..63 in batches of 16: expand the message schedule in place
// (SHA2x_MEXP, defined elsewhere) then run 16 more rounds per iteration.
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x_MEXP( 13, 8, 0, 15 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
// Feed-forward: add the pre-block chaining value (state_in, not the
// 3-round midstate) to the working variables, per the SHA-256 spec.
A = _mm256_add_epi32( A, _mm256_load_si256( state_in ) );
B = _mm256_add_epi32( B, _mm256_load_si256( state_in + 1 ) );
C = _mm256_add_epi32( C, _mm256_load_si256( state_in + 2 ) );
D = _mm256_add_epi32( D, _mm256_load_si256( state_in + 3 ) );
E = _mm256_add_epi32( E, _mm256_load_si256( state_in + 4 ) );
F = _mm256_add_epi32( F, _mm256_load_si256( state_in + 5 ) );
G = _mm256_add_epi32( G, _mm256_load_si256( state_in + 6 ) );
H = _mm256_add_epi32( H, _mm256_load_si256( state_in + 7 ) );
_mm256_store_si256( state_out , A );
_mm256_store_si256( state_out + 1, B );
_mm256_store_si256( state_out + 2, C );
_mm256_store_si256( state_out + 3, D );
_mm256_store_si256( state_out + 4, E );
_mm256_store_si256( state_out + 5, F );
_mm256_store_si256( state_out + 6, G );
_mm256_store_si256( state_out + 7, H );
}
// need to handle odd byte length for yespower. // need to handle odd byte length for yespower.
// Assume only last update is odd. // Assume only last update is odd.

View File

@@ -53,4 +53,8 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
#define sha256_transform_be sph_sha256_transform_be #define sha256_transform_be sph_sha256_transform_be
#endif #endif
// SHA can't do only 3 rounds
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif #endif

View File

@@ -1,4 +1,4 @@
#include "sha256t-gate.h" #include "sha256d-4way.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
@@ -13,7 +13,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64))); __m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32))); __m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32))); __m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32))); __m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32))); __m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32))); __m512i vdata[20] __attribute__ ((aligned (32)));
@@ -46,11 +46,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data sha256_16way_transform_le( midstate1, vdata, initstate );
sha256_16way_transform_le( midstate, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block // Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do do
{ {
@@ -59,7 +58,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte; block[ 4] = last_byte;
memset_zero_512( block + 5, 10 ); memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1. // 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 ); memcpy_512( block, hash32, 8 );
@@ -99,7 +98,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64))); __m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32))); __m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32))); __m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32))); __m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32))); __m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -116,7 +116,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const __m256i eight = m256_const1_32( 8 ); const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ ) for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] ); vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -130,8 +130,10 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data sha256_8way_transform_le( midstate1, vdata, initstate );
sha256_8way_transform_le( midstate, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do do
{ {
@@ -140,7 +142,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte; block[ 4] = last_byte;
memset_zero_256( block + 5, 10 ); memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate ); sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1. // 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 ); memcpy_256( block, hash32, 8 );
@@ -253,3 +255,20 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
#endif #endif
/*
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;
#elif defined(SHA256D_4WAY)
gate->scanhash = (void*)&scanhash_sha256d_4way;
#endif
// gate->hash = (void*)&sha256d;
return true;
};
*/

48
algo/sha/sha256d-4way.h Normal file
View File

@@ -0,0 +1,48 @@
#ifndef __SHA256D_4WAY_H__
#define __SHA256D_4WAY_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256D_16WAY 1
/*
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#define SHA256D_4WAY 1
*/
#endif
bool register_sha256d_algo( algo_gate_t* gate );
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
/*
#if defined(SHA256D_8WAY)
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
/*
#if defined(__SHA__)
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
#endif

View File

@@ -13,7 +13,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64))); __m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32))); __m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32))); __m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32))); __m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32))); __m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32))); __m512i vdata[20] __attribute__ ((aligned (32)));
@@ -31,7 +31,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const __m512i sixteen = m512_const1_32( 16 ); const __m512i sixteen = m512_const1_32( 16 );
for ( int i = 0; i < 19; i++ ) for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] ); vdata[i] = m512_const1_32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -46,11 +46,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data sha256_16way_transform_le( midstate1, vdata, initstate );
sha256_16way_transform_le( midstate, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block // Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate ); sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do do
{ {
@@ -59,7 +58,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte; block[ 4] = last_byte;
memset_zero_512( block + 5, 10 ); memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 ); sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1. // 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 ); memcpy_512( block, hash32, 8 );
@@ -104,7 +103,8 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64))); __m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32))); __m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32))); __m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32))); __m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32))); __m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -121,7 +121,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
const __m256i eight = m256_const1_32( 8 ); const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ ) for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] ); vdata[i] = m256_const1_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
@@ -135,8 +135,10 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data sha256_8way_transform_le( midstate1, vdata, initstate );
sha256_8way_transform_le( midstate, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do do
{ {
@@ -145,7 +147,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte; block[ 4] = last_byte;
memset_zero_256( block + 5, 10 ); memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate ); sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1. // 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 ); memcpy_256( block, hash32, 8 );

View File

@@ -702,6 +702,36 @@ memcpy( state_out, state_in, 32 );
} }
// Scalar counterpart of the vectorized prehash: performs only the first 3
// SHA-256 rounds (round constants K[0..2]) so the partial state can be
// cached and reused when only message words from index 3 onward change.
//   state_out : out, full 8-word state with A..H advanced by 3 rounds
//   data      : message words; only data[0..2] are consumed
//   state_in  : input chaining value (copied to state_out first)
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
// Y_xor_Z caches B^C for the MAJ macro.  X_xor_Y is presumably assigned
// inside MAJ as a side effect (common cpuminer-opt idiom) -- otherwise the
// `Y_xor_Z = X_xor_Y` below would read an uninitialized value.
// NOTE(review): MAJ's definition is not visible here; confirm.
uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2];
memcpy( state_out, state_in, 32 );
// Round 0: K[0] = 0x428A2F98.
t1 = state_out[7] + BSG2_1( state_out[4] )
+ CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0];
t2 = BSG2_0( state_out[0] )
+ MAJ( state_out[0], state_out[1], state_out[2] );
Y_xor_Z = X_xor_Y;
state_out[3] += t1;
state_out[7] = t1 + t2;
// Round 1: K[1] = 0x71374491 (working variables rotated by one).
t1 = state_out[6] + BSG2_1( state_out[3] )
+ CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1];
t2 = BSG2_0( state_out[7] )
+ MAJ( state_out[7], state_out[0], state_out[1] );
Y_xor_Z = X_xor_Y;
state_out[2] += t1;
state_out[6] = t1 + t2;
// Round 2: K[2] = 0xB5C0FBCF.
t1 = state_out[5] + BSG2_1( state_out[2] )
+ CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2];
t2 = BSG2_0( state_out[6] )
+ MAJ( state_out[6], state_out[7], state_out[0] );
state_out[1] += t1;
state_out[5] = t1 + t2;
}
/* see sph_sha2.h */ /* see sph_sha2.h */
void void
sph_sha224_init(void *cc) sph_sha224_init(void *cc)

View File

@@ -215,6 +215,9 @@ void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data, void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in ); const uint32_t *state_in );
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if SPH_64 #if SPH_64

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.1. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.2.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.18.1' PACKAGE_VERSION='3.18.2'
PACKAGE_STRING='cpuminer-opt 3.18.1' PACKAGE_STRING='cpuminer-opt 3.18.2'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.18.1 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.18.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.18.1:";; short | recursive ) echo "Configuration of cpuminer-opt 3.18.2:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.18.1 cpuminer-opt configure 3.18.2
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.18.1, which was It was created by cpuminer-opt $as_me 3.18.2, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.18.1' VERSION='3.18.2'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.18.1, which was This file was extended by cpuminer-opt $as_me 3.18.2, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.18.1 cpuminer-opt config.status 3.18.2
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.18.1]) AC_INIT([cpuminer-opt], [3.18.2])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -1112,19 +1112,17 @@ void report_summary_log( bool force )
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url ); applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str ); applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min", applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
submit_rate, (double)submitted_share_count*60. / submit_rate, safe_div( (double)submitted_share_count*60.,
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) ); ( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)", applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)",
shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units ); shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );
if ( accepted_share_count < submitted_share_count ) if ( accepted_share_count < submitted_share_count )
{ {
double lost_ghrate = uptime.tv_sec == 0 ? 0. double lost_ghrate = safe_div( target_diff
: target_diff * (double)(submitted_share_count - accepted_share_count ),
* (double)(submitted_share_count - accepted_share_count ) (double)uptime.tv_sec, 0. );
/ (double)uptime.tv_sec; double lost_shrate = safe_div( target_diff * (double)(submits - accepts ), share_time, 0. );
double lost_shrate = share_time == 0. ? 0.
: target_diff * (double)(submits - accepts ) / share_time;
char lshr_units[4] = {0}; char lshr_units[4] = {0};
char lghr_units[4] = {0}; char lghr_units[4] = {0};
scale_hash_for_display( &lost_shrate, lshr_units ); scale_hash_for_display( &lost_shrate, lshr_units );
@@ -2495,18 +2493,21 @@ static void *miner_thread( void *userdata )
timeval_subtract( &uptime, &total_hashes_time, &session_start ); timeval_subtract( &uptime, &total_hashes_time, &session_start );
double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. ); double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. );
scale_hash_for_display( &hashrate, hr_units ); if ( hashrate > 0. )
sprintf( hr, "%.2f", hashrate ); {
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32)) #if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units ); applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
#else #else
float lo_freq = 0., hi_freq = 0.; float lo_freq = 0., hi_freq = 0.;
linux_cpu_hilo_freq( &lo_freq, &hi_freq ); linux_cpu_hilo_freq( &lo_freq, &hi_freq );
applog( LOG_NOTICE, applog( LOG_NOTICE,
"Total: %s %sH/s, Temp: %dC, Freq: %.3f/%.3f GHz", "Total: %s %sH/s, Temp: %dC, Freq: %.3f/%.3f GHz",
hr, hr_units, (uint32_t)cpu_temp(0), lo_freq / 1e6, hr, hr_units, (uint32_t)cpu_temp(0), lo_freq / 1e6,
hi_freq / 1e6 ); hi_freq / 1e6 );
#endif #endif
}
} }
} // benchmark } // benchmark
@@ -2900,6 +2901,7 @@ static bool cpu_capability( bool display_only )
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features ); bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features ); bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features ); bool algo_has_sha = set_incl( SHA_OPT, algo_features );
@@ -2907,6 +2909,8 @@ static bool cpu_capability( bool display_only )
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features ); bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
bool use_aes; bool use_aes;
bool use_sse2; bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2; bool use_avx2;
bool use_avx512; bool use_avx512;
bool use_sha; bool use_sha;
@@ -2976,18 +2980,21 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_aes ) printf( " AES" ); else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha ) printf( " SHA" ); if ( sw_has_sha ) printf( " SHA" );
printf("\nAlgo features:"); if ( !display_only )
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{ {
if ( algo_has_avx512 ) printf( " AVX512" ); printf("\nAlgo features:");
else if ( algo_has_avx2 ) printf( " AVX2 " ); if ( algo_features == EMPTY_SET ) printf( " None" );
else if ( algo_has_sse42 ) printf( " SSE4.2" ); else
else if ( algo_has_sse2 ) printf( " SSE2 " ); {
if ( algo_has_vaes || if ( algo_has_avx512 ) printf( " AVX512" );
algo_has_vaes256 ) printf( " VAES" ); else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_aes ) printf( " AES" ); else if ( algo_has_sse42 ) printf( " SSE4.2" );
if ( algo_has_sha ) printf( " SHA" ); else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ||
algo_has_vaes256 ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
}
} }
printf("\n"); printf("\n");
@@ -3022,6 +3029,8 @@ static bool cpu_capability( bool display_only )
// Determine mining options // Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2; use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
@@ -3038,6 +3047,8 @@ static bool cpu_capability( bool display_only )
{ {
if ( use_avx512 ) printf( " AVX512" ); if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" ); else if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
else if ( use_sse42 ) printf( " SSE42" );
else if ( use_sse2 ) printf( " SSE2" ); else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" ); if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" ); else if ( use_aes ) printf( " AES" );

18
miner.h
View File

@@ -868,9 +868,9 @@ Options:\n\
yespowerr16 Yenten (YTN)\n\ yespowerr16 Yenten (YTN)\n\
yespower-b2b generic yespower + blake2b\n\ yespower-b2b generic yespower + blake2b\n\
zr5 Ziftr\n\ zr5 Ziftr\n\
-N, --param-n N parameter for scrypt based algos\n\ -N, --param-n=N N parameter for scrypt based algos\n\
-R, --param-r R parameter for scrypt based algos\n\ -R, --param-r=N R parameter for scrypt based algos\n\
-K, --param-key Key (pers) parameter for algos that use it\n\ -K, --param-key=STRING Key (pers) parameter for algos that use it\n\
-o, --url=URL URL of mining server\n\ -o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\
-u, --user=USERNAME username for mining server\n\ -u, --user=USERNAME username for mining server\n\
@@ -886,8 +886,8 @@ Options:\n\
-s, --scantime=N upper bound on time spent scanning current work when\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\
long polling is unavailable, in seconds (default: 5)\n\ long polling is unavailable, in seconds (default: 5)\n\
--randomize Randomize scan range start to reduce duplicates\n\ --randomize Randomize scan range start to reduce duplicates\n\
-f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\ -f, --diff-factor=N Divide req. difficulty by this factor (std is 1.0)\n\
-m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\ -m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\
--hash-meter Display thread hash rates\n\ --hash-meter Display thread hash rates\n\
--coinbase-addr=ADDR payout address for solo mining\n\ --coinbase-addr=ADDR payout address for solo mining\n\
--coinbase-sig=TEXT data to insert in the coinbase when possible\n\ --coinbase-sig=TEXT data to insert in the coinbase when possible\n\
@@ -895,9 +895,9 @@ Options:\n\
--no-getwork disable getwork support\n\ --no-getwork disable getwork support\n\
--no-gbt disable getblocktemplate support\n\ --no-gbt disable getblocktemplate support\n\
--no-stratum disable X-Stratum support\n\ --no-stratum disable X-Stratum support\n\
--no-extranonce disable Stratum extranonce support\n\ --no-extranonce disable Stratum extranonce subscribe\n\
--no-redirect ignore requests to change the URL of the mining server\n\ --no-redirect ignore requests to change the URL of the mining server\n\
-q, --quiet disable per-thread hashmeter output\n\ -q, --quiet reduce log verbosity\n\
--no-color disable colored output\n\ --no-color disable colored output\n\
-D, --debug enable debug output\n\ -D, --debug enable debug output\n\
-P, --protocol-dump verbose dump of protocol-level activities\n" -P, --protocol-dump verbose dump of protocol-level activities\n"
@@ -916,9 +916,9 @@ Options:\n\
--max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\ --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\
--max-diff=N Only mine if net difficulty is less than specified value\n\ --max-diff=N Only mine if net difficulty is less than specified value\n\
-c, --config=FILE load a JSON-format configuration file\n\ -c, --config=FILE load a JSON-format configuration file\n\
--data-file path and name of data file\n\ --data-file=FILE path and name of data file\n\
--verify enable additional time consuming start up tests\n\ --verify enable additional time consuming start up tests\n\
-V, --version display version information and exit\n\ -V, --version display version and CPU information and exit\n\
-h, --help display this help text and exit\n\ -h, --help display this help text and exit\n\
"; ";

View File

@@ -2,22 +2,21 @@
#define SIMD_INT_H__ 1 #define SIMD_INT_H__ 1
// Endian byte swap // Endian byte swap
#define bswap_64( a ) __builtin_bswap64( a ) #define bswap_64 __builtin_bswap64
#define bswap_32( a ) __builtin_bswap32( a ) #define bswap_32 __builtin_bswap32
// Bit rotation
#define rol64 __rolq
#define ror64 __rorq
#define rol32 __rold
#define ror32 __rord
// Safe division, integer or floating point. For floating point it's as // Safe division, integer or floating point. For floating point it's as
// safe as 0. is precisely zero. // safe as 0 is precisely zero.
// Returns safe_result if division by zero. // Returns safe_result if division by zero, typically zero.
#define safe_div( dividend, divisor, safe_result ) \ #define safe_div( dividend, divisor, safe_result ) \
( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) ) ( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) )
// Aliases with familiar names for built in bit rotate instructions
#define rol64( a, n ) _lrotl( a, n )
#define ror64( a, n ) _lrotr( a, n )
#define rol32( a, n ) _rotl( a, n )
#define ror32( a, n ) _rotr( a, n )
#define rol16( a, n ) _rotwl( a, n )
#define ror16( a, n ) _rotwr( a, n )
/////////////////////////////////////// ///////////////////////////////////////
// //