This commit is contained in:
Jay D Dee
2021-10-19 22:35:36 -04:00
parent 47cc5dcff5
commit 1a234cbe53
18 changed files with 474 additions and 189 deletions

View File

@@ -171,6 +171,7 @@ cpuminer_SOURCES = \
algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \
algo/sha/sha2.c \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \

View File

@@ -65,6 +65,20 @@ If not what makes it happen or not happen?
Change Log
----------
v3.18.2
Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
AVX512 for sha256d.
SSE42 and AVX may now be displayed as mining features at startup.
This is hard coded for each algo and is currently only implemented for
scrypt, the only algo with significant performance differences between
those feature levels.
Fixed an issue where a high hashrate algo could cause excessive invalid hash
rate log reports when starting up in benchmark mode.
v3.18.1
More speed for scrypt:

View File

@@ -337,42 +337,42 @@ do{ \
XC2 = XOR( XC2, TC ); \
\
TA = ADD32( XA2, XA1 ); \
XA1 = ROL_1X32( XA1 ); \
TB = ADD32( XB2, XB1 ); \
TC = ADD32( XC2, XC1 ); \
TA = ROL32( TA, 13 ); \
XA1 = ROL_1X32( XA1 ); \
XB1 = ROL_1X32( XB1 ); \
XC1 = ROL_1X32( XC1 ); \
TA = ROL32( TA, 13 ); \
XA3 = XOR( XA3, TA ); \
XC1 = ROL_1X32( XC1 ); \
TB = ROL32( TB, 13 ); \
XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 13 ); \
XC3 = XOR( XC3, TC ); \
\
TA = ADD32( XA3, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB3, XB2 ); \
TC = ADD32( XC3, XC2 ); \
TA = ROL32( TA, 18 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XC2 = SWAP_64( XC2 ); \
XA0 = XOR( XA0, TA ); \
TB = ROL32( TB, 18 ); \
XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
TC = ROL32( TC, 18 ); \
XC0 = XOR( XC0, TC ); \
\
TA = ADD32( XA0, XA1 ); \
XA3 = ROR_1X32( XA3 ); \
TB = ADD32( XB0, XB1 ); \
TC = ADD32( XC0, XC1 ); \
TA = ROL32( TA, 7 ); \
XA3 = ROR_1X32( XA3 ); \
XB3 = ROR_1X32( XB3 ); \
XA3 = XOR( XA3, TA ); \
TB = ROL32( TB, 7 ); \
XB3 = ROR_1X32( XB3 ); \
XC3 = ROR_1X32( XC3 ); \
XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 7 ); \
XC3 = ROR_1X32( XC3 ); \
XC3 = XOR( XC3, TC ); \
\
TA = ADD32( XA3, XA0 ); \
@@ -399,24 +399,24 @@ do{ \
XC1 = XOR( XC1, TC ); \
\
TA = ADD32( XA1, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB1, XB2 ); \
XB2 = SWAP_64( XB2 ); \
TA = ROL32( TA, 18); \
TC = ADD32( XC1, XC2 ); \
XA2 = SWAP_64( XA2 ); \
XC2 = SWAP_64( XC2 ); \
TB = ROL32( TB, 18); \
XA0 = XOR( XA0, TA ); \
XB2 = SWAP_64( XB2 ); \
XA1 = ROR_1X32( XA1 ); \
TC = ROL32( TC, 18); \
XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
XA1 = ROR_1X32( XA1 ); \
XB1 = ROR_1X32( XB1 ); \
XC0 = XOR( XC0, TC ); \
XC1 = ROR_1X32( XC1 ); \
} while (0);
// slow rol, an attempt to optimze non-avx512 bit rotations
// slow rot, an attempt to optimize non-avx512 bit rotations
// Contains target specific instructions, only for use with 128 bit vectors
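// e.g. (generic sketch, not necessarily the exact macro used here): on
// SSE2/AVX2 a 32-bit vector rotate is emulated with two shifts and an OR,
//   ROL32(x,c) -> _mm_or_si128( _mm_slli_epi32( x, c ), _mm_srli_epi32( x, 32-c ) )
// whereas AVX512VL has a single-instruction rotate (_mm_rol_epi32 / vprold).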
#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \
do{ \

View File

@@ -28,7 +28,6 @@
*/
#include "algo-gate-api.h"
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
@@ -55,11 +54,25 @@ static const uint32_t sha256_initial_state[8] =
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
static int scrypt_throughput = 0;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SCRYPT_THROUGHPUT 16
#elif defined(__AVX2__)
#define SCRYPT_THROUGHPUT 8
#else
#define SCRYPT_THROUGHPUT 4
#endif
// static int scrypt_throughput = 0;
static int scratchbuf_size = 0;
static __thread char *scratchbuf = NULL;
static __thread uint32_t *scratchbuf = NULL;
// Change this to a constant so it can be used directly as the input state arg;
// vectors still need an init function.
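For reference, a minimal standalone sketch of the compile-time throughput idea (illustrative names, not the miner's own code): resolving the lane count with the preprocessor lets per-thread buffers be sized statically instead of via a worst-case SCRYPT_MAX_WAYS.

#include <stdio.h>
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define LANES 16
#elif defined(__AVX2__)
  #define LANES 8
#else
  #define LANES 4
#endif

static uint32_t data[ 20 * LANES ];   // one 80-byte block header per lane
static uint32_t hash[  8 * LANES ];   // one 32-byte hash per lane

int main( void )
{
   printf( "lanes %d, data %zu bytes, hash %zu bytes\n",
           LANES, sizeof data, sizeof hash );
   return 0;
}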
@@ -709,15 +722,11 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
#endif // AVX512
//#if defined(USE_ASM) && defined(__x86_64__)
#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
//int scrypt_best_throughput();
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
//#if defined(USE_AVX2)
#if defined(__AVX2__)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
@@ -727,40 +736,39 @@ void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
//#define scrypt_best_throughput() 1
#endif
#include "scrypt-core-4way.h"
static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thr_id )
/*
static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thr_id )
{
uint32_t tstate[8], ostate[8];
uint32_t X[32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy(tstate, midstate, 32);
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core_simd128( X, V, N ); // working
scrypt_core_simd128( X, scratchbuf, N ); // working
// scrypt_core_1way( X, V, N ); // working
// scrypt_core(X, V, N);
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
return true;
}
*/
#if defined(__AVX2__)
#if ( SCRYPT_THROUGHPUT == 8 )
static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 8*8 ];
uint32_t _ALIGN(128) ostate[ 8*8 ];
uint32_t _ALIGN(128) W[ 8*32 ];
uint32_t _ALIGN(128) X[ 8*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60,
input+80, input+100, input+120, input+140, 640 );
@@ -774,11 +782,11 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
if ( opt_param_n > 0x4000 )
{
scrypt_core_simd128_3buf( X, V, N );
scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
scrypt_core_simd128_2buf( X+192, scratchbuf, N );
}
else
{
@@ -786,13 +794,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
intrlv_2x128( W+128, X+128, X+160, 1024 );
intrlv_2x128( W+192, X+192, X+224, 1024 );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)scratchbuf, N );
dintrlv_2x128( X, X+ 32, W, 1024 );
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
dintrlv_2x128( X+128, X+160, W+128, 1024 );
@@ -928,16 +936,15 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if ( SCRYPT_THROUGHPUT == 16 )
static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 16*8 ];
uint32_t _ALIGN(128) ostate[ 16*8 ];
uint32_t _ALIGN(128) W[ 16*32 ];
uint32_t _ALIGN(128) X[ 16*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_16x32( W, input, input+ 20, input+ 40, input+ 60,
input+ 80, input+100, input+120, input+140,
@@ -956,17 +963,17 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
if ( opt_param_n > 0x4000 )
{
scrypt_core_simd128_3buf( X, V, N );
scrypt_core_simd128_3buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
scrypt_core_simd128_3buf( X+ 96, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
scrypt_core_simd128_2buf( X+192, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+256, V, N );
scrypt_core_simd128_3buf( X+256, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+352, V, N );
scrypt_core_simd128_3buf( X+352, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
scrypt_core_simd128_2buf( X+448, scratchbuf, N );
}
else
{
@@ -974,13 +981,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 );
intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 );
scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)V, N );
scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)scratchbuf, N );
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 );
@@ -1236,15 +1243,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
#endif // AVX512
#if defined(__SHA__)
#if 0
static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 2*8 ];
uint32_t _ALIGN(128) ostate[ 2*8 ];
uint32_t _ALIGN(128) W[ 2*32 ];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 );
@@ -1254,7 +1259,7 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
input, input+20, W, W+32 );
scrypt_core_simd128_2buf( W, V, N );
scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32,
@@ -1264,12 +1269,11 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
}
static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[4 * 8];
uint32_t _ALIGN(128) ostate[4 * 8];
uint32_t _ALIGN(128) W[4 * 32];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 );
@@ -1300,9 +1304,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
*/
// working, double buffered linear simd
scrypt_core_simd128_2buf( W, V, N );
scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( W+64, V, N );
scrypt_core_simd128_2buf( W+64, scratchbuf, N );
/*
scrypt_core_simd128_3buf( W, V, N );
@@ -1323,17 +1327,15 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
return 1;
}
#endif
#else
#ifdef HAVE_SHA256_4WAY
#if ( SCRYPT_THROUGHPUT == 4 )
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 4*8 ];
uint32_t _ALIGN(128) ostate[ 4*8 ];
uint32_t _ALIGN(128) W[ 4*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_4x32( W, input, input+20, input+40, input+60, 640 );
for ( int i = 0; i < 8; i++ )
@@ -1346,13 +1348,13 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
{
uint32_t _ALIGN(128) X[ 4*32 ];
dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
scrypt_core_simd128_2buf( X, V, N );
scrypt_core_simd128_2buf( X, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+64, V, N );
scrypt_core_simd128_2buf( X+64, scratchbuf, N );
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
}
else
scrypt_core_4way( (__m128i*)W, (__m128i*)V, N );
scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N );
@@ -1398,65 +1400,73 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
return 1;
}
#endif /* HAVE_SHA256_4WAY */
#endif // SCRYPT_THROUGHPUT == 4
#endif // SHA
//#endif // SHA
extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[ 8*SCRYPT_THROUGHPUT ];
uint32_t _ALIGN(64) data[ 20*SCRYPT_THROUGHPUT ];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8];
uint32_t n = pdata[19] - 1;
int thr_id = mythr->id;
int throughput = scrypt_throughput;
int i;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for ( i = 0; i < throughput; i++ )
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
memcpy( data + i * 20, pdata, 80 );
sha256_transform_le( midstate, data, sha256_initial_state );
do {
bool rc = true;
for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n;
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ ) data[ i*20 + 19 ] = ++n;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
if ( throughput == 16 )
rc = scrypt_N_1_1_256_16way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
else
#endif
#if defined(__AVX2__)
if ( throughput == 8 )
rc = scrypt_N_1_1_256_8way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
else
#endif
if ( throughput == 4 ) // slower on Ryzen than 8way
#if defined(__SHA__)
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if ( SCRYPT_THROUGHPUT == 16 )
// if ( SCRYPT_THROUGHPUT == 16 )
rc = scrypt_N_1_1_256_16way( data, hash, midstate, opt_param_n,
thr_id );
// else
//#endif
//#if defined(__AVX2__)
#elif ( SCRYPT_THROUGHPUT == 8 )
// if ( SCRYPT_THROUGHPUT == 8 )
rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
thr_id );
// else
//#endif
#elif ( SCRYPT_THROUGHPUT == 4 )
// if ( SCRYPT_THROUGHPUT == 4 ) // slower on Ryzen than 8way
//#if defined(__SHA__)
// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
// thr_id );
//#else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
thr_id );
#else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
#error "Invalid SCRYPT_THROUGHPUT"
#endif
/*
#if defined(__SHA__)
else
if (throughput == 2 ) // slower on Ryzen than 4way_sha & 8way
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
if ( SCRYPT_THROUGHPUT == 2 ) // slower on Ryzen than 4way_sha & 8way
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
thr_id );
#endif
else // should never get here
rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
rc = scrypt_N_1_1_256( data, hash, midstate, opt_param_n, thr_id );
*/
// test the hash
if ( rc )
for ( i = 0; i < throughput; i++ )
for ( i = 0; i < SCRYPT_THROUGHPUT; i++ )
{
if ( unlikely( valid_hash( hash + i*8, ptarget ) && !opt_benchmark ) )
{
@@ -1468,7 +1478,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
}
} while ( likely( ( n < ( max_nonce - throughput ) ) && !(*restart) ) );
} while ( likely( ( n < ( max_nonce - SCRYPT_THROUGHPUT ) ) && !(*restart) ) );
*hashes_done = n - pdata[19];
pdata[19] = n;
@@ -1489,7 +1499,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
//#if defined(__SHA__)
// gate->optimizations = SSE2_OPT | SHA_OPT;
//#else
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT;
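// SSE42 and AVX are listed here so they can be reported as mining features
// at startup; scrypt's SIMD128 kernels benefit from them (see the change
// log entry above).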
//#endif
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
@@ -1497,8 +1507,11 @@ bool register_scrypt_algo( algo_gate_t* gate )
opt_param_n = opt_param_n ? opt_param_n : 1024;
applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n );
// scrypt_throughput can be defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
scrypt_throughput = 16;
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
@@ -1511,13 +1524,13 @@ bool register_scrypt_algo( algo_gate_t* gate )
*/
#elif defined(__AVX2__)
scrypt_throughput = 8;
// scrypt_throughput = 8;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
#else
scrypt_throughput = 4;
// scrypt_throughput = 4;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
@@ -1533,7 +1546,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
format_number_si( &d_size, d_units );
applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n",
scrypt_throughput, t_size, t_units, d_size, d_units );
SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units );
return true;
};
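A worked example of the scratch buffer sizing above, under stated assumptions (default N = 1024 and an AVX2 build taking the low-N, 2-way branch):

#include <stdio.h>

int main( void )
{
   const int opt_param_n     = 1024;                  // default N
   const int scratchbuf_size = opt_param_n * 2 * 128; // 262144 bytes = 256 KiB per thread
   printf( "scratch buffer per thread: %d bytes\n", scratchbuf_size );
   return 0;
}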

View File

@@ -84,6 +84,11 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

View File

@@ -8,7 +8,7 @@
* any later version. See COPYING for more details.
*/
#include "algo-gate-api.h"
#include "sha256d-4way.h"
#include <string.h>
#include <inttypes.h>
@@ -181,6 +181,8 @@ static const uint32_t sha256d_hash1[16] = {
};
// this performs the entire hash all over again, why?
// because the main function only does 56 rounds.
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{
uint32_t S[16];
@@ -492,7 +494,7 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_4way( struct work *work,
static inline int scanhash_sha256d_4way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -553,7 +555,7 @@ static inline int scanhash_sha256d_4way( struct work *work,
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_8way( struct work *work,
static inline int scanhash_sha256d_8way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -609,7 +611,7 @@ static inline int scanhash_sha256d_8way( struct work *work,
#endif /* HAVE_SHA256_8WAY */
int scanhash_sha256d( struct work *work,
int scanhash_sha256d_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
@@ -625,11 +627,11 @@ int scanhash_sha256d( struct work *work,
#ifdef HAVE_SHA256_8WAY
if (sha256_use_8way())
return scanhash_sha256d_8way( work, max_nonce, hashes_done, mythr );
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
#endif
#ifdef HAVE_SHA256_4WAY
if (sha256_use_4way())
return scanhash_sha256d_4way( work, max_nonce, hashes_done, mythr );
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
#endif
memcpy(data, pdata + 16, 64);
@@ -690,9 +692,13 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_sha256d;
// gate->hash = (void*)&sha256d;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#else
gate->scanhash = (void*)&scanhash_sha256d_pooler;
#endif
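// Only the 16-way (AVX512) path uses the new sha256d-4way scanhash; other
// builds keep the pooler-derived scanhash for now (see sha256d-4way.h).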
// gate->hash = (void*)&sha256d;
return true;
};

View File

@@ -548,6 +548,136 @@ void sha256_8way_init( sha256_8way_context *sc )
sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
}
// Aggressive prehashing, LE byte order
void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H;
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in + 1 );
C = _mm256_load_si256( state_in + 2 );
D = _mm256_load_si256( state_in + 3 );
E = _mm256_load_si256( state_in + 4 );
F = _mm256_load_si256( state_in + 5 );
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
_mm256_store_si256( state_mid , A );
_mm256_store_si256( state_mid + 1, B );
_mm256_store_si256( state_mid + 2, C );
_mm256_store_si256( state_mid + 3, D );
_mm256_store_si256( state_mid + 4, E );
_mm256_store_si256( state_mid + 5, F );
_mm256_store_si256( state_mid + 6, G );
_mm256_store_si256( state_mid + 7, H );
}
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid )
{
__m256i A, B, C, D, E, F, G, H;
__m256i W[16];
memcpy_256( W, data, 16 );
A = _mm256_load_si256( state_mid );
B = _mm256_load_si256( state_mid + 1 );
C = _mm256_load_si256( state_mid + 2 );
D = _mm256_load_si256( state_mid + 3 );
E = _mm256_load_si256( state_mid + 4 );
F = _mm256_load_si256( state_mid + 5 );
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
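// Without AVX512VL's ternary-logic MAJ, re-seed the carried Y_xor_Z term:
// after the three prehashed rounds the working variables have rotated, so
// round 3's b and c operands are G and H.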
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H );
#endif
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x_MEXP( 13, 8, 0, 15 );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
A = _mm256_add_epi32( A, _mm256_load_si256( state_in ) );
B = _mm256_add_epi32( B, _mm256_load_si256( state_in + 1 ) );
C = _mm256_add_epi32( C, _mm256_load_si256( state_in + 2 ) );
D = _mm256_add_epi32( D, _mm256_load_si256( state_in + 3 ) );
E = _mm256_add_epi32( E, _mm256_load_si256( state_in + 4 ) );
F = _mm256_add_epi32( F, _mm256_load_si256( state_in + 5 ) );
G = _mm256_add_epi32( G, _mm256_load_si256( state_in + 6 ) );
H = _mm256_add_epi32( H, _mm256_load_si256( state_in + 7 ) );
_mm256_store_si256( state_out , A );
_mm256_store_si256( state_out + 1, B );
_mm256_store_si256( state_out + 2, C );
_mm256_store_si256( state_out + 3, D );
_mm256_store_si256( state_out + 4, E );
_mm256_store_si256( state_out + 5, F );
_mm256_store_si256( state_out + 6, G );
_mm256_store_si256( state_out + 7, H );
}
// Need to handle an odd byte length for yespower.
// Assume only the last update is odd.

View File

@@ -53,4 +53,8 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
#define sha256_transform_be sph_sha256_transform_be
#endif
// The SHA-NI instructions can't do only 3 rounds, so the prehash uses the scalar version
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif

View File

@@ -1,4 +1,4 @@
#include "sha256t-gate.h"
#include "sha256d-4way.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -13,7 +13,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
__m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
@@ -46,11 +46,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data
sha256_16way_transform_le( midstate, vdata, initstate );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
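// Words 0-2 of this second block are constant while scanning; only the
// nonce (word 3) changes per attempt, so rounds 0-2 are done once here and
// sha256_16way_final_rounds() resumes from round 3 inside the loop.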
do
{
@@ -59,7 +58,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
@@ -99,7 +98,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -130,8 +130,10 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform_le( midstate, vdata, initstate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
@@ -140,7 +142,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate );
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );
@@ -253,3 +255,20 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
#endif
/*
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;
#elif defined(SHA256D_4WAY)
gate->scanhash = (void*)&scanhash_sha256d_4way;
#endif
// gate->hash = (void*)&sha256d;
return true;
};
*/

algo/sha/sha256d-4way.h Normal file
View File

@@ -0,0 +1,48 @@
#ifndef __SHA256D_4WAY_H__
#define __SHA256D_4WAY_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256D_16WAY 1
/*
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#define SHA256D_4WAY 1
*/
#endif
bool register_sha256d_algo( algo_gate_t* gate );
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
/*
#if defined(SHA256D_8WAY)
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
/*
#if defined(__SHA__)
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
#endif

View File

@@ -13,7 +13,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (32)));
__m512i initstate[8] __attribute__ ((aligned (32)));
__m512i midstate[8] __attribute__ ((aligned (32)));
__m512i midstate1[8] __attribute__ ((aligned (32)));
__m512i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m512i vdata[20] __attribute__ ((aligned (32)));
@@ -46,11 +46,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 byte block of data
sha256_16way_transform_le( midstate, vdata, initstate );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
@@ -59,7 +58,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_512( block + 5, 10 );
block[15] = m512_const1_32( 80*8 ); // bit count
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_512( block, hash32, 8 );
@@ -104,7 +103,8 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
__m256i block[16] __attribute__ ((aligned (64)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m256i vdata[20] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
@@ -135,8 +135,10 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_8way_transform_le( midstate, vdata, initstate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
do
{
@@ -145,7 +147,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
block[ 4] = last_byte;
memset_zero_256( block + 5, 10 );
block[15] = m256_const1_32( 80*8 ); // bit count
sha256_8way_transform_le( hash32, block, midstate );
sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
// 2. 32 byte hash from 1.
memcpy_256( block, hash32, 8 );

View File

@@ -702,6 +702,36 @@ memcpy( state_out, state_in, 32 );
}
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in )
{
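// Rounds 0-2 read only data[0..2]; the nonce lives in a later word of the
// block, so this 3-round midstate can be computed once per work unit and
// reused for every nonce.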
uint32_t t1, t2, X_xor_Y, Y_xor_Z = state_in[1] ^ state_in[2];
memcpy( state_out, state_in, 32 );
t1 = state_out[7] + BSG2_1( state_out[4] )
+ CH( state_out[4], state_out[5], state_out[6] ) + 0x428A2F98 + data[0];
t2 = BSG2_0( state_out[0] )
+ MAJ( state_out[0], state_out[1], state_out[2] );
Y_xor_Z = X_xor_Y;
state_out[3] += t1;
state_out[7] = t1 + t2;
t1 = state_out[6] + BSG2_1( state_out[3] )
+ CH( state_out[3], state_out[4], state_out[5] ) + 0x71374491 + data[1];
t2 = BSG2_0( state_out[7] )
+ MAJ( state_out[7], state_out[0], state_out[1] );
Y_xor_Z = X_xor_Y;
state_out[2] += t1;
state_out[6] = t1 + t2;
t1 = state_out[5] + BSG2_1( state_out[2] )
+ CH( state_out[2], state_out[3], state_out[4] ) + 0xB5C0FBCF + data[2];
t2 = BSG2_0( state_out[6] )
+ MAJ( state_out[6], state_out[7], state_out[0] );
state_out[1] += t1;
state_out[5] = t1 + t2;
}
/* see sph_sha2.h */
void
sph_sha224_init(void *cc)

View File

@@ -215,6 +215,9 @@ void sph_sha256_transform_le( uint32_t *state_out, const uint32_t *data,
void sph_sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sph_sha256_prehash_3rounds( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if SPH_64

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.2.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.18.1'
PACKAGE_STRING='cpuminer-opt 3.18.1'
PACKAGE_VERSION='3.18.2'
PACKAGE_STRING='cpuminer-opt 3.18.2'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.18.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.18.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.18.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.18.2:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.18.1
cpuminer-opt configure 3.18.2
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.18.1, which was
It was created by cpuminer-opt $as_me 3.18.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.18.1'
VERSION='3.18.2'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.18.1, which was
This file was extended by cpuminer-opt $as_me 3.18.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.18.1
cpuminer-opt config.status 3.18.2
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.18.1])
AC_INIT([cpuminer-opt], [3.18.2])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1112,19 +1112,17 @@ void report_summary_log( bool force )
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
submit_rate, (double)submitted_share_count*60. /
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) );
submit_rate, safe_div( (double)submitted_share_count*60.,
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ), 0. ) );
applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)",
shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );
if ( accepted_share_count < submitted_share_count )
{
double lost_ghrate = uptime.tv_sec == 0 ? 0.
: target_diff
* (double)(submitted_share_count - accepted_share_count )
/ (double)uptime.tv_sec;
double lost_shrate = share_time == 0. ? 0.
: target_diff * (double)(submits - accepts ) / share_time;
double lost_ghrate = safe_div( target_diff
* (double)(submitted_share_count - accepted_share_count ),
(double)uptime.tv_sec, 0. );
double lost_shrate = safe_div( target_diff * (double)(submits - accepts ), share_time, 0. );
char lshr_units[4] = {0};
char lghr_units[4] = {0};
scale_hash_for_display( &lost_shrate, lshr_units );
@@ -2495,6 +2493,8 @@ static void *miner_thread( void *userdata )
timeval_subtract( &uptime, &total_hashes_time, &session_start );
double hashrate = safe_div( total_hashes, uptime.tv_sec, 0. );
if ( hashrate > 0. )
{
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
@@ -2508,6 +2508,7 @@ static void *miner_thread( void *userdata )
hi_freq / 1e6 );
#endif
}
}
} // benchmark
// conditional mining
@@ -2900,6 +2901,7 @@ static bool cpu_capability( bool display_only )
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
@@ -2907,6 +2909,8 @@ static bool cpu_capability( bool display_only )
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
bool use_aes;
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_sha;
@@ -2976,6 +2980,8 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha ) printf( " SHA" );
if ( !display_only )
{
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
@@ -2989,6 +2995,7 @@ static bool cpu_capability( bool display_only )
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
}
}
printf("\n");
if ( display_only ) return true;
@@ -3022,6 +3029,8 @@ static bool cpu_capability( bool display_only )
// Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
@@ -3038,6 +3047,8 @@ static bool cpu_capability( bool display_only )
{
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
else if ( use_sse42 ) printf( " SSE42" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );

miner.h
View File

@@ -868,9 +868,9 @@ Options:\n\
yespowerr16 Yenten (YTN)\n\
yespower-b2b generic yespower + blake2b\n\
zr5 Ziftr\n\
-N, --param-n N parameter for scrypt based algos\n\
-R, --param-r R parameter for scrypt based algos\n\
-K, --param-key Key (pers) parameter for algos that use it\n\
-N, --param-n=N N parameter for scrypt based algos\n\
-R, --param-r=N R parameter for scrypt based algos\n\
-K, --param-key=STRING Key (pers) parameter for algos that use it\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
-u, --user=USERNAME username for mining server\n\
@@ -886,8 +886,8 @@ Options:\n\
-s, --scantime=N upper bound on time spent scanning current work when\n\
long polling is unavailable, in seconds (default: 5)\n\
--randomize Randomize scan range start to reduce duplicates\n\
-f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\
-m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\
-f, --diff-factor=N Divide req. difficulty by this factor (std is 1.0)\n\
-m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\
--hash-meter Display thread hash rates\n\
--coinbase-addr=ADDR payout address for solo mining\n\
--coinbase-sig=TEXT data to insert in the coinbase when possible\n\
@@ -895,9 +895,9 @@ Options:\n\
--no-getwork disable getwork support\n\
--no-gbt disable getblocktemplate support\n\
--no-stratum disable X-Stratum support\n\
--no-extranonce disable Stratum extranonce support\n\
--no-extranonce disable Stratum extranonce subscribe\n\
--no-redirect ignore requests to change the URL of the mining server\n\
-q, --quiet disable per-thread hashmeter output\n\
-q, --quiet reduce log verbosity\n\
--no-color disable colored output\n\
-D, --debug enable debug output\n\
-P, --protocol-dump verbose dump of protocol-level activities\n"
@@ -916,9 +916,9 @@ Options:\n\
--max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\
--max-diff=N Only mine if net difficulty is less than specified value\n\
-c, --config=FILE load a JSON-format configuration file\n\
--data-file path and name of data file\n\
--data-file=FILE path and name of data file\n\
--verify enable additional time consuming start up tests\n\
-V, --version display version information and exit\n\
-V, --version display version and CPU information and exit\n\
-h, --help display this help text and exit\n\
";

View File

@@ -2,22 +2,21 @@
#define SIMD_INT_H__ 1
// Endian byte swap
#define bswap_64( a ) __builtin_bswap64( a )
#define bswap_32( a ) __builtin_bswap32( a )
#define bswap_64 __builtin_bswap64
#define bswap_32 __builtin_bswap32
// Bit rotation
#define rol64 __rolq
#define ror64 __rorq
#define rol32 __rold
#define ror32 __rord
// Safe division, integer or floating point. For floating point it's as
// safe as 0. is precisely zero.
// Returns safe_result if division by zero.
// safe as a floating point 0 comparing exactly equal to zero.
// Returns safe_result (typically zero) if the divisor is zero.
#define safe_div( dividend, divisor, safe_result ) \
( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) ) )
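// Example: hash rate reporting divides by elapsed seconds, which can be
// zero right after startup; safe_div() then returns the fallback instead
// of inf/NaN:
//   double hrate = safe_div( (double)total_hashes, elapsed_sec, 0. );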
// Aliases with familiar names for built in bit rotate instructions
#define rol64( a, n ) _lrotl( a, n )
#define ror64( a, n ) _lrotr( a, n )
#define rol32( a, n ) _rotl( a, n )
#define ror32( a, n ) _rotr( a, n )
#define rol16( a, n ) _rotwl( a, n )
#define ror16( a, n ) _rotwr( a, n )
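// Example: rol32( 0x80000001u, 1 ) == 0x00000003 (bit 31 wraps around to bit 0).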
///////////////////////////////////////
//