v3.19.7

v3.19.6
v3.19.5
2025-09-17 23:44:27 +00:00 · 2022-04-02 12:44:57 -04:00 · 2022-02-21 23:14:24 -05:00 · 2022-01-30 20:59:54 -05:00 · 2022-01-12 21:08:25 -05:00
33 changed files with 1468 additions and 583 deletions
--- a/33
+++ b/33
@@ -65,6 +65,39 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 v3.19.7
 #369 Fixed time limited mining, --time-limit.
 Fixed a potential compile error when using optimization below -O3.
 v3.19.6
 #363 Fixed a stratum bug where the first job may be ignored, delaying the start of hashing.
 Fixed handling of nonce exhaustion when hashing a fast algo with extranonce disabled.
 Small optimization to Shavite.
 v3.19.5
 Enhanced stratum-keepalive preemptively resets the stratum connection
 before the server does, to avoid lost shares.
 Added build-msys2.sh shell script for easier compiling on Windows, see Wiki for details.
 X16RT: eliminate unnecessary recalculations of the hash order.
 Fix a few compiler warnings.
 Fixed log colour error when a block is solved.
 v3.19.4
 #359: Fix verthash memory allocation for non-hugepages, broken in v3.19.3.
 New option stratum-keepalive prevents stratum timeouts when no shares are
 submitted for several minutes due to high difficulty.
 Fixed a bug displaying optimizations for some algos.
 v3.19.3
 Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available.
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h
@@ -344,7 +344,7 @@ static size_t
 detect_cpu(void) {
 	//union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
 	//cpu_vendors_x86 vendor = cpu_nobody;
-	x86_regs regs;
+	x86_regs regs; regs.eax = regs.ebx = regs.ecx = 0;
 	uint32_t max_level, max_ext_level;
 	size_t cpu_flags = 0;
 #if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
@@ -460,4 +460,4 @@ get_top_cpuflag_desc(size_t flag) {
 	#endif
 #endif
-#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
+#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h
@@ -4,11 +4,12 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc
 #endif
 /* romix pre/post nop function */
 /*
 static void asm_calling_convention
 scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
 	(void)blocks; (void)nblocks;
 }
-
+*/
 /* romix pre/post endian conversion function */
 static void asm_calling_convention
 scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -70,7 +70,10 @@ void decred_be_build_stratum_request( char *req, struct work *work,
         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
   free(xnonce2str);
 }
 #if !defined(min)
 /* Smaller of a and b (returns a when they compare equal).
  * Both operands are parenthesized so that arguments containing
  * lower-precedence operators (e.g. `x & mask`) compare correctly.
  * NOTE: the selected argument is evaluated twice; do not pass
  * expressions with side effects. */
 #define min(a,b) ((a) > (b) ? (b) : (a))
 #endif
 void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 {
--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -594,9 +594,6 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
 #define rb6(x)    mm256_rol_64( x, 43 ) 
 #define rb7(x)    mm256_rol_64( x, 53 ) 
 #define rol_off_64( M, j ) \
   mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
 #define add_elt_b( mj0, mj3, mj10, h, K ) \
  _mm256_xor_si256( h, _mm256_add_epi64( K, \
              _mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
@@ -732,8 +729,23 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
   qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] ); 
   __m256i mj[16];
-   for ( i = 0; i < 16; i++ )
+
-      mj[i] = rol_off_64( M, i );
+   mj[ 0] = mm256_rol_64( M[ 0],  1 );
   mj[ 1] = mm256_rol_64( M[ 1],  2 );
   mj[ 2] = mm256_rol_64( M[ 2],  3 );
   mj[ 3] = mm256_rol_64( M[ 3],  4 );
   mj[ 4] = mm256_rol_64( M[ 4],  5 );
   mj[ 5] = mm256_rol_64( M[ 5],  6 );
   mj[ 6] = mm256_rol_64( M[ 6],  7 );
   mj[ 7] = mm256_rol_64( M[ 7],  8 );
   mj[ 8] = mm256_rol_64( M[ 8],  9 );
   mj[ 9] = mm256_rol_64( M[ 9], 10 );
   mj[10] = mm256_rol_64( M[10], 11 );
   mj[11] = mm256_rol_64( M[11], 12 );
   mj[12] = mm256_rol_64( M[12], 13 );
   mj[13] = mm256_rol_64( M[13], 14 );
   mj[14] = mm256_rol_64( M[14], 15 );
   mj[15] = mm256_rol_64( M[15], 16 );
   qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
              (const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
@@ -1034,9 +1046,6 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #define r8b6(x)    mm512_rol_64( x, 43 )
 #define r8b7(x)    mm512_rol_64( x, 53 )
 #define rol8w_off_64( M, j ) \
   mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
 #define add_elt_b8( mj0, mj3, mj10, h, K ) \
  _mm512_xor_si512( h, _mm512_add_epi64( K, \
              _mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
@@ -1171,41 +1180,73 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
   qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
   __m512i mj[16];
-   for ( i = 0; i < 16; i++ )
+   uint64_t K = 16 * 0x0555555555555555ULL;
-      mj[i] = rol8w_off_64( M, i );
+ 
   mj[ 0] = mm512_rol_64( M[ 0],  1 );
   mj[ 1] = mm512_rol_64( M[ 1],  2 );
   mj[ 2] = mm512_rol_64( M[ 2],  3 );
   mj[ 3] = mm512_rol_64( M[ 3],  4 );
   mj[ 4] = mm512_rol_64( M[ 4],  5 );
   mj[ 5] = mm512_rol_64( M[ 5],  6 );
   mj[ 6] = mm512_rol_64( M[ 6],  7 );
   mj[ 7] = mm512_rol_64( M[ 7],  8 );
   mj[ 8] = mm512_rol_64( M[ 8],  9 );
   mj[ 9] = mm512_rol_64( M[ 9], 10 );
   mj[10] = mm512_rol_64( M[10], 11 );
   mj[11] = mm512_rol_64( M[11], 12 );
   mj[12] = mm512_rol_64( M[12], 13 );
   mj[13] = mm512_rol_64( M[13], 14 );
   mj[14] = mm512_rol_64( M[14], 15 );
   mj[15] = mm512_rol_64( M[15], 16 );
   qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
-              (const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
-              (const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
-              (const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
-              (const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
-              (const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
-              (const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
-              (const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
-              (const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
-              (const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
-              (const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
-              (const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
-              (const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
-              (const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
-              (const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
-              (const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   K += 0x0555555555555555ULL;
   qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
-              (const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) );
+                        (const __m512i)_mm512_set1_epi64( K ) );
   qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
   qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
--- a/algo/hodl/sha512-avx.h
+++ b/algo/hodl/sha512-avx.h
@@ -45,6 +45,6 @@ void sha512Compute32b_parallel(
        uint64_t *data[SHA512_PARALLEL_N],
        uint64_t *digest[SHA512_PARALLEL_N]);
-void sha512ProcessBlock(Sha512Context *context);
+void sha512ProcessBlock(Sha512Context contexti[2] );
 #endif
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -69,7 +69,6 @@ void allium_16way_hash( void *state, const void *input )
   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
                hash15, 256 );
 //   rintrlv_8x32_8x64( vhashA, vhash, 256 );
   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
   keccak256_8way_close( &ctx.keccak, vhashA);
   keccak256_8way_init( &ctx.keccak );
@@ -284,7 +283,7 @@ void allium_8way_hash( void *hash, const void *input )
   blake256_8way_close( &ctx.blake, vhashA );
   dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                     vhashA, 256 );
+                 vhashA, 256 );
   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -49,7 +49,7 @@ void lyra2z_16way_hash( void *state, const void *input )
    dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
              hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
-               vhash, 256 );
+              vhash, 256 );
    intrlv_2x256( vhash, hash0, hash1, 256 );
    LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -261,7 +261,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
 // overlap it's unified.
 // As a result normal is Nrows-2 / Nrows.
 // for 4 rows: 1 unified, 2 overlap, 1 normal.
-// for 8 rows: 1 unified, 2 overlap, 56 normal.
+// for 8 rows: 1 unified, 2 overlap, 5 normal.
 static inline void reducedDuplexRow_2way_normal( uint64_t *State,
                   uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
@@ -283,6 +283,15 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
   for ( i = 0; i < nCols; i++ )
   {
     //Absorbing "M[prev] [+] M[row*]"
     io0 = _mm512_load_si512( inout0    );
     io1 = _mm512_load_si512( inout0 +1 );
     io2 = _mm512_load_si512( inout0 +2 );
     io0 = _mm512_mask_load_epi64( io0, 0xf0, inout1    );
     io1 = _mm512_mask_load_epi64( io1, 0xf0, inout1 +1 );
     io2 = _mm512_mask_load_epi64( io2, 0xf0, inout1 +2 );
 /*
     io0 = _mm512_mask_blend_epi64( 0xf0,
                                    _mm512_load_si512( (__m512i*)inout0 ),
                                    _mm512_load_si512( (__m512i*)inout1 ) );
@@ -292,6 +301,7 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
     io2 = _mm512_mask_blend_epi64( 0xf0,
                                    _mm512_load_si512( (__m512i*)inout0 +2 ),
                                    _mm512_load_si512( (__m512i*)inout1 +2 ) );
 */
     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
@@ -359,6 +369,15 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
   for ( i = 0; i < nCols; i++ )
   {
     //Absorbing "M[prev] [+] M[row*]"
     io0.v512 = _mm512_load_si512( inout0    );
     io1.v512 = _mm512_load_si512( inout0 +1 );
     io2.v512 = _mm512_load_si512( inout0 +2 );
     io0.v512 = _mm512_mask_load_epi64( io0.v512, 0xf0, inout1    );
     io1.v512 = _mm512_mask_load_epi64( io1.v512, 0xf0, inout1 +1 );
     io2.v512 = _mm512_mask_load_epi64( io2.v512, 0xf0, inout1 +2 );
 /*
     io0.v512 = _mm512_mask_blend_epi64( 0xf0,
                                  _mm512_load_si512( (__m512i*)inout0 ),
                                  _mm512_load_si512( (__m512i*)inout1 ) );
@@ -368,27 +387,12 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
     io2.v512 = _mm512_mask_blend_epi64( 0xf0,
                                  _mm512_load_si512( (__m512i*)inout0 +2 ),
                                  _mm512_load_si512( (__m512i*)inout1 +2 ) );
 */
     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) );
     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) );
     state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) );
 /* 
     io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
                                  _mm512_load_si512( (__m512i*)inout0 ),
                                  _mm512_load_si512( (__m512i*)inout1 ) );
     io.v512[1] = _mm512_mask_blend_epi64( 0xf0,
                                  _mm512_load_si512( (__m512i*)inout0 +1 ),
                                  _mm512_load_si512( (__m512i*)inout1 +1 ) );
     io.v512[2] = _mm512_mask_blend_epi64( 0xf0,
                                  _mm512_load_si512( (__m512i*)inout0 +2 ),
                                  _mm512_load_si512( (__m512i*)inout1 +2 ) );
     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
     state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
 */
     //Applies the reduced-round transformation f to the sponge's state
     LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
@@ -415,22 +419,6 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
          io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] );
       }
 /*
       if ( rowOut == rowInOut0 )
       {
          io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
          io.v512[1] = _mm512_mask_blend_epi64( 0x0f, io.v512[1], out[1] );
          io.v512[2] = _mm512_mask_blend_epi64( 0x0f, io.v512[2], out[2] );
       }
       if ( rowOut == rowInOut1 )
       {
          io.v512[0] = _mm512_mask_blend_epi64( 0xf0, io.v512[0], out[0] );
          io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
          io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
       }
 */
       //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
       t0 = _mm512_permutex_epi64( state0, 0x93 );
       t1 = _mm512_permutex_epi64( state1, 0x93 );
@@ -444,12 +432,23 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
                                 _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
     }
 /*     
      casti_m256i( inout0, 0 ) = _mm512_castsi512_si256( io0.v512 );
      casti_m256i( inout0, 2 ) = _mm512_castsi512_si256( io1.v512 );
      casti_m256i( inout0, 4 ) = _mm512_castsi512_si256( io2.v512 );
     _mm512_mask_store_epi64( inout1,    0xf0, io0.v512 );
     _mm512_mask_store_epi64( inout1 +1, 0xf0, io1.v512 );
     _mm512_mask_store_epi64( inout1 +2, 0xf0, io2.v512 );
 */
      casti_m256i( inout0, 0 ) = io0.v256lo;
      casti_m256i( inout1, 1 ) = io0.v256hi;
      casti_m256i( inout0, 2 ) = io1.v256lo;
      casti_m256i( inout1, 3 ) = io1.v256hi;
      casti_m256i( inout0, 4 ) = io2.v256lo;
      casti_m256i( inout1, 5 ) = io2.v256hi;
 /*     
     _mm512_mask_store_epi64( inout0,    0x0f, io.v512[0] );
     _mm512_mask_store_epi64( inout1,    0xf0, io.v512[0] );
--- a/algo/ripemd/sph_ripemd.c
+++ b/algo/ripemd/sph_ripemd.c
@@ -35,6 +35,7 @@
 #include "sph_ripemd.h"
 #if 0
 /*
 * Round functions for RIPEMD (original).
 */
@@ -46,6 +47,7 @@ static const sph_u32 oIV[5] = {
 	SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
 	SPH_C32(0x98BADCFE), SPH_C32(0x10325476)
 };
 #endif
 /*
 * Round functions for RIPEMD-128 and RIPEMD-160.
@@ -63,6 +65,8 @@ static const sph_u32 IV[5] = {
 #define ROTL    SPH_ROTL32
 #if 0
 /* ===================================================================== */
 /*
 * RIPEMD (original hash, deprecated).
@@ -539,6 +543,8 @@ sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4])
 #undef RIPEMD128_IN
 }
 #endif
 /* ===================================================================== */
 /*
 * RIPEMD-160.
--- a/algo/ripemd/sph_ripemd.h
+++ b/algo/ripemd/sph_ripemd.h
@@ -84,6 +84,7 @@
 * can be cloned by copying the context (e.g. with a simple
 * <code>memcpy()</code>).
 */
 #if 0
 typedef struct {
 #ifndef DOXYGEN_IGNORE
 	unsigned char buf[64];    /* first field, for alignment */
@@ -204,6 +205,8 @@ void sph_ripemd128_close(void *cc, void *dst);
 */
 void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]);
 #endif
 /* ===================================================================== */
 /**
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -18,10 +18,13 @@ static const uint32_t IV512[] =
        0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
 };
-
+/*
 #define mm256_ror2x256hi_1x32( a, b ) \
   _mm256_blend_epi32( mm256_shuflr128_32( a ), \
                       mm256_shuflr128_32( b ), 0x88 )
 */
 //#define mm256_ror2x256hi_1x32( a, b ) _mm256_alignr_epi8( b, a, 4 )
 #if defined(__VAES__)
@@ -127,24 +130,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
     // round 2, 6, 10
-     k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
+     k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ), zero );
-     k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
+     k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
-     k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
+     k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
-     k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
+     k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
     p2 = _mm256_xor_si256( p2, x );
-     k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
+     k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ), zero );
-     k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
+     k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
-     k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
+     k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
-     k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
+     k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
     p0 = _mm256_xor_si256( p0, x );
@@ -183,24 +186,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
     // round 4, 8, 12
-     k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
+     k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
-     k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
+     k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
-     k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
+     k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
-     k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
+     k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
     p0 = _mm256_xor_si256( p0, x );
-     k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
+     k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero );
-     k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
+     k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
-     k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
+     k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
-     k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
+     k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
     p2 = _mm256_xor_si256( p2, x );
--- a/algo/shavite/shavite-hash-4way.c
+++ b/algo/shavite/shavite-hash-4way.c
@@ -11,10 +11,6 @@ static const uint32_t IV512[] =
        0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
 };
 #define mm512_ror2x512hi_1x32( a, b ) \
   _mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \
                                    mm512_shuflr128_32( b ) )
 static void
 c512_4way( shavite512_4way_context *ctx, const void *msg )
 {
@@ -106,24 +102,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
     // round 2, 6, 10
-     K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
+     K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
-     K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
+     K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
-     K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
+     K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
-     K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
+     K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
     P2 = _mm512_xor_si512( P2, X );
-     K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
+     K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
-     K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
+     K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
-     K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
+     K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
-     K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
+     K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
     P0 = _mm512_xor_si512( P0, X );
@@ -162,24 +158,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
     // round 4, 8, 12
-     K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
+     K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
-     K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
+     K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
-     K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
+     K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
-     K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
+     K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
     P0 = _mm512_xor_si512( P0, X );
-     K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
+     K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
-     K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
+     K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
-     K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
+     K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
-     K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
+     K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
     X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
     P2 = _mm512_xor_si512( P2, X );
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -59,30 +59,6 @@ static const sph_u32 IV512[] = {
 	C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
 };
 // Partially rotate elements in two 128 bit vectors a & b as one 256 bit vector
 // and return the rotated 128 bit vector a.
 // a[3:0] = { b[0], a[3], a[2], a[1] }
 #if defined(__SSSE3__)
 #define mm128_ror256hi_1x32( a, b )  _mm_alignr_epi8( b, a, 4 )
 #else  // SSE2
 #define mm128_ror256hi_1x32( a, b ) \
   _mm_or_si128( _mm_srli_si128( a,  4 ), \
                 _mm_slli_si128( b, 12 ) )
 #endif
 /*
 #if defined(__AVX2__)
 // 2 way version of above
 // a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] }
 #define mm256_ror2x256hi_1x32( a, b ) \
   _mm256_blend_epi32( mm256_ror256_1x32( a ), \
                       mm256_rol256_3x32( b ), 0x88 )
 #endif
 */
 static void
 c512( sph_shavite_big_context *sc, const void *msg )
@@ -190,31 +166,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
      // round 2, 6, 10
-      k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
+      k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
      x = _mm_xor_si128( p3, k00 );
      x = _mm_aesenc_si128( x, zero );
-      k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
+      k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, zero );
-      k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
+      k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, zero );
-      k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
+      k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, zero );
      p2 = _mm_xor_si128( p2, x );
-      k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
+      k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
      x = _mm_xor_si128( p1, k10 );
      x = _mm_aesenc_si128( x, zero );
-      k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
+      k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, zero );
-      k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
+      k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, zero );
-      k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
+      k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, zero );
@@ -262,31 +238,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
      // round 4, 8, 12
-      k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
+      k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
      x = _mm_xor_si128( p1, k00 );
      x = _mm_aesenc_si128( x, zero );
-      k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
+      k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, zero );
-      k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
+      k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, zero );
-      k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
+      k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, zero );
      p0 = _mm_xor_si128( p0, x );
-      k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
+      k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
      x = _mm_xor_si128( p3, k10 );
      x = _mm_aesenc_si128( x, zero );
-      k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
+      k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, zero );
-      k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
+      k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, zero );
-      k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
+      k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, zero );
--- a/algo/shavite/sph_shavite.c
+++ b/algo/shavite/sph_shavite.c
@@ -35,7 +35,7 @@
 #include "sph_shavite.h"
-#if !defined(__AES__)
+#if !(defined(__AES__) && defined(__SSSE3__))
 #ifdef __cplusplus
 extern "C"{
--- a/algo/shavite/sph_shavite.h
+++ b/algo/shavite/sph_shavite.h
@@ -263,7 +263,7 @@ void sph_shavite384_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);
 //Don't call these directly from application code, use the macros below.
-#ifdef __AES__
+#if defined(__AES__) && defined(__SSSE3__)
 void sph_shavite512_aesni_init(void *cc);
 void sph_shavite512_aesni(void *cc, const void *data, size_t len);
--- a/algo/sm3/sph_sm3.h
+++ b/algo/sm3/sph_sm3.h
@@ -74,7 +74,7 @@ typedef struct {
 void sm3_init(sm3_ctx_t *ctx);
 void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len);
-void sm3_final(sm3_ctx_t *ctx, unsigned char digest[SM3_DIGEST_LENGTH]);
+void sm3_final(sm3_ctx_t *ctx, unsigned char *digest);
 void sm3_compress(uint32_t digest[8], const unsigned char block[SM3_BLOCK_SIZE]);
 void sm3(const unsigned char *data, size_t datalen,
 	unsigned char digest[SM3_DIGEST_LENGTH]);
--- a/algo/verthash/Verthash.c
+++ b/algo/verthash/Verthash.c
@@ -87,16 +87,17 @@ int verthash_info_init(verthash_info_t* info, const char* file_name)
    // Allocate data
    info->data = (uint8_t *)malloc_hugepages( fileSize );
    if ( info->data )
       if ( !opt_quiet ) applog( LOG_INFO, "Verthash data is using huge pages");
    else
    {
       if ( !opt_quiet ) applog( LOG_INFO, "Verthash data is using huge pages");
    }
    else
       info->data = (uint8_t *)_mm_malloc( fileSize, 64 );
-       if (!info->data)
+
-       {
+    if ( !info->data )
-           fclose(fileMiningData);
+    {
-           // Memory allocation fatal error.
+        fclose( fileMiningData );
-           return 2;
+        // Memory allocation fatal error.
-       }
+        return 2;
    }
    // Load data
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -24,15 +24,15 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
   if ( bench )   ptarget[7] = 0x0cff;
   static __thread uint32_t s_ntime = UINT32_MAX;
-   uint32_t ntime = bswap_32( pdata[17] );
+   uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
-   if ( s_ntime != ntime )
+   if ( s_ntime != masked_ntime )
   {
-      x16rt_getTimeHash( ntime, &timeHash );
+      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
-      s_ntime = ntime;
+      s_ntime = masked_ntime;
      if ( opt_debug && !thr_id )
          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
-                               x16r_hash_order, ntime, timeHash );
+                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
   }
   x16r_8way_prehash( vdata, pdata );
@@ -78,15 +78,15 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
   if ( bench )  ptarget[7] = 0x0cff;
   static __thread uint32_t s_ntime = UINT32_MAX;
-   uint32_t ntime = bswap_32( pdata[17] );
+   uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
-   if ( s_ntime != ntime )
+   if ( s_ntime != masked_ntime )
   {
-      x16rt_getTimeHash( ntime, &timeHash );
+      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
-      s_ntime = ntime;
+      s_ntime = masked_ntime;
      if ( opt_debug && !thr_id )
          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
-                               x16r_hash_order, ntime, timeHash );
+                            x16r_hash_order, bswap_32( pdata[17] ), timeHash );
   }
   x16r_4way_prehash( vdata, pdata );
--- a/algo/x16/x16rt.c
+++ b/algo/x16/x16rt.c
@@ -20,15 +20,15 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
   mm128_bswap32_80( edata, pdata );
   static __thread uint32_t s_ntime = UINT32_MAX;
-   uint32_t ntime = swab32( pdata[17] );
+   uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80;
-   if ( s_ntime != ntime )
+   if ( s_ntime != masked_ntime )
   {
-      x16rt_getTimeHash( ntime, &timeHash );
+      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
-      s_ntime = ntime;
+      s_ntime = masked_ntime;
      if ( opt_debug && !thr_id )
          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
-                               x16r_hash_order, ntime, timeHash );
+                        x16r_hash_order, swab32( pdata[17] ), timeHash );
   }
   x16r_prehash( edata, pdata );
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -36,8 +36,8 @@ mv cpuminer cpuminer-avx2-sha-vaes
 # AVX2 SHA AES: AMD Zen1
 make clean || echo done
 rm -f config.status
-CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl
+#CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl
-#CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl
+CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl
 make -j 8
 strip -s cpuminer
 mv cpuminer cpuminer-avx2-sha
--- a/build-msys2.sh
+++ b/build-msys2.sh
@@ -0,0 +1,10 @@
 #!/bin/bash
 #
 # Compile on Windows using MSYS2 and MinGW.
 #
 # Produces a single cpuminer binary tuned for the CPU of the build machine.

 # Start from a clean tree. A failing distclean is tolerated: the "|| echo"
 # keeps the script going and just prints a marker.
 make distclean || echo clean
 # Remove any leftover configure state so configure starts fresh.
 rm -f config.status
 # Failure of autogen.sh is tolerated in the same way as distclean above.
 ./autogen.sh || echo done
 # -march=native targets the build machine's CPU; -D_WIN32_WINNT=0x0601
 # selects the Windows 7 API level.
 # NOTE(review): the reason for --param=evrp-mode=legacy is not visible
 # here - presumably a compiler workaround, confirm before removing.
 CFLAGS="-O3 --param=evrp-mode=legacy -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
 # Build with 4 parallel jobs.
 make -j 4
 # Strip all symbols from the resulting binary.
 strip -s cpuminer
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.3.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.7.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.19.3'
+PACKAGE_VERSION='3.19.7'
-PACKAGE_STRING='cpuminer-opt 3.19.3'
+PACKAGE_STRING='cpuminer-opt 3.19.7'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.19.3 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.19.7 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.19.3:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.19.7:";;
   esac
  cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.19.3
+cpuminer-opt configure 3.19.7
 generated by GNU Autoconf 2.69
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 3.19.3, which was
+It was created by cpuminer-opt $as_me 3.19.7, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  $ $0 $@
@@ -2993,7 +2993,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.19.3'
+ VERSION='3.19.7'
 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.19.3, which was
+This file was extended by cpuminer-opt $as_me 3.19.7, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.19.3
+cpuminer-opt config.status 3.19.7
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.19.3])
+AC_INIT([cpuminer-opt], [3.19.7])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -105,8 +105,9 @@ bool opt_randomize = false;
 static int opt_retries = -1;
 static int opt_fail_pause = 10;
 static int opt_time_limit = 0;
 static unsigned int time_limit_stop = 0;
 int opt_timeout = 300;
-static int opt_scantime = 5;
+static int opt_scantime = 0;
 const int min_scantime = 1;
 //static const bool opt_time = true;
 enum algos opt_algo = ALGO_NULL;
@@ -127,6 +128,12 @@ char *short_url = NULL;
 char *coinbase_address;
 char *opt_data_file = NULL;
 bool opt_verify = false;
 static bool opt_stratum_keepalive = false;
 static struct timeval stratum_keepalive_timer;
 // Stratum typically times out in 5 minutes or 300 seconds
 #define stratum_keepalive_timeout 180  // 3 minutes
 static struct timeval stratum_reset_time;
 // pk_buffer_size is used as a version selector by b58 code, therefore
 // it must be set correctly to work.
@@ -187,7 +194,6 @@ int default_api_listen = 4048;
 static struct   timeval session_start;
 static struct   timeval five_min_start;
 static uint64_t session_first_block = 0;
 static double   latency_sum = 0.;
 static uint64_t submit_sum  = 0;
 static uint64_t accept_sum  = 0;
 static uint64_t stale_sum  = 0;
@@ -336,6 +342,7 @@ void get_currentalgo(char* buf, int sz)
 void proper_exit(int reason)
 {
   if (opt_debug) applog(LOG_INFO,"Program exit");
 #ifdef WIN32
 	if (opt_background) {
 		HWND hcon = GetConsoleWindow();
@@ -1143,7 +1150,7 @@ void report_summary_log( bool force )
               solved, solved_block_count );
   }
   if ( stratum_errors )
-      applog2( LOG_INFO, "Stratum errors               %7d", stratum_errors );
+      applog2( LOG_INFO, "Stratum resets               %7d", stratum_errors );
   applog2( LOG_INFO, "Hi/Lo Share Diff  %.5g /  %.5g",
            highest_share, lowest_share );
@@ -1274,7 +1281,6 @@ static int share_result( int result, struct work *work,
      else          reject_sum++;
   }
   submit_sum++;
   latency_sum += latency;
   pthread_mutex_unlock( &stats_lock );
@@ -1290,9 +1296,9 @@ static int share_result( int result, struct work *work,
     else              rcol = CL_LRD;
   }
-   applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)",
+   applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s%s, %.3f sec (%dms)",
           my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
-           bres, share_time, latency );
+           bres, CL_N, share_time, latency );
   if ( unlikely( opt_debug || !result || solved ) )
   {
@@ -2110,7 +2116,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
   {
      unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
                                             g_work->xnonce2_len );
-      applog( LOG_INFO, "Extranonce2 %s, Block %d, Job %s",
+      applog( LOG_INFO, "Extranonce2 0x%s, Block %d, Job %s",
                        xnonce2str, sctx->block_height, g_work->job_id );
      free( xnonce2str );
   }
@@ -2197,8 +2203,6 @@ static void *miner_thread( void *userdata )
 //                      : 0;
   uint32_t end_nonce = 0xffffffffU / opt_n_threads  * (thr_id + 1) - 0x20;
   time_t   firstwork_time = 0;
   int  i;
   memset( &work, 0, sizeof(work) );
   /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE
@@ -2242,7 +2246,7 @@ static void *miner_thread( void *userdata )
   if ( !algo_gate.miner_thread_init( thr_id ) )
   {
-      applog( LOG_ERR, "FAIL: thread %u failed to initialize", thr_id );
+      applog( LOG_ERR, "FAIL: thread %d failed to initialize", thr_id );
      exit (1);
   }
@@ -2270,22 +2274,34 @@ static void *miner_thread( void *userdata )
          {
             while ( unlikely( stratum_down ) )
                sleep( 1 );
-             if ( *nonceptr >= end_nonce )
+             if ( unlikely( ( *nonceptr >= end_nonce )
-                stratum_gen_work( &stratum, &g_work );
+                         && !work_restart[thr_id].restart ) )
             {
                if ( opt_extranonce )
                   stratum_gen_work( &stratum, &g_work );
                else
                {
                   if ( !thr_id )
                   {
                      applog( LOG_WARNING, "nonce range exhausted, extranonce not subscribed" );
                      applog( LOG_WARNING, "waiting for new work...");
                   }
                   while ( !work_restart[thr_id].restart )
                      sleep ( 1 );
                }
             }
          }
-          else
+          else if ( !opt_benchmark ) // GBT or getwork
          {
             pthread_rwlock_wrlock( &g_work_lock );
-             if ( ( ( time(NULL) - g_work_time )
+             if ( ( ( time(NULL) - g_work_time ) >= opt_scantime )
                 >= ( have_longpoll ? LP_SCANTIME : opt_scantime ) )
               || ( *nonceptr >= end_nonce ) )
             {
                if ( unlikely( !get_work( mythr, &g_work ) ) )
                {
                   pthread_rwlock_unlock( &g_work_lock );
-		             applog( LOG_ERR, "work retrieval failed, exiting "
+		             applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
 		                              "mining thread %d", thr_id );
 		             goto out;
 	             }
                g_work_time = time(NULL);
@@ -2308,25 +2324,14 @@ static void *miner_thread( void *userdata )
       if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
          continue;
-// LP_SCANTIME overrides opt_scantime option, is this right?
+       // opt_scantime expressed in hashes
-
+       max64 = opt_scantime * thr_hashrates[thr_id];
       // adjust max_nonce to meet target scan time. Stratum and longpoll
       // can go longer because they can rely on restart_threads to signal
       // an early abort. get_work on the other hand can't rely on
       // restart_threads so need a much shorter scantime
       if ( have_stratum )
          max64 = 60 * thr_hashrates[thr_id];
       else if ( have_longpoll )
          max64 = LP_SCANTIME * thr_hashrates[thr_id];
       else  // getwork inline
          max64 = opt_scantime * thr_hashrates[thr_id];   
       // time limit
-       if ( unlikely( opt_time_limit && firstwork_time ) )
+       if ( unlikely( opt_time_limit ) )
       {
-          int passed = (int)( time(NULL) - firstwork_time );
+          unsigned int now = (unsigned int)time(NULL);
-          int remain = (int)( opt_time_limit - passed );
+          if ( now >= time_limit_stop )
          if ( remain < 0 )
          {
             if ( thr_id != 0 )
             {
@@ -2338,14 +2343,16 @@ static void *miner_thread( void *userdata )
                char rate[32];
                format_hashrate( global_hashrate, rate );
                applog( LOG_NOTICE, "Benchmark: %s", rate );
                fprintf(stderr, "%llu\n", (unsigned long long)global_hashrate);
             }
             else
-                applog( LOG_NOTICE,
+                applog( LOG_NOTICE, "Mining timeout of %ds reached, exiting...",
-	          "Mining timeout of %ds reached, exiting...", opt_time_limit);
+                        opt_time_limit);
-	       proper_exit(0);
+
             proper_exit(0);
          }
-          if ( remain < max64 ) max64 = remain;
+          // else
          if ( time_limit_stop - now < opt_scantime )
              max64 = ( time_limit_stop - now ) * thr_hashrates[thr_id] ;
       }
       // Select nonce range based on max64, the estimated number of hashes
@@ -2361,8 +2368,6 @@ static void *miner_thread( void *userdata )
          max_nonce = work_nonce + (uint32_t)max64;
       // init time
       if ( firstwork_time == 0 )
          firstwork_time = time(NULL);
       hashes_done = 0;
       gettimeofday( (struct timeval *) &tv_start, NULL );
@@ -2435,7 +2440,7 @@ static void *miner_thread( void *userdata )
       {
          double hashrate  = 0.;
          pthread_mutex_lock( &stats_lock );
-          for ( i = 0; i < opt_n_threads; i++ )
+          for ( int i = 0; i < opt_n_threads; i++ )
              hashrate  += thr_hashrates[i];
          global_hashrate  = hashrate;
          pthread_mutex_unlock( &stats_lock );
@@ -2729,6 +2734,18 @@ void std_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
          sctx->job.final_sapling_hash );
 }
 // Loop is out of order:
 //
 //   connect/reconnect
 //   handle message
 //   get new message
 //
 // change to
 //   connect/reconnect
 //   get new message
 //   handle message
 static void *stratum_thread(void *userdata )
 {
   struct thr_info *mythr = (struct thr_info *) userdata;
@@ -2746,6 +2763,7 @@ static void *stratum_thread(void *userdata )
      if ( unlikely( stratum_need_reset ) )
      {
          stratum_need_reset = false;
          gettimeofday( &stratum_reset_time, NULL );
          stratum_down = true;
          stratum_errors++;
          stratum_disconnect( &stratum );
@@ -2756,7 +2774,7 @@ static void *stratum_thread(void *userdata )
 	          applog(LOG_BLUE, "Connection changed to %s", short_url);
          }
          else 
-	          applog(LOG_WARNING, "Stratum connection reset");
+	          applog(LOG_BLUE, "Stratum connection reset");
          // reset stats queue as well
          restart_threads();
          if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0;
@@ -2788,15 +2806,12 @@ static void *stratum_thread(void *userdata )
         {
            stratum_down = false;
            applog(LOG_BLUE,"Stratum connection established" );
            if ( stratum.new_job )   // prime first job
               stratum_gen_work( &stratum, &g_work );
         }
      }
-      report_summary_log( ( stratum_diff != stratum.job.diff )
+      // Wait for new message from server
                       && ( stratum_diff != 0. ) );
      if ( stratum.new_job )
         stratum_gen_work( &stratum, &g_work );
      if ( likely( stratum_socket_full( &stratum, opt_timeout ) ) )
      {
         if ( likely( s = stratum_recv_line( &stratum ) ) )
@@ -2819,6 +2834,53 @@ static void *stratum_thread(void *userdata )
 //         stratum_disconnect( &stratum );
      }
      report_summary_log( ( stratum_diff != stratum.job.diff )
                       && ( stratum_diff != 0. ) );
      if ( !stratum_need_reset )
      {
         // Is keepalive needed? Mutex would normally be required but that
         // would block any attempt to submit a share. A share is more
         // important even if it messes up the keepalive.
         if ( opt_stratum_keepalive )
         {
            struct timeval now, et;
            gettimeofday( &now, NULL );
            // any shares submitted since last keepalive?
            if ( last_submit_time.tv_sec > stratum_keepalive_timer.tv_sec )
               memcpy( &stratum_keepalive_timer, &last_submit_time,
                       sizeof (struct timeval) );
            timeval_subtract( &et, &now, &stratum_keepalive_timer );
            if ( et.tv_sec > stratum_keepalive_timeout )
            {
                double diff = stratum.job.diff * 0.5;
                stratum_keepalive_timer = now;
                if ( !opt_quiet )
                   applog( LOG_BLUE,
                           "Stratum keepalive requesting lower difficulty" );
                stratum_suggest_difficulty( &stratum, diff );
            }
            if ( last_submit_time.tv_sec > stratum_reset_time.tv_sec )
              timeval_subtract( &et, &now, &last_submit_time );
            else
              timeval_subtract( &et, &now, &stratum_reset_time );
            if ( et.tv_sec > stratum_keepalive_timeout + 60 )
            {
               applog( LOG_NOTICE, "No shares submitted, resetting stratum connection" );
               stratum_need_reset = true;
               stratum_keepalive_timer = now;
            }
         } // stratum_keepalive
         if ( stratum.new_job && !stratum_need_reset )
            stratum_gen_work( &stratum, &g_work );
      } // stratum_need_reset
   }  // loop
 out:
  return NULL;
@@ -2990,8 +3052,8 @@ static bool cpu_capability( bool display_only )
     use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
     use_sha    = cpu_has_sha    && sw_has_sha    && algo_has_sha;
     use_vaes   = cpu_has_vaes   && sw_has_vaes   && algo_has_vaes;
-     use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 ||
+     use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512
-                   use_sha || use_vaes );
+                || use_avx2 || use_sha || use_vaes );
     // Display best options
     printf( "\nStarting miner with" );
@@ -3407,7 +3469,8 @@ void parse_arg(int key, char *arg )
      break;
 	case 1021:  // cpu-priority
 		v = atoi(arg);
-		if (v < 0 || v > 5)	/* sanity check */
+      applog(LOG_NOTICE,"--cpu-priority is deprecated and will be removed from a future release");
      if (v < 0 || v > 5)	/* sanity check */
 			show_usage_and_exit(1);
 		opt_priority = v;
 		break;
@@ -3443,14 +3506,18 @@ void parse_arg(int key, char *arg )
 		break;
 	case 1024:
 		opt_randomize = true;
-		break;
+      applog(LOG_NOTICE,"--randomize is deprecated and will be removed from a future release");
      break;
   case 1027:  // data-file
      opt_data_file = strdup( arg );
      break;
   case 1028:  // verify
      opt_verify = true;
      break;
-	case 'V':
+   case 1029:  // stratum-keepalive
      opt_stratum_keepalive = true;
      break;
   case 'V':
      display_cpu_capability();
      exit(0);
 	case 'h':
@@ -3625,6 +3692,17 @@ int main(int argc, char *argv[])
      show_usage_and_exit(1);
   }
   if ( !opt_scantime )
   {
      if      ( have_stratum )  opt_scantime = 30;
      else if ( have_longpoll ) opt_scantime = LP_SCANTIME;
      else                      opt_scantime = 5;
   }
   if ( opt_time_limit )
      time_limit_stop = (unsigned int)time(NULL) + opt_time_limit;
   // need to register to get algo optimizations for cpu capabilities
   // but that causes registration logs before cpu capabilities is output.
   // Would need to split register function into 2 parts. First part sets algo
@@ -3833,6 +3911,8 @@ int main(int argc, char *argv[])
      if ( opt_debug )
         applog(LOG_INFO,"Creating stratum thread");
      stratum.new_job = false;  // just to make sure
      /* init stratum thread info */
 		stratum_thr_id = opt_n_threads + 2;
 		thr = &thr_info[stratum_thr_id];
@@ -3899,6 +3979,8 @@ int main(int argc, char *argv[])
   gettimeofday( &last_submit_time, NULL );
   memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
   memcpy( &session_start, &last_submit_time, sizeof (struct timeval) );
   memcpy( &stratum_keepalive_timer, &last_submit_time, sizeof (struct timeval) );
   memcpy( &stratum_reset_time, &last_submit_time, sizeof (struct timeval) );
   memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) );
   pthread_mutex_unlock( &stats_lock );
--- a/miner.h
+++ b/miner.h
@@ -466,6 +466,7 @@ void stratum_disconnect(struct stratum_ctx *sctx);
 bool stratum_subscribe(struct stratum_ctx *sctx);
 bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
 bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
 bool stratum_suggest_difficulty( struct stratum_ctx *sctx, double diff );
 extern bool aes_ni_supported;
@@ -823,6 +824,7 @@ Options:\n\
                          qubit         Qubit\n\
                          scrypt        scrypt(1024, 1, 1) (default)\n\
                          scrypt:N      scrypt(N, 1, 1)\n\
                          scryptn2      scrypt(1048576, 1,1)\n\
                          sha256d       Double SHA-256\n\
                          sha256q       Quad SHA-256, Pyrite (PYE)\n\
                          sha256t       Triple SHA-256, Onecoin (OC)\n\
@@ -885,10 +887,10 @@ Options:\n\
  -T, --timeout=N       timeout for long poll and stratum (default: 300 seconds)\n\
  -s, --scantime=N      upper bound on time spent scanning current work when\n\
                          long polling is unavailable, in seconds (default: 5)\n\
-      --randomize       Randomize scan range start to reduce duplicates\n\
+      --randomize       randomize scan range (deprecated)\n\
-  -f, --diff-factor=N   Divide req. difficulty by this factor (std is 1.0)\n\
+  -f, --diff-factor=N   divide req. difficulty by this factor (std is 1.0)\n\
  -m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\
-      --hash-meter      Display thread hash rates\n\
+      --hash-meter      display thread hash rates\n\
      --coinbase-addr=ADDR  payout address for solo mining\n\
      --coinbase-sig=TEXT  data to insert in the coinbase when possible\n\
      --no-longpoll     disable long polling support\n\
@@ -909,15 +911,16 @@ Options:\n\
  -B, --background      run the miner in the background\n\
      --benchmark       run in offline benchmark mode\n\
      --cpu-affinity    set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\
-      --cpu-priority    set process priority (default: 0 idle, 2 normal to 5 highest)\n\
+      --cpu-priority    set process priority (default: 0 idle, 2 normal to 5 highest) (deprecated)\n\
  -b, --api-bind=address[:port]   IP address for the miner API, default port is 4048)\n\
-      --api-remote      Allow remote control\n\
+      --api-remote      allow remote control\n\
-      --max-temp=N      Only mine if cpu temp is less than specified value (linux)\n\
+      --max-temp=N      only mine if cpu temp is less than specified value (linux)\n\
-      --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\
+      --max-rate=N[KMG] only mine if net hashrate is less than specified value\n\
-      --max-diff=N      Only mine if net difficulty is less than specified value\n\
+      --max-diff=N      only mine if net difficulty is less than specified value\n\
  -c, --config=FILE     load a JSON-format configuration file\n\
      --data-file=FILE  path and name of data file\n\
      --verify          enable additional time consuming start up tests\n\
      --stratum-keepalive  prevent disconnects when difficulty is too high\n\
  -V, --version         display version and CPU information and exit\n\
  -h, --help            display this help text and exit\n\
 ";
@@ -987,6 +990,7 @@ static struct option const options[] = {
        { "userpass", 1, NULL, 'O' },
        { "data-file", 1, NULL, 1027 },
        { "verify", 0, NULL, 1028 },
        { "stratum-keepalive", 0, NULL, 1029 },
        { "version", 0, NULL, 'V' },
        { 0, 0, 0, 0 }
 };
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -272,9 +272,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #endif
 // Mask making
 // Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
 // Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
 // The integer vector is only reinterpreted as double/float so the
 // movemask instruction can collect the sign bit of each element. The
 // result is a plain int and must not be cast back to a vector.
 #define mm_movmask_64( v ) \
    _mm_movemask_pd( _mm_castsi128_pd( v ) )
 #define mm_movmask_32( v ) \
    _mm_movemask_ps( _mm_castsi128_ps( v ) )
-// Diagonal blend: d = s3[3], s2[2], s1[1], s0[0] ||
+// Diagonal blend
 // Blend 4 32 bit elements from 4 vectors
@@ -284,7 +294,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
  mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \
                  _mm_blend_epi32( s1, s0, 0x1 ), 0x3 )
-#elif defined(__SSE4_1)
+#elif defined(__SSE4_1__)
 #define mm128_diagonal_32( v3, v2, v1, v0 ) \
  mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \
@@ -401,6 +411,16 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_rol_16( v, c ) \
   _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
 // Limited 2 input shuffle
 // Wrap the floating point 2 input shuffles so they can be applied to
 // integer vectors: a supplies the low half of the result, b the high half,
 // c is the immediate selector passed unchanged to the intrinsic.
 // These are expression macros, so they carry no trailing semicolon and can
 // be nested inside other expressions.
 #define mm128_shuffle2_64( a, b, c ) \
    _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
                                      _mm_castsi128_pd( b ), c ) )
 #define mm128_shuffle2_32( a, b, c ) \
    _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
                                      _mm_castsi128_ps( b ), c ) )
 //
 // Rotate vector elements across all lanes
@@ -532,9 +552,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
 #if defined(__SSSE3__)
 // Function macro with two inputs and one output, inputs are preserved.
 // Returns modified first arg.
 // Two input functions are not available without SSSE3. Use procedure
-// belowe instead.
+// macros below instead.
 #define mm128_shufl2r_64( v1, v2 )     _mm_alignr_epi8( v2, v1, 8 )
 #define mm128_shufl2l_64( v1, v2 )     _mm_alignr_epi8( v1, v2, 8 )
@@ -548,12 +567,11 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
 // Shuffle the concatenation of v1 and v2 using a byte-wise alignr.
 // NOTE(review): the byte count used here is 8, identical to the _64
 // variants; a shift by one 8 bit element would presumably use 1 -
 // confirm the intended count before relying on these.
 #define mm128_shufl2r_8( v1, v2 )      _mm_alignr_epi8( v2, v1, 8 )
 #define mm128_shufl2l_8( v1, v2 )      _mm_alignr_epi8( v1, v2, 8 )
-// Procedure macroswith 2 inputs and 2 outputs, inputs are destroyed.
+// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
 // Returns both modified args in place.
 // These macros retain the vrol/vror name for now to avoid
 // confusion with the shufl2r/shuffle2l function macros above.
-// These may be renamed to something like shufl2r2 for 2 1nputs and
+// These may be renamed to something like shufl2r2 for 2 inputs and
 // 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.
 #define mm128_vror256_64( v1, v2 ) \
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -233,6 +233,18 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 #endif
 // Mask making
 // Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
 // Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
 // The integer vector is only reinterpreted as double/float so the
 // movemask instruction can collect the sign bit of each element. The
 // result is a plain int and must not be cast back to a vector.
 #define mm256_movmask_64( v ) \
    _mm256_movemask_pd( _mm256_castsi256_pd( v ) )
 #define mm256_movmask_32( v ) \
    _mm256_movemask_ps( _mm256_castsi256_ps( v ) )
 // Diagonal blending
 // Blend 4 64 bit elements from 4 vectors
@@ -405,6 +417,16 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //
 // Rotate elements within each 128 bit lane of 256 bit vector.
 // Limited 2 input shuffle
 // Wrap the floating point 2 input shuffles so they can be applied to
 // integer vectors; c is the immediate selector passed unchanged to the
 // intrinsic. These are expression macros, so they carry no trailing
 // semicolon and can be nested inside other expressions.
 #define mm256_shuffle2_64( a, b, c ) \
    _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( a ), \
                                            _mm256_castsi256_pd( b ), c ) )
 #define mm256_shuffle2_32( a, b, c ) \
    _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
                                            _mm256_castsi256_ps( b ), c ) )
 #define mm256_swap128_64( v )  _mm256_shuffle_epi32( v, 0x4e )
 #define mm256_shuflr128_64 mm256_swap128_64
 #define mm256_shufll128_64 mm256_swap128_64
@@ -485,20 +507,6 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
   v2 = _mm256_xor_si256( v1, v2 ); \
   v1 = _mm256_xor_si256( v1, v2 );
 #define mm256_vror512_128( v1, v2 ) \
 do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
   v1 = _mm256_permute2x128( v2, v1, 0x21 ); \
   v2 = t; \
 } while(0)
 #define mm256_vrol512_128( v1, v2 ) \
 do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
   v2 = _mm256_permute2x128( v2, v1, 0x21 ); \
   v1 = t; \
 } while(0)
 #endif // __AVX2__
 #endif // SIMD_256_H__
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -493,7 +493,7 @@ static inline __m512i mm512_shufll_32( const __m512i v )
 // Rotate the 64 bit elements of v right by n positions across the whole
 // 512 bit vector, implemented as an alignr of v concatenated with itself.
 // NOTE(review): _mm512_alignr_epi64 takes an immediate count, so n
 // presumably has to be a compile time constant once inlined - confirm.
 static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
 { return _mm512_alignr_epi64( v, v, n ); }
-static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
+static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 { return _mm512_alignr_epi32( v, v, n ); }
 #define mm512_shuflr_16( v ) \
@@ -581,8 +581,17 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
                     0x0e0d0c0b0a090807, 0x060504030201001f ) )
 //
-// Shuffle-roate elements within 128 bit lanes of 512 bit vector.
+// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
 // Limited 2 input, 1 output shuffle within 128 bit lanes.
 // Wrap the floating point 2 input shuffles so they can be applied to
 // integer vectors; c is the immediate selector passed unchanged to the
 // intrinsic. These are expression macros, so they carry no trailing
 // semicolon and can be nested inside other expressions.
 #define mm512_shuffle2_64( a, b, c ) \
    _mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
                                            _mm512_castsi512_pd( b ), c ) )
 #define mm512_shuffle2_32( a, b, c ) \
    _mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( a ), \
                                            _mm512_castsi512_ps( b ), c ) )
 // Swap 64 bits in each 128 bit lane
 #define mm512_swap128_64( v )   _mm512_shuffle_epi32( v, 0x4e )
 #define mm512_shuflr128_64  mm512_swap128_64
@@ -610,6 +619,7 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 // shufl2r is 2 input ...
 // Drop macros? They can easily be rebuilt using shufl2 functions
 // 2 input, 1 output
 // Shuffle concatenated { v1, v2 } right or left by 256 bits and return
 // rotated v1 
 // visually confusing for shufl2r because of arg order. First arg is always
@@ -627,76 +637,5 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 // Result holds 32 bit elements 1..15 of v1 in positions 0..14 and element 0
 // of v2 in position 15.
 #define mm512_shufl2r_32( v1, v2 )     _mm512_alignr_epi32( v2, v1, 1 )
 // Result holds 32 bit elements 1..15 of v2 in positions 0..14 and element 0
 // of v1 in position 15.
 // NOTE(review): this is shufl2r_32 with the arguments exchanged, not a shift
 // by 15 elements - confirm that is the intended "left" behaviour.
 #define mm512_shufl2l_32( v1, v2 )     _mm512_alignr_epi32( v1, v2, 1 )
 // Rotate elements from 2 512 bit vectors in place, source arguments
 //  are overwritten.
 // Exchange the contents of v1 and v2. Both arguments must be __m512i lvalues.
 // Wrapped in do/while(0) so the macro behaves as a single statement, e.g.
 // after an unbraced if. A temporary is used rather than an XOR swap so that
 // passing the same variable twice leaves it unchanged instead of zeroing it.
 #define mm512_swap1024_512( v1, v2 ) \
 do { \
   __m512i t = v1; \
   v1 = v2; \
   v2 = t; \
 } while(0)
 // Rotating the pair by 512 bits in either direction is the same swap.
 // These lines must not end in a backslash, otherwise the following line is
 // spliced into the alias definition.
 #define mm512_shufl2l_512 mm512_swap1024_512
 #define mm512_shufl2r_512 mm512_swap1024_512
 // Deprecated, will be removed. Use shufl2 functions instead. Leave them as is
 // for now.
 //  Rotate elements from 2 512 bit vectors in place, both source arguments
 //  are updated.
 // Rotate the 1024 bit ring formed by v1 and v2 right by 256 bits.
 // Result (high : low halves):
 //    v1 = { v2.lo256 : v1.hi256 },  v2 = { v1.lo256 : v2.hi256 }
 // t holds the new v2, computed before v1 is overwritten.
 #define mm512_vror1024_256( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
   v2 = t; \
 } while(0)
 // Rotate the 1024 bit ring formed by v1 and v2 left by 256 bits.
 // Result (high : low halves):
 //    v1 = { v1.lo256 : v2.hi256 },  v2 = { v2.lo256 : v1.hi256 }
 // t holds the new v1 so the second alignr still reads the original v1.
 #define mm512_vrol1024_256( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
   v1 = t; \
 } while(0)
 // Rotate the 1024 bit ring formed by v1 and v2 right by 128 bits.
 // Result in 64 bit elements (high : low):
 //    v1 = { v2[1:0] : v1[7:2] },  v2 = { v1[1:0] : v2[7:2] }
 // t holds the new v2, computed before v1 is overwritten.
 #define mm512_vror1024_128( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
   v2 = t; \
 } while(0)
 // Rotate the 1024 bit ring formed by v1 and v2 left by 128 bits.
 // Result in 64 bit elements (high : low):
 //    v1 = { v1[5:0] : v2[7:6] },  v2 = { v2[5:0] : v1[7:6] }
 // t holds the new v1 so the second alignr still reads the original v1.
 #define mm512_vrol1024_128( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
   v1 = t; \
 } while(0)
 // Rotate the 1024 bit ring formed by v1 and v2 right by 64 bits.
 // Result in 64 bit elements (high : low):
 //    v1 = { v2[0] : v1[7:1] },  v2 = { v1[0] : v2[7:1] }
 // t holds the new v2, computed before v1 is overwritten.
 #define mm512_vror1024_64( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
   v2 = t; \
 } while(0)
 // Rotate the 1024 bit ring formed by v1 and v2 left by 64 bits.
 // Result in 64 bit elements (high : low):
 //    v1 = { v1[6:0] : v2[7] },  v2 = { v2[6:0] : v1[7] }
 // t holds the new v1 so the second alignr still reads the original v1.
 #define mm512_vrol1024_64( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
   v1 = t; \
 } while(0)
 // Rotate the 1024 bit ring formed by v1 and v2 right by 32 bits.
 // Result in 32 bit elements (high : low):
 //    v1 = { v2[0] : v1[15:1] },  v2 = { v1[0] : v2[15:1] }
 // t holds the new v2, computed before v1 is overwritten.
 #define mm512_vror1024_32( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
   v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
   v2 = t; \
 } while(0)
 // Rotate the 1024 bit ring formed by v1 and v2 left by 32 bits.
 // Result in 32 bit elements (high : low):
 //    v1 = { v1[14:0] : v2[15] },  v2 = { v2[14:0] : v1[15] }
 // t holds the new v1 so the second alignr still reads the original v1.
 #define mm512_vrol1024_32( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
   v2 = _mm512_alignr_epi32( v2, v1, 15 ); \
   v1 = t; \
 } while(0)
 #endif // AVX512
 #endif // SIMD_512_H__
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -209,7 +209,7 @@ static inline void cpu_getname(char *outbuf, size_t maxsz)
 {
   memset(outbuf, 0, maxsz);
 #ifdef WIN32
-   char brand[0xC0] = { 0 };
+   char brand[256] = { 0 };
   int output[4] = { 0 }, ext;
   cpuid(0x80000000, output);
   ext = output[0];
--- a/util.c
+++ b/util.c
@@ -1658,7 +1658,7 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i
 	pthread_mutex_unlock(&sctx->work_lock);
   if ( !opt_quiet ) /* pool dynamic change */
-      applog( LOG_INFO, "Stratum extranonce1= %s, extranonce2 size= %d",
+      applog( LOG_INFO, "Stratum extranonce1 0x%s, extranonce2 size %d",
         xnonce1, xn2_size);
 	return true;
@@ -1846,6 +1846,25 @@ out:
 	return ret;
 }
 /**
  * Send a "mining.suggest_difficulty" request on the stratum connection.
  *
  * @param sctx  stratum context, passed through to stratum_send_line().
  * @param diff  difficulty to suggest; formatted with "%f".
  * @return true if the request was handed to stratum_send_line() successfully,
  *         false if it could not be formatted or the send failed.
  */
 bool stratum_suggest_difficulty( struct stratum_ctx *sctx, double diff )
 {
   // The fixed text of the request is 64 characters and "%f" of a large
   // double can add over 300 more, so the former 80 byte buffer overflowed
   // for any diff >= 1e9. Size for the worst case and let snprintf enforce
   // the bound anyway.
   char s[512];
   // response is handled separately, what ID?
   int len = snprintf( s, sizeof s, "{\"id\": 1, \"method\": \"mining.suggest_difficulty\", \"params\": [\"%f\"]}", diff );
   if ( len < 0 || (size_t)len >= sizeof s )
   {
      applog( LOG_WARNING, "stratum.suggest_difficulty request too long" );
      return false;
   }
   if ( !stratum_send_line( sctx, s ) )
   {
      applog(LOG_WARNING,"stratum.suggest_difficulty send failed");
      return false;
   }
   return true;
 }
 /**
 * Extract block height     L H... here len=3, height=0x1333e8
 * "...0000000000ffffffff2703e83313062f503253482f043d61105408"
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -16,8 +16,8 @@ export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
 export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
 # used by GCC
 export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
-# support for Windows CPU groups, AES sometimes not included in -march
+# Support for Windows 7 CPU groups, AES sometimes not included in -march
-export DEFAULT_CFLAGS="-O3 -maes -Wall -D_WIN32_WINNT=0x0601"
+export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
 export DEFAULT_CFLAGS_OLD="-O3 -Wall"
 # make link to local gmp header file.
@@ -26,8 +26,8 @@ ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
 # make release directory and copy selected DLLs.
 rm -rf release > /dev/null
 mkdir release
 cp README.txt release/
 cp README.md release/
 cp RELEASE_NOTES release/
Author	SHA1	Message	Date
Jay D Dee	db76d3865f	v3.19.7	2022-04-02 12:44:57 -04:00
Jay D Dee	5b678d2481	v3.19.6	2022-02-21 23:14:24 -05:00
Jay D Dee	90137b391e	v3.19.5	2022-01-30 20:59:54 -05:00
Jay D Dee	8727d79182	v3.19.4	2022-01-12 21:08:25 -05:00