v3.19.0

2026-07-15 03:16:49 +00:00 · 2021-11-10 21:33:44 -05:00
parent 1a234cbe53
commit e6fd9b1d69
13 changed files with 1198 additions and 829 deletions
--- a/79
+++ b/79
@@ -40,7 +40,7 @@ $ mkdir $HOME/usr/lib
   version available in the repositories.

 Download the following source code packages from their respective and
-respected download locations, copy them to ~/usr/lib/ and uncompress them. 
+respected download locations, copy them to $HOME/usr/lib/ and uncompress them. 

 openssl: https://github.com/openssl/openssl/releases

@@ -149,85 +149,10 @@ Copy cpuminer.exe to the release directory, compress and copy the release direct

 Run cpuminer

-In a command windows change directories to the unzipped release folder. to get a list of all options:
+In a command windows change directories to the unzipped release folder. To get a list of all options:

 cpuminer.exe --help

 Command options are specific to where you mine. Refer to the pool's instructions on how to set them.


-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Create a link to the locally compiled version of gmp.h
-
-$ ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h
-
-Edit configure.ac to fix lipthread package name.
-
-sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
-
-
-7. Compile
-
-you can use the default compile if you intend to use cpuminer-opt on the
-same CPU and the virtual machine supports that architecture.
-
-./build.sh
-
-Otherwise you can compile manually while setting options in CFLAGS.
-
-Some common options:
-
-To compile for a specific CPU architecture:
-
-CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
-
-This will compile for AMD Ryzen.
-
-You can compile more generically for a set of specific CPU features
-if you know what features you want:
-
-CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
-
-This will compile for an older CPU that does not have AVX.
-
-You can find several examples in build-allarch.sh
-
-If you have a CPU with more than 64 threads and Windows 7 or higher you
-can enable the CPU Groups feature:
-
-D_WIN32_WINNT==0x0601
-
-Once you have run configure successfully run make with n CPU threads:
-
-make -j n
-
-Copy cpuminer.exe to the release directory, compress and copy the release
-directory to a Windows system and run cpuminer.exe from the command line.
-
-Run cpuminer
-
-In a command windows change directories to the unzipped release folder.
-to get a list of all options:
-
-cpuminer.exe --help
-
-Command options are specific to where you mine. Refer to the pool's
-instructions on how to set them.
--- a/17
+++ b/17
@@ -65,7 +65,22 @@ If not what makes it happen or not happen?
 Change Log
 ----------

-v3.8.2
+v3.19.0
+
+Windows binaries now built with support for CPU groups, requires Windows 7.
+
+Changes to cpu-affinity:
+  - PR#346: Fixed incorrect CPU affinity on Windows built for CPU groups,
+  - added support for CPU affinity for up to 256 threads or CPUs,
+  - streamlined code for more efficient initialization of miner threads,
+  - precise affining of each miner thread to a specific CPU,
+  - added an option to disable CPU affinity with "--cpu-affinity 0"
+
+Faster sha256t with AVX512 & AVX2.
+
+Added stratum error count to stats log, reported only when non-zero.
+
+v3.18.2

 Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.

--- a/algo/sha/sha-hash-4way.h
+++ b/algo/sha/sha-hash-4way.h
@@ -62,6 +62,12 @@ void sha256_4way_transform_le( __m128i *state_out,  const __m128i *data,
                            const __m128i *state_in );
 void sha256_4way_transform_be( __m128i *state_out,  const __m128i *data,
                            const __m128i *state_in );
+void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
+                                   const __m128i *W, const __m128i *state_in );
+void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
+        const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
+int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
+                                     const __m128i *state_in );

 #endif  // SSE2

@@ -84,10 +90,12 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
 void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
                               const __m256i *state_in );

-void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W,
-                             const __m256i *state_in );
+void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
+                                 const __m256i *W, const __m256i *state_in );
 void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
-                          const __m256i *state_in, const __m256i *state_mid );
+        const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
+int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
+                                     const __m256i *state_in );

 #endif  // AVX2

@@ -109,10 +117,13 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
                             const __m512i *state_in );
 void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
                             const __m512i *state_in );
-void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
-                             const __m512i *state_in );
+void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
+                                  const __m512i *W, const __m512i *state_in );
 void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
-                          const __m512i *state_in, const __m512i *state_mid );
+        const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
+
+int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
+                                     const __m512i *state_in );

 #endif // AVX512

--- a/algo/sha/sha2.c
+++ b/algo/sha/sha2.c
@@ -611,8 +611,8 @@ static inline int scanhash_sha256d_8way_pooler( struct work *work,

 #endif /* HAVE_SHA256_8WAY */

-int scanhash_sha256d_pooler( struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
+int scanhash_sha256d_pooler( struct work *work,	uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -626,11 +626,11 @@ int scanhash_sha256d_pooler( struct work *work,
   int thr_id = mythr->id;  // thr_id arg is deprecated

 #ifdef HAVE_SHA256_8WAY
-	if (sha256_use_8way())
+	if ( sha256_use_8way() )
 		return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
 #endif
 #ifdef HAVE_SHA256_4WAY
-	if (sha256_use_4way())
+	if ( sha256_use_4way() )
 		return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
 #endif
 	
@@ -695,8 +695,11 @@ bool register_sha256d_algo( algo_gate_t* gate )
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
 #if defined(SHA256D_16WAY)
   gate->scanhash = (void*)&scanhash_sha256d_16way;
+//#elif defined(SHA256D_8WAY)
+//   gate->scanhash = (void*)&scanhash_sha256d_8way;
 #else
   gate->scanhash = (void*)&scanhash_sha256d_pooler;
+//   gate->scanhash = (void*)&scanhash_sha256d_4way;
 #endif
   //   gate->hash     = (void*)&sha256d;
   return true;
--- a/algo/sha/sha256-hash-4way.c
+++ b/algo/sha/sha256-hash-4way.c
--- a/algo/sha/sha256d-4way.c
+++ b/algo/sha/sha256d-4way.c
@@ -10,13 +10,14 @@
 int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
+   __m512i  vdata[32]    __attribute__ ((aligned (128)));
   __m512i  block[16]    __attribute__ ((aligned (64)));
-   __m512i  hash32[8]    __attribute__ ((aligned (32)));
-   __m512i  initstate[8] __attribute__ ((aligned (32)));
-   __m512i  midstate1[8] __attribute__ ((aligned (32)));
-   __m512i  midstate2[8] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   __m512i  vdata[20]    __attribute__ ((aligned (32)));
+   __m512i  hash32[8]    __attribute__ ((aligned (64)));
+   __m512i  initstate[8] __attribute__ ((aligned (64)));
+   __m512i  midstate1[8] __attribute__ ((aligned (64)));
+   __m512i  midstate2[8] __attribute__ ((aligned (64)));
+   __m512i  mexp_pre[16] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
@@ -36,6 +37,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

+   vdata[16+4] = last_byte;
+   memset_zero_512( vdata+16 + 5, 10 );
+   vdata[16+15] = m512_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_512( block + 9, 6 );
+   block[15] = m512_const1_32( 32*8 ); // bit count
+   
   // initialize state
   initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
   initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
@@ -49,24 +58,17 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
   sha256_16way_transform_le( midstate1, vdata, initstate );

   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
+   sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );

   do
   {
      // 1. final 16 bytes of data, with padding
-      memcpy_512( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_512( block + 5, 10 );
-      block[15] = m512_const1_32( 80*8 ); // bit count
-      sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
+      sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                 mexp_pre );

      // 2. 32 byte hash from 1.
-      memcpy_512( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_512( block + 9, 6 );
-      block[15] = m512_const1_32( 32*8 ); // bit count
-      sha256_16way_transform_le( hash32, block, initstate );
-
+      if ( sha256_16way_transform_le_short( hash32, block, initstate ) )
+      {
         // byte swap final hash for testing
         mm512_block_bswap_32( hash32, hash32 );

@@ -80,6 +82,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
               submit_solution( work, lane_hash, mythr );
            }
         }
+      }
      *noncev = _mm512_add_epi32( *noncev, sixteen );
      n += 16;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
@@ -95,13 +98,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
 int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
-   __m256i  block[16]    __attribute__ ((aligned (64)));
+   __m256i  vdata[32]    __attribute__ ((aligned (64)));
+   __m256i  block[16]    __attribute__ ((aligned (32)));
   __m256i  hash32[8]    __attribute__ ((aligned (32)));
   __m256i  initstate[8] __attribute__ ((aligned (32)));
   __m256i  midstate1[8] __attribute__ ((aligned (32)));
   __m256i  midstate2[8] __attribute__ ((aligned (32)));
+   __m256i  mexp_pre[16] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   __m256i  vdata[20]    __attribute__ ((aligned (32)));
   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
@@ -120,6 +124,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,

   *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

+   vdata[16+4] = last_byte;
+   memset_zero_256( vdata+16 + 5, 10 );
+   vdata[16+15] = m256_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_256( block + 9, 6 );
+   block[15] = m256_const1_32( 32*8 ); // bit count
+   
   // initialize state
   initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
   initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
@@ -133,24 +145,18 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
   sha256_8way_transform_le( midstate1, vdata, initstate );
   
   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
+   sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );

   do
   {
      // 1. final 16 bytes of data, with padding
-      memcpy_256( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_256( block + 5, 10 );
-      block[15] = m256_const1_32( 80*8 ); // bit count
-      sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
+      sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                mexp_pre );

      // 2. 32 byte hash from 1.
-      memcpy_256( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_256( block + 9, 6 );
-      block[15] = m256_const1_32( 32*8 ); // bit count
-      sha256_8way_transform_le( hash32, block, initstate );
-
+      if ( unlikely(
+               sha256_8way_transform_le_short( hash32, block, initstate ) ) )
+      {
         // byte swap final hash for testing
         mm256_block_bswap_32( hash32, hash32 );

@@ -164,6 +170,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
               submit_solution( work, lane_hash, mythr );
            }
         }
+       }
       *noncev = _mm256_add_epi32( *noncev, eight );
       n += 8;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
@@ -179,12 +186,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
 int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
-   __m128i  block[16]    __attribute__ ((aligned (64)));
+   __m128i  vdata[32]     __attribute__ ((aligned (64)));
+   __m128i  block[16]     __attribute__ ((aligned (32)));
   __m128i  hash32[8]     __attribute__ ((aligned (32)));
   __m128i  initstate[8]  __attribute__ ((aligned (32)));
-   __m128i  midstate[8]  __attribute__ ((aligned (32)));
+   __m128i  midstate1[8]   __attribute__ ((aligned (32)));
+   __m128i  midstate2[8]  __attribute__ ((aligned (32)));
+   __m128i  mexp_pre[16]  __attribute__ ((aligned (32)));
   uint32_t lane_hash[8]  __attribute__ ((aligned (32)));
-   __m128i  vdata[20]    __attribute__ ((aligned (32)));
   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
@@ -203,6 +212,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,

   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );

+   vdata[16+4] = last_byte;
+   memset_zero_128( vdata+16 + 5, 10 );
+   vdata[16+15] = m128_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_128( block + 9, 6 );
+   block[15] = m128_const1_32( 32*8 ); // bit count
+
   // initialize state
   initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
   initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
@@ -214,24 +231,20 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
   initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );

   // hash first 64 bytes of data
-   sha256_4way_transform_le( midstate, vdata, initstate );
+   sha256_4way_transform_le( midstate1, vdata, initstate );
+   // Do 3 rounds on the first 12 bytes of the next block
+   sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );

   do
   {
      // 1. final 16 bytes of data, with padding
-      memcpy_128( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_128( block + 5, 10 );
-      block[15] = m128_const1_32( 80*8 ); // bit count
-      sha256_4way_transform_le( hash32, block, midstate );
+      sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                mexp_pre );

      // 2. 32 byte hash from 1.
-      memcpy_128( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_128( block + 9, 6 );
-      block[15] = m128_const1_32( 32*8 ); // bit count
-      sha256_4way_transform_le( hash32, block, initstate );
-
+      if ( unlikely(
+              sha256_4way_transform_le_short( hash32, block, initstate ) ) )
+      {
         // byte swap final hash for testing
         mm128_block_bswap_32( hash32, hash32 );

@@ -245,6 +258,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
               submit_solution( work, lane_hash, mythr );
            }
         }
+      }
      *noncev = _mm_add_epi32( *noncev, four );
      n += 4;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
--- a/algo/sha/sha256d-4way.h
+++ b/algo/sha/sha256d-4way.h
@@ -6,12 +6,10 @@

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SHA256D_16WAY 1
-/*
 #elif defined(__AVX2__)
  #define SHA256D_8WAY 1
 #else
  #define SHA256D_4WAY 1
-*/
 #endif

 bool register_sha256d_algo( algo_gate_t* gate );
@@ -21,7 +19,7 @@ bool register_sha256d_algo( algo_gate_t* gate );
 int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
 #endif
-/*
+
 #if defined(SHA256D_8WAY)

 int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
@@ -33,7 +31,7 @@ int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
 int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
 #endif
-*/
+

 /*
 #if defined(__SHA__)
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -10,13 +10,14 @@
 int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
+   __m512i  vdata[32]    __attribute__ ((aligned (128)));
   __m512i  block[16]    __attribute__ ((aligned (64)));
-   __m512i  hash32[8]    __attribute__ ((aligned (32)));
-   __m512i  initstate[8] __attribute__ ((aligned (32)));
-   __m512i  midstate1[8] __attribute__ ((aligned (32)));
-   __m512i  midstate2[8] __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   __m512i  vdata[20]    __attribute__ ((aligned (32)));
+   __m512i  hash32[8]    __attribute__ ((aligned (64)));
+   __m512i  initstate[8] __attribute__ ((aligned (64)));
+   __m512i  midstate1[8] __attribute__ ((aligned (64)));
+   __m512i  midstate2[8] __attribute__ ((aligned (64)));
+   __m512i  mexp_pre[16] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
@@ -36,7 +37,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

-   // initialize state
+   vdata[16+4] = last_byte;
+   memset_zero_512( vdata+16 + 5, 10 );
+   vdata[16+15] = m512_const1_32( 80*8 ); // bit count
+   
+   block[ 8] = last_byte;
+   memset_zero_512( block + 9, 6 );
+   block[15] = m512_const1_32( 32*8 ); // bit count
+   
   initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
   initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
   initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
@@ -49,33 +57,26 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
   sha256_16way_transform_le( midstate1, vdata, initstate );
   
   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
+   sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );

   do
   {
-      // 1. final 16 bytes of data, with padding
-      memcpy_512( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_512( block + 5, 10 );  
-      block[15] = m512_const1_32( 80*8 ); // bit count
-      sha256_16way_final_rounds( hash32, block, midstate1, midstate2 );
+      // 1. final 16 bytes of data, pre-padded
+      sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                 mexp_pre );

      // 2. 32 byte hash from 1.
-      memcpy_512( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_512( block + 9, 6 );
-      block[15] = m512_const1_32( 32*8 ); // bit count
-      sha256_16way_transform_le( hash32, block, initstate );
+      sha256_16way_transform_le( block, block, initstate );

      // 3. 32 byte hash from 2.
-      memcpy_512( block, hash32, 8 );
-      sha256_16way_transform_le( hash32, block, initstate );
-
+      if ( unlikely(
+               sha256_16way_transform_le_short( hash32, block, initstate ) ) )
+      {
         // byte swap final hash for testing
         mm512_block_bswap_32( hash32, hash32 );    

         for ( int lane = 0; lane < 16; lane++ )
-      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+         if ( hash32_d7[ lane ] <= targ32_d7 )
         {
            extr_lane_16x32( lane_hash, hash32, lane, 256 );
            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
@@ -84,6 +85,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
               submit_solution( work, lane_hash, mythr );
            }
         }
+      }
      *noncev = _mm512_add_epi32( *noncev, sixteen );
      n += 16;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
@@ -100,13 +102,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
 int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
-   __m256i  block[16]    __attribute__ ((aligned (64)));
+   __m256i  vdata[32]    __attribute__ ((aligned (64)));
+   __m256i  block[16]    __attribute__ ((aligned (32)));
   __m256i  hash32[8]    __attribute__ ((aligned (32)));
   __m256i  initstate[8] __attribute__ ((aligned (32)));
   __m256i  midstate1[8] __attribute__ ((aligned (32)));
   __m256i  midstate2[8] __attribute__ ((aligned (32)));
+   __m256i  mexp_pre[16] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   __m256i  vdata[20]    __attribute__ ((aligned (32)));
   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
@@ -125,6 +128,14 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,

   *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

+   vdata[16+4] = last_byte;
+   memset_zero_256( vdata+16 + 5, 10 );
+   vdata[16+15] = m256_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_256( block + 9, 6 );
+   block[15] = m256_const1_32( 32*8 ); // bit count
+   
   // initialize state
   initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
   initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
@@ -138,33 +149,26 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
   sha256_8way_transform_le( midstate1, vdata, initstate );

   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 );
+   sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
   
   do
   {
      // 1. final 16 bytes of data, with padding
-      memcpy_256( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_256( block + 5, 10 );
-      block[15] = m256_const1_32( 80*8 ); // bit count
-      sha256_8way_final_rounds( hash32, block, midstate1, midstate2 );
+      sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                mexp_pre );

      // 2. 32 byte hash from 1.
-      memcpy_256( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_256( block + 9, 6 );
-      block[15] = m256_const1_32( 32*8 ); // bit count
-      sha256_8way_transform_le( hash32, block, initstate );
+      sha256_8way_transform_le( block, block, initstate );

      // 3. 32 byte hash from 2.
-      memcpy_256( block, hash32, 8 );
-      sha256_8way_transform_le( hash32, block, initstate );
-
+      if ( unlikely(
+               sha256_8way_transform_le_short( hash32, block, initstate ) ) )
+      {
         // byte swap final hash for testing
         mm256_block_bswap_32( hash32, hash32 );

         for ( int lane = 0; lane < 8; lane++ )
-      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+         if ( hash32_d7[ lane ] <= targ32_d7 )
         {
            extr_lane_8x32( lane_hash, hash32, lane, 256 );
            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
@@ -173,6 +177,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
               submit_solution( work, lane_hash, mythr );
            }
         }
+      }
      *noncev = _mm256_add_epi32( *noncev, eight );
      n += 8;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
@@ -183,17 +188,23 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,

 #endif

+
 #if defined(SHA256T_4WAY)

+// Optimizations are slower with AVX/SSE2
+// https://github.com/JayDDee/cpuminer-opt/issues/344
+/*
 int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
-   __m128i  block[16]    __attribute__ ((aligned (64)));
+   __m128i  vdata[32]     __attribute__ ((aligned (64)));
+   __m128i  block[16]     __attribute__ ((aligned (32)));
   __m128i  hash32[8]     __attribute__ ((aligned (32)));
   __m128i  initstate[8]  __attribute__ ((aligned (32)));
-   __m128i  midstate[8]  __attribute__ ((aligned (32)));
+   __m128i  midstate1[8]  __attribute__ ((aligned (32)));
+   __m128i  midstate2[8]  __attribute__ ((aligned (32)));
+   __m128i  mexp_pre[16]  __attribute__ ((aligned (32)));
   uint32_t lane_hash[8]  __attribute__ ((aligned (32)));
-   __m128i  vdata[20]    __attribute__ ((aligned (32)));
   uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
@@ -212,6 +223,101 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,

   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );

+   vdata[16+4] = last_byte;
+   memset_zero_128( vdata+16 + 5, 10 );
+   vdata[16+15] = m128_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_128( block + 9, 6 );
+   block[15] = m128_const1_32( 32*8 ); // bit count
+   
+   // initialize state
+   initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
+   initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
+   initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
+   initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
+   initstate[4] = m128_const1_64( 0x510E527F510E527F );
+   initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
+   initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
+   initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
+
+   // hash first 64 bytes of data
+   sha256_4way_transform_le( midstate1, vdata, initstate );
+
+   // Do 3 rounds on the first 12 bytes of the next block
+   sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
+
+   do
+   {
+      // 1. final 16 bytes of data, with padding
+      sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
+                                mexp_pre );
+
+      // 2. 32 byte hash from 1.
+      sha256_4way_transform_le( block, block, initstate );
+
+      // 3. 32 byte hash from 2.
+      if ( unlikely(
+              sha256_4way_transform_le_short( hash32, block, initstate ) ) )
+      {   
+         // byte swap final hash for testing
+         mm128_block_bswap_32( hash32, hash32 );
+
+         for ( int lane = 0; lane < 4; lane++ )
+         if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+         {
+            extr_lane_4x32( lane_hash, hash32, lane, 256 );
+            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+            {
+               pdata[19] = n + lane;
+               submit_solution( work, lane_hash, mythr );
+            }
+         }
+      }
+      *noncev = _mm_add_epi32( *noncev, four );
+      n += 4;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+*/
+
+int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{
+   __m128i  vdata[32]    __attribute__ ((aligned (64)));
+   __m128i  block[16]    __attribute__ ((aligned (32)));
+   __m128i  hash32[8]    __attribute__ ((aligned (32)));
+   __m128i  initstate[8] __attribute__ ((aligned (32)));
+   __m128i  midstate[8]  __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t targ32_d7 = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
+   uint32_t n = first_nonce;
+   __m128i *noncev = vdata + 19;
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m128i last_byte = m128_const1_32( 0x80000000 );
+   const __m128i four = m128_const1_32( 4 );
+
+   for ( int i = 0; i < 19; i++ )
+       vdata[i] = m128_const1_32( pdata[i] );
+
+   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
+
+   vdata[16+4] = last_byte;
+   memset_zero_128( vdata+16 + 5, 10 );
+   vdata[16+15] = m128_const1_32( 80*8 ); // bit count
+
+   block[ 8] = last_byte;
+   memset_zero_128( block + 9, 6 );
+   block[15] = m128_const1_32( 32*8 ); // bit count
+   
   // initialize state
   initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
   initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
@@ -227,25 +333,9 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,

   do
   {
-      // 1. final 16 bytes of data, with padding
-      memcpy_128( block, vdata + 16, 4 );
-      block[ 4] = last_byte;
-      memset_zero_128( block + 5, 10 );
-      block[15] = m128_const1_32( 80*8 ); // bit count
-      sha256_4way_transform_le( hash32, block, midstate );
-
-      // 2. 32 byte hash from 1.
-      memcpy_128( block, hash32, 8 );
-      block[ 8] = last_byte;
-      memset_zero_128( block + 9, 6 );
-      block[15] = m128_const1_32( 32*8 ); // bit count
+      sha256_4way_transform_le( block,  vdata+16, midstate  );
+      sha256_4way_transform_le( block,  block,    initstate );
      sha256_4way_transform_le( hash32, block,    initstate );
-
-      // 3. 32 byte hash from 2.
-      memcpy_128( block, hash32, 8 );
-      sha256_4way_transform_le( hash32, block, initstate );
-
-      // byte swap final hash for testing
      mm128_block_bswap_32( hash32, hash32 );

      for ( int lane = 0; lane < 4; lane++ )
@@ -266,5 +356,6 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
   return 0;
 }

+
 #endif

--- a/compat.h
+++ b/compat.h
@@ -3,6 +3,10 @@

 #ifdef WIN32

+#if _WIN32_WINNT==0x0601    // Windows 7
+ #define WINDOWS_CPU_GROUPS_ENABLED 1
+#endif
+
 #include <windows.h>
 #include <time.h>

--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.2.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.3.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.18.2'
-PACKAGE_STRING='cpuminer-opt 3.18.2'
+PACKAGE_VERSION='3.18.3'
+PACKAGE_STRING='cpuminer-opt 3.18.3'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.18.2 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.18.3 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.18.2:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.18.3:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.18.2
+cpuminer-opt configure 3.18.3
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.18.2, which was
+It was created by cpuminer-opt $as_me 3.18.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.18.2'
+ VERSION='3.18.3'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.18.2, which was
+This file was extended by cpuminer-opt $as_me 3.18.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.18.2
+cpuminer-opt config.status 3.18.3
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.18.2])
+AC_INIT([cpuminer-opt], [3.19.0])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -3,7 +3,7 @@
 * Copyright 2012-2014 pooler
 * Copyright 2014 Lucas Jones
 * Copyright 2014-2016 Tanguy Pruvot
- * Copyright 2016-2020 Jay D Dee
+ * Copyright 2016-2021 Jay D Dee
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
@@ -115,22 +115,12 @@ int opt_param_n = 0;
 int opt_param_r = 0;
 int opt_n_threads = 0;
 bool opt_sapling = false;
-
-// Windows doesn't support 128 bit affinity mask.
-// Need compile time and run time test.
-#if defined(__linux) && defined(GCC_INT128)  
-#define AFFINITY_USES_UINT128 1
-static uint128_t opt_affinity = -1;
-static bool affinity_uses_uint128 = true;
-#else
-static uint64_t opt_affinity = -1;
-static bool affinity_uses_uint128 = false;
-#endif
-
+static uint64_t opt_affinity = 0xFFFFFFFFFFFFFFFFULL;  // default, use all cores
 int opt_priority = 0;  // deprecated
 int num_cpus = 1;
-int num_cpugroups = 1;
-char *rpc_url = NULL;;
+int num_cpugroups = 1;  // For Windows
+#define max_cpus 256   // max for affinity
+char *rpc_url = NULL;
 char *rpc_userpass = NULL;
 char *rpc_user, *rpc_pass;
 char *short_url = NULL;
@@ -166,6 +156,7 @@ uint32_t accepted_share_count = 0;
 uint32_t rejected_share_count = 0;
 uint32_t stale_share_count = 0;
 uint32_t solved_block_count = 0;
+uint32_t stratum_errors = 0;
 double *thr_hashrates;
 double global_hashrate = 0.;
 double total_hashes = 0.;
@@ -227,18 +218,21 @@ char*  lp_id;

 static void   workio_cmd_free(struct workio_cmd *wc);

-static void format_affinity_map( char *map_str, uint64_t map )
+// array mapping thread to cpu
+static uint8_t thread_affinity_map[ max_cpus ];
+
+// display affinity mask graphically
+static void format_affinity_mask( char *mask_str, uint64_t mask )
 {
   int n = num_cpus < 64 ? num_cpus : 64;
   int i;
-
   for ( i = 0; i < n; i++ )
   {
-      if ( map & 1 )  map_str[i] = '!';
-      else            map_str[i] = '.';
-      map >>= 1;
+      if ( mask & 1 )  mask_str[i] = '!';
+      else             mask_str[i] = '.';
+      mask >>= 1;
   }
-   memset( &map_str[i], 0, 64 - i );
+   memset( &mask_str[i], 0, 64 - i );
 }

 #ifdef __linux /* Linux specific policy and affinity management */
@@ -260,93 +254,70 @@ static inline void drop_policy(void)
 #define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */
 #endif

-// Linux affinity can use int128.
-#if AFFINITY_USES_UINT128
-static void affine_to_cpu_mask( int id, uint128_t mask )
-#else
-static void affine_to_cpu_mask( int id, uint64_t mask )
-#endif
+static void affine_to_cpu( struct thr_info *thr )
 {
+   int thread = thr->id;
   cpu_set_t set;
   CPU_ZERO( &set );
-   uint8_t ncpus = (num_cpus > 256) ? 256 : num_cpus;       
-
-   for ( uint8_t i = 0; i < ncpus; i++ ) 
-   {
-      // cpu mask
-#if AFFINITY_USES_UINT128
-      if( ( mask & ( (uint128_t)1 << i ) ) )  CPU_SET( i, &set );
-#else
-      if( (ncpus > 64) || ( mask & (1 << i) ) )  CPU_SET( i, &set );
-#endif
-   }
-   if ( id == -1 )
-   {
-      // process affinity
-      sched_setaffinity(0, sizeof(&set), &set);
-   }
-   else
-   {
-      // thread only
-      pthread_setaffinity_np(thr_info[id].pth, sizeof(&set), &set);
-   }
+   CPU_SET( thread_affinity_map[ thread ], &set );
+   if ( opt_debug )
+      applog( LOG_INFO, "Binding thread %d to cpu %d",
+                        thread, thread_affinity_map[ thread ] );
+   pthread_setaffinity_np( thr->pth, sizeof(set), &set );
 }

 #elif defined(WIN32) /* Windows */
+
 static inline void drop_policy(void) { }

 // Windows CPU groups to manage more than 64 CPUs.
-static void affine_to_cpu_mask( int id, uint64_t mask )
+// mask arg is ignored
+static void affine_to_cpu( struct thr_info *thr )
 {
-   bool success;
+   int thread = thr->id;
   unsigned long last_error;    
-//   BOOL success;
-//   DWORD last_error;
+   bool ok;

-   if ( id == -1 )
-      success = SetProcessAffinityMask( GetCurrentProcess(), mask );
-
-// Are Windows CPU Groups supported?
-#if _WIN32_WINNT==0x0601
-   else if ( num_cpugroups == 1 )
-	   success = SetThreadAffinityMask( GetCurrentThread(), mask );
-   else
-   {
-	   // Find the correct cpu group
-	   int cpu = id % num_cpus;
-	   int group;
-	   for( group = 0; group < num_cpugroups; group++ )
-	   {
-	      int cpus = GetActiveProcessorCount( group );
- 	      if ( cpu < cpus )  break;
-  	      cpu -= cpus;
-      }
-
-	   if (opt_debug)
-         applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)",
-               id, cpu, group, (1ULL << cpu));
+#if defined(WINDOWS_CPU_GROUPS_ENABLED)
+   unsigned long group_size = GetActiveProcessorCount( 0 );
+   unsigned long group      = thread / group_size;
+   unsigned long cpu        = thread_affinity_map[ thread % group_size ];

   GROUP_AFFINITY affinity;
   affinity.Group = group;
   affinity.Mask = 1ULL << cpu;
-	   success = SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL );
-   }
+
+   if ( opt_debug )
+      applog( LOG_INFO, "Binding thread %d to cpu %d in cpu group %d",
+                        thread, cpu, group );
+
+   ok = SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL );
+
 #else
-   else 
-      success = SetThreadAffinityMask( GetCurrentThread(), mask );
+
+   unsigned long cpu = thread_affinity_map[ thread ];
+   uint64_t mask = 1ULL << cpu;
+
+   if ( opt_debug )
+      applog( LOG_INFO, "Binding thread %d to cpu %d", thread, cpu );
+
+   ok = SetThreadAffinityMask( GetCurrentThread(), mask );
+
 #endif

-   if (!success)
+   if ( !ok )
   {
      last_error = GetLastError();
-	   applog(LOG_WARNING, "affine_to_cpu_mask for %u returned %x",
-               id, last_error);
+      applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x",
+                           thread, last_error );
   }
 }   

 #else
+
 static inline void drop_policy(void) { }
-static void affine_to_cpu_mask(int id, unsigned long mask) { }
+static void affine_to_cpu( struct thr_info *thr ) { }
+
 #endif

 // not very useful, just index the arrray directly.
@@ -1159,17 +1130,23 @@ void report_summary_log( bool force )
      applog2( prio, "Blocks Solved   %7d      %7d",
               solved, solved_block_count );
   }
+   if ( stratum_errors )
+      applog2( LOG_INFO, "Stratum errors               %7d", stratum_errors );
+
   applog2( LOG_INFO, "Hi/Lo Share Diff  %.5g /  %.5g",
            highest_share, lowest_share );

   int mismatch = submitted_share_count
         - ( accepted_share_count + stale_share_count + rejected_share_count );
+
   if ( mismatch )
   {
-      if ( mismatch != 1 )
-         applog2(LOG_MINR, "Count mismatch: %d, stats may be inaccurate", mismatch );
-      else
-         applog2(LOG_INFO, CL_LBL "Count mismatch, submitted share may still be pending" CL_N );
+      if ( stratum_errors )
+         applog2( LOG_MINR, "Count mismatch: %d, stats may be inaccurate",
+                            mismatch );
+      else if ( !opt_quiet )
+         applog2( LOG_INFO, CL_LBL
+                  "Count mismatch, submitted share may still be pending" CL_N );
   }
 }

@@ -2241,49 +2218,9 @@ static void *miner_thread( void *userdata )
 	   if ( opt_priority == 0 )
 	      drop_policy();
   }
+
   // CPU thread affinity
-   if ( num_cpus > 1 )
-   {
-#if AFFINITY_USES_UINT128
-      // Default affinity
-      if ( (opt_affinity == (uint128_t)(-1) ) && opt_n_threads > 1 )
-      {  
-         affine_to_cpu_mask( thr_id, (uint128_t)1 << (thr_id % num_cpus) );
-         if ( opt_debug )
-            applog( LOG_INFO, "Binding thread %d to cpu %d.",
-                    thr_id, thr_id % num_cpus,
-	                 u128_hi64( (uint128_t)1 << (thr_id % num_cpus) ),
-		              u128_lo64( (uint128_t)1 << (thr_id % num_cpus) ) );
-      }
-#else
-      if ( ( opt_affinity == -1 ) && ( opt_n_threads > 1 ) ) 
-      {
-         affine_to_cpu_mask( thr_id, 1 << (thr_id % num_cpus) );
-         if (opt_debug)
-            applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
-                thr_id, thr_id % num_cpus, 1 << (thr_id % num_cpus)) ;
-      }
-#endif
-      else   // Custom affinity
-      {
-         affine_to_cpu_mask( thr_id, opt_affinity );
-         if ( opt_debug )
-         {
-#if AFFINITY_USES_UINT128
-            if ( num_cpus > 64 )
-               applog( LOG_INFO, "Binding thread %d to mask %016llx %016llx",
-                                thr_id, u128_hi64( opt_affinity ), 
-                                        u128_lo64( opt_affinity ) );
-            else
-               applog( LOG_INFO, "Binding thread %d to mask %016llx",
-                                 thr_id, opt_affinity );
-#else
-            applog( LOG_INFO, "Binding thread %d to mask %016llx",
-                                 thr_id, opt_affinity );
-#endif
-         }
-      }
-   }  // num_cpus > 1
+   if ( opt_affinity && num_cpus > 1 )   affine_to_cpu( mythr );

   if ( !algo_gate.miner_thread_init( thr_id ) )
   {
@@ -2792,6 +2729,7 @@ static void *stratum_thread(void *userdata )
      {
          stratum_need_reset = false;
          stratum_down = true;
+          stratum_errors++;
          stratum_disconnect( &stratum );
          if ( strcmp( stratum.url, rpc_url ) )
          {
@@ -2809,6 +2747,7 @@ static void *stratum_thread(void *userdata )
      while ( !stratum.curl )
      {
         stratum_down = true;
+         restart_threads();
         pthread_rwlock_wrlock( &g_work_lock );
         g_work_time = 0;
         pthread_rwlock_unlock( &g_work_lock );
@@ -2830,7 +2769,6 @@ static void *stratum_thread(void *userdata )
         else
         {
            stratum_down = false;
-            restart_threads();
            applog(LOG_BLUE,"Stratum connection established" );
         }
      }
@@ -3137,7 +3075,7 @@ void parse_arg(int key, char *arg )
 {
 	char *p;
 	int v, i;
-	uint64_t ul;
+//	uint64_t ul;
 	double d;

 	switch( key )
@@ -3448,20 +3386,9 @@ void parse_arg(int key, char *arg )
 		break;
 #endif
 	case 1020:  // cpu-affinity
-		p = strstr(arg, "0x");
-		if ( p )
-			ul = strtoull( p, NULL, 16 );
-		else
-			ul = atoll( arg );
-#if AFFINITY_USES_UINT128
-// replicate the low 64 bits to make a full 128 bit mask if there are more
-// than 64 CPUs, otherwise zero extend the upper half.
-         opt_affinity = (uint128_t)ul;
-         if ( num_cpus > 64 )
-            opt_affinity |= opt_affinity << 64;
-#else
-         opt_affinity = ul;
-#endif
+      p = strstr( arg, "0x" );
+      opt_affinity = p ? strtoull( p, NULL, 16 )
+                       : atoll( arg );
      break;
 	case 1021:  // cpu-priority
 		v = atoi(arg);
@@ -3569,14 +3496,12 @@ static void parse_cmdline(int argc, char *argv[])
 #else
      key = getopt(argc, argv, short_options);
 #endif
-	if (key < 0)
-		break;
-
-	parse_arg(key, optarg);
+      if ( key < 0 )   break;
+      parse_arg( key, optarg );
   }
-   if (optind < argc)
+   if ( optind < argc )
   {
-	fprintf(stderr, "%s: unsupported non-option argument -- '%s'\n",
+      fprintf( stderr, "%s: unsupported non-option argument -- '%s'\n",
 		                 argv[0], argv[optind]);
      show_usage_and_exit(1);
   }
@@ -3642,26 +3567,21 @@ int main(int argc, char *argv[])
 	rpc_user = strdup("");
 	rpc_pass = strdup("");

-   parse_cmdline(argc, argv);
-
 #if defined(WIN32)
-//	SYSTEM_INFO sysinfo;
-//	GetSystemInfo(&sysinfo);
-//	num_cpus = sysinfo.dwNumberOfProcessors;
-// What happens if GetActiveProcessorGroupCount called if groups not enabled?

 // Are Windows CPU Groups supported?
-#if _WIN32_WINNT==0x0601
+#if defined(WINDOWS_CPU_GROUPS_ENABLED)
 	num_cpus = 0;
 	num_cpugroups = GetActiveProcessorGroupCount();
 	for( i = 0; i < num_cpugroups; i++ )
 	{
- 	   int cpus = GetActiveProcessorCount(i);
+ 	   int cpus = GetActiveProcessorCount( i );
 	   num_cpus += cpus;

 	   if (opt_debug)
-         applog(LOG_DEBUG, "Found %d cpus on cpu group %d", cpus, i);
+         applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i );
 	}
+
 #else
   SYSTEM_INFO sysinfo;
   GetSystemInfo(&sysinfo);
@@ -3677,21 +3597,20 @@ int main(int argc, char *argv[])
 #else
 	num_cpus = 1;
 #endif
-	if (num_cpus < 1)
-		num_cpus = 1;

-   if (!opt_n_threads)
-      opt_n_threads = num_cpus;
+   if ( num_cpus < 1 )    num_cpus = 1;
+
+   parse_cmdline( argc, argv );

   if ( opt_algo == ALGO_NULL )
   {
-      fprintf(stderr, "%s: no algo supplied\n", argv[0]);
+      fprintf( stderr, "%s: No algo parameter specified\n", argv[0] );
      show_usage_and_exit(1);
   }

   // need to register to get algo optimizations for cpu capabilities
-   // but that causes register logs before cpu capabilities is output.
-   // Would need to split register into 2 parts. First part sets algo
+   // but that causes registration logs before cpu capabilities is output.
+   // Would need to split register function into 2 parts. First part sets algo
   // optimizations but no logging, second part does any logging.   
   if ( !register_algo_gate( opt_algo, &algo_gate ) )  exit(1);

@@ -3735,9 +3654,6 @@ int main(int argc, char *argv[])
         return 1;
 	}

-   // All options must be set before starting the gate
-//   if ( !register_algo_gate( opt_algo, &algo_gate ) )  exit(1);
-
   if ( coinbase_address )
   {
      pk_script_size = address_to_script( pk_script, pk_buffer_size,
@@ -3749,8 +3665,6 @@ int main(int argc, char *argv[])
      }
   }

-//   if ( !check_cpu_capability() ) exit(1);
-
 	pthread_mutex_init( &stats_lock, NULL );
   pthread_rwlock_init( &g_work_lock, NULL );
 	pthread_mutex_init( &stratum.sock_lock, NULL );
@@ -3820,42 +3734,29 @@ int main(int argc, char *argv[])
 	}
 #endif

-// To be confirmed with more than 64 cpus
-   if ( opt_affinity != -1 )
+   if ( ( opt_n_threads == 0 ) || ( opt_n_threads > num_cpus ) )
+      opt_n_threads = num_cpus;
+
+   if ( opt_affinity && num_cpus > max_cpus )
   {
-      if ( !affinity_uses_uint128 && num_cpus > 64 )
-      {
-          applog(LOG_WARNING,"Setting CPU affinity with more than 64 CPUs is only");
-          applog(LOG_WARNING,"available on Linux. Using default affinity.");
-          opt_affinity = -1;
-      }
-/*
-      else	
-      {
-         affine_to_cpu_mask( -1, opt_affinity );
-         if ( !opt_quiet )
-         {
-#if AFFINITY_USES_UINT128
-            if ( num_cpus > 64 )
-               applog(LOG_DEBUG, "Binding process to cpu mask %x",
-                      u128_hi64( opt_affinity ), u128_lo64( opt_affinity ) );
-            else 
-               applog(LOG_DEBUG, "Binding process to cpu mask %x",
-                      opt_affinity );
-#else
-               applog(LOG_DEBUG, "Binding process to cpu mask %x",
-                      opt_affinity );
-#endif
-         }
-      }
-*/
+      applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
+                            max_cpus );
+      opt_affinity = 0ULL;
   }
   
-   if ( !opt_quiet && ( opt_n_threads < num_cpus ) )
+   if ( opt_affinity )
   {
-      char affinity_map[64];
-      format_affinity_map( affinity_map, opt_affinity );
-      applog( LOG_INFO, "CPU affinity [%s]", affinity_map );
+      for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
+      {
+         while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++;   
+         thread_affinity_map[ thr ] = cpu % num_cpus;
+      }
+      if ( !opt_quiet )
+      {
+         char affinity_mask[64];
+         format_affinity_mask( affinity_mask, opt_affinity );
+         applog( LOG_INFO, "CPU affinity [%s]", affinity_mask );
+      }
   }
    
 #ifdef HAVE_SYSLOG_H
@@ -3955,7 +3856,7 @@ int main(int argc, char *argv[])
 			return 1;
 		}
      if ( !opt_quiet )
-         applog( LOG_INFO,"API listnening to %s:%d", opt_api_allow,
+         applog( LOG_INFO,"API listening to %s:%d", opt_api_allow,
                                                     opt_api_listen );
   }

--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -16,13 +16,13 @@ export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
 export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
 # used by GCC
 export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
+# support for Windows CPU groups
+export DEFAULT_CFLAGS="-O3 -Wall -D_WIN32_WINNT=0x0601"
+#export DEFAULT_CFLAGS="-O3 -Wall"

 # make link to local gmp header file.
 ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h

-# edit configure to fix pthread lib name for Windows.
-#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
-
 # make release directory and copy selected DLLs.

 rm -rf release > /dev/null
@@ -45,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
 ./clean-all.sh || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-O3 -march=icelake-client -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=icelake-client" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
@@ -53,8 +53,8 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
 # Rocketlake AVX512 SHA AES
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=cascadelake -msha -Wall" ./configure $CONFIGURE_ARGS
-#CFLAGS="-O3 -march=rocketlake -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=cascadelake -msha" ./configure $CONFIGURE_ARGS
+#CFLAGS="$DEFAULT_CFLAGS -march=rocketlake" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512-sha.exe
@@ -62,7 +62,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha.exe
 # Zen1 AVX2 AES SHA
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-zen.exe
@@ -70,8 +70,8 @@ mv cpuminer.exe release/cpuminer-zen.exe
 # Zen3 AVX2 SHA VAES
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=znver2 -mvaes -Wall" ./configure $CONFIGURE_ARGS
-# CFLAGS="-O3 -march=znver3 -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=znver2 -mvaes" ./configure $CONFIGURE_ARGS
+# CFLAGS="$DEFAULT_CFLAGS -march=znver3" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-zen3.exe
@@ -80,7 +80,7 @@ mv cpuminer.exe release/cpuminer-zen3.exe
 # mingw won't compile avx512 without -fno-asynchronous-unwind-tables
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS
 #CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-asynchronous-unwind-tables" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
@@ -90,7 +90,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
 make clean || echo clean
 rm -f config.status
 # GCC 9 doesn't include AES in -march=core-avx2
-CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=core-avx2 -maes" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2.exe
@@ -99,7 +99,7 @@ mv cpuminer.exe release/cpuminer-avx2.exe
 make clean || echo clean
 rm -f config.status
 # -march=corei7-avx still includes aes, but just in case
-CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure $CONFIGURE_ARGS 
+CFLAGS="$DEFAULT_CFLAGS -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS 
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx.exe
@@ -107,7 +107,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
 # Westmere SSE4.2 AES
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -march=westmere -maes -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -march=westmere -maes" ./configure $CONFIGURE_ARGS
 #CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
@@ -116,7 +116,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
 # Nehalem SSE4.2
 #make clean || echo clean
 #rm -f config.status
-#CFLAGS="-O3 -march=corei7 -Wall" ./configure $CONFIGURE_ARGS
+#CFLAGS="$DEFAULT_CFLAGS -march=corei7" ./configure $CONFIGURE_ARGS
 #make 
 #strip -s cpuminer.exe
 #mv cpuminer.exe release/cpuminer-sse42.exe
@@ -124,7 +124,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
 # Core2 SSSE3
 #make clean || echo clean
 #rm -f config.status
-#CFLAGS="-O3 -march=core2 -Wall" ./configure $CONFIGURE_ARGS
+#CFLAGS="$DEFAULT_CFLAGS -march=core2" ./configure $CONFIGURE_ARGS
 #make 
 #strip -s cpuminer.exe
 #mv cpuminer.exe release/cpuminer-ssse3.exe
@@ -133,7 +133,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
 # Generic SSE2
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS
+CFLAGS="$DEFAULT_CFLAGS -msse2" ./configure $CONFIGURE_ARGS
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-sse2.exe