v3.9.2.4

2025-09-17 23:44:27 +00:00 · 2019-06-07 23:30:38 -04:00
15 changed files with 170 additions and 126 deletions
--- a/4
+++ b/4
@@ -38,6 +38,10 @@ supported.
 Change Log
 ----------

+v3.9.2.4
+
+Yet another affinity fix. Hopefully the last one.
+
 v3.9.2.3

 Another cpu-affinity fix.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -363,6 +363,7 @@ void get_algo_alias( char** algo_or_alias )
 #undef ALIAS
 #undef PROPER

+// Only for parallel (multi-way) algos, where there are lanes.
 bool submit_solution( struct work *work, void *hash,
                      struct thr_info *thr, int lane )
 {
--- a/algo/fugue/sph_fugue.c
+++ b/algo/fugue/sph_fugue.c
@@ -11,6 +11,8 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif

+#define SPH_FUGUE_NOCOPY 1
+
 static const sph_u32 IV224[] = {
 	SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c),
 	SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215),
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -127,13 +127,7 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
        if ( fulltest( hash+(lane<<3), ptarget ) )
        {
           pdata[19] = n + lane;
-           work_set_target_ratio( work, hash+(lane<<3) );
-           if ( submit_work( mythr, work ) )
-               applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                             accepted_share_count + rejected_share_count + 1,
-                             thr_id, lane );
-           else
-               applog( LOG_WARNING, "Failed to submit share." );
+           submit_solution( work, hash+(lane<<3), mythr, lane );
         }
     }
     n += 4;
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -27,11 +27,15 @@
 // Convert algos that don't yet do so to use dynamic alllocation.
 // Alloc huge pages globally. If ok each thread will create a pointer to
 // its chunk. If fail each thread will use use _mm_alloc for itself. 
+// BLOCK_LEN_BYTES is 768.

 #define LYRA2REV3_NROWS 4
 #define LYRA2REV3_NCOLS 4
-//#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
-//                                                 (LYRA2REV3_NROWS)*8)
+/*
+#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
+                                                 (LYRA2REV3_NROWS)*8)
+*/
+
 #define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)<<4)

 __thread uint64_t* l2v3_wholeMatrix;
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -236,7 +236,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   //Tries to allocate enough space for the whole memory matrix

   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+//   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
 /*
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -103,13 +103,7 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
         if ( fulltest( lane_hash, ptarget ) )
         {
              pdata[19] = n + lane;    
-              work_set_target_ratio( work, lane_hash );
-              if ( submit_work( mythr, work ) )
-                applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-		             accepted_share_count + rejected_share_count + 1,
-			     thr_id, lane );
-              else
-                applog( LOG_WARNING, "Failed to submit share." );
+              submit_solution( work, lane_hash, mythr, lane );
 	 }
      }
      n += 4;
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -194,13 +194,7 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
          pdata[19] = n+i;         
-          work_set_target_ratio( work, hash+(i<<3) );
-          if ( submit_work( mythr, work ) )
-              applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                             accepted_share_count + rejected_share_count + 1,
-                             thr_id, i );
-          else
-              applog( LOG_WARNING, "Failed to submit share." );
+          submit_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;
   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
--- a/algo/lyra2/lyra2z330.c
+++ b/algo/lyra2/lyra2z330.c
@@ -18,38 +18,41 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
 int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr )
 {
-	uint32_t hash[8] __attribute__ ((aligned (64))); 
-	uint32_t endiandata[20] __attribute__ ((aligned (64)));
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-	uint32_t nonce = first_nonce;
+   uint32_t hash[8] __attribute__ ((aligned (64))); 
+   uint32_t endiandata[20] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t nonce = first_nonce;
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
-	if (opt_benchmark)
-		ptarget[7] = 0x0000ff;

-	for (int i=0; i < 19; i++) {
-		be32enc(&endiandata[i], pdata[i]);
-	}
+   if (opt_benchmark)
+	ptarget[7] = 0x0000ff;

-	do {
-		be32enc(&endiandata[19], nonce);
-		lyra2z330_hash( hash, endiandata, work->height );
+   for (int i=0; i < 19; i++)
+      be32enc(&endiandata[i], pdata[i]);
        
-		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
-			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-			return 1;
-		}
-		nonce++;
-
-	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-	pdata[19] = nonce;
-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
+   do
+   {
+      be32enc(&endiandata[19], nonce);
+      lyra2z330_hash( hash, endiandata, work->height );
+      if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
+      {
+         work_set_target_ratio(work, hash);
+         pdata[19] = nonce;
+         if ( submit_work( mythr, work ) )
+             applog( LOG_NOTICE, "Share %d submitted by thread %d",
+                     accepted_share_count + rejected_share_count + 1,
+                     mythr->id );
+         else
+             applog( LOG_WARNING, "Failed to submit share." );
+      }
+      nonce++;
+   } while (nonce < max_nonce && !work_restart[thr_id].restart);
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
 }

 void lyra2z330_set_target( struct work* work, double job_diff )
--- a/algo/sha/sha256_hash_11way.c
+++ b/algo/sha/sha256_hash_11way.c
@@ -208,6 +208,15 @@ void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
   Wy[15] =  mm64_bswap_32( iny[15] );
   Wz[15] =       bswap_32( inz[15] );

+   Ax = rx[0];     Ay = ry[0];     Az = rz[0];
+   Bx = rx[1];     By = ry[1];     Bz = rz[1];
+   Cx = rx[2];     Cy = ry[2];     Cz = rz[2];
+   Dx = rx[3];     Dy = ry[3];     Dz = rz[3];
+   Ex = rx[4];     Ey = ry[4];     Ez = rz[4];
+   Fx = rx[5];     Fy = ry[5];     Fz = rz[5];
+   Gx = rx[6];     Gy = ry[6];     Gz = rz[6];
+   Hx = rx[7];     Hy = ry[7];     Hz = rz[7];
+
   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
                     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
                     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  0, 0 );
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -85,11 +85,11 @@ int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
      do
      {
        *noncex = mm256_bswap_32(
-		 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
+         _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
        *noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
        *noncez = bswap_32( n+10 );

-       	pdata[19] = n;
+        pdata[19] = n;

        sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );

@@ -102,28 +102,29 @@ int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
            mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
            if ( fulltest( lane_hash, ptarget ) )
            {
-	       pdata[19] = n + i;
+	            pdata[19] = n + i;
               submit_solution( work, lane_hash, mythr, i );
            }
-	}
+        }

-	hash7 = &(hashy[7<<1]);
+        hash7 = &(hashy[7<<1]);
        for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
+ 
        {
            mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
-  	    if ( fulltest( lane_hash, ptarget ) )
-            {
+           if ( fulltest( lane_hash, ptarget ) )
+           {
               pdata[19] = n + 8 + i;
               submit_solution( work, lane_hash, mythr, i+8 );
-            }
-	 }
+           }
+	     }

-	 if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
-         {
+        if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
+        {
            pdata[19] = n+10;
            submit_solution( work, hashz, mythr, 10 );
-         }
-         n += 11;
+        }
+        n += 11;

      } while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
      break;
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -100,25 +100,67 @@
 #include <stdbool.h>

 // First some integer stuff that mirrors the SIMD utilities
+#define ror_64( x, c ) \
+      (uint64_t)( ( (uint64_t)(x) >> (c) ) | ( (uint64_t)(x) << (64-(c)) ) )
+#define rol_64( x, c ) \
+      (uint64_t)( ( (uint64_t)(x) << (c) ) | ( (uint64_t)(x) >> (64-(c)) ) )
+#define ror_32( x, c ) \
+      (uint32_t)( ( (uint32_t)(x) >> (c) ) | ( (uint32_t)(x) << (32-(c)) ) )
+#define rol_32( x, c ) \
+      (uint32_t)( ( (uint32_t)(x) << (c) ) | ( (uint32_t)(x) >> (32-(c)) ) )
+#define ror_16( x, c ) \
+      (uint16_t)( ( (uint16_t)(x) >> (c) ) | ( (uint16_t)(x) << (16-(c)) ) )
+#define rol_16( x, c ) \
+      (uint16_t)( ( (uint16_t)(x) << (c) ) | ( (uint16_t)(x) >> (16-(c)) ) )
+#define ror_8( x, c ) \
+      (uint8_t) ( ( (uint8_t) (x) >> (c) ) | ( (uint8_t) (x) << ( 8-(c)) ) )
+#define rol_8( x, c ) \
+      (uint8_t) ( ( (uint8_t) (x) << (c) ) | ( (uint8_t) (x) >> ( 8-(c)) ) )

-#define ror_64( x, c ) (((x)>>(c)) | ((x)<<(64-(c))))
-#define rol_64( x, c ) (((x)<<(c)) | ((x)>>(64-(c))))
-#define ror_32( x, c ) (((x)>>(c)) | ((x)<<(32-(c))))
-#define rol_32( x, c ) (((x)<<(c)) | ((x)>>(32-(c))))
-#define bswap_64( x )  __builtin_bswap64(x)
-#define bswap_32( x )  __builtin_bswap32(x)
+#define bswap_64( x )      __builtin_bswap64(x)
+#define bswap_32( x )      __builtin_bswap32(x)

 // 128 bit integer
+//
+// Int128 uses two 64 bit GPRs to hold the data. The main benefits are
+// for 128 bit arithmetic. Vectors are preferred when 128 bit arith
+// is not required. int128 also works better with other integer sizes.
+// Vectors benefit from wider registers. 
+//
+// Use typecasting for conversion to/from 128 bit vector:
+// __m128i v128 = (__m128i)my_int128;
+// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
+// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );

+#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
+
+// Test this before using int128.
+#define GCC_INT128 1
+
+// Familiar looking type names
+typedef          __int128  int128_t;
 typedef unsigned __int128 uint128_t;

+// No real need or use.
 #define i128_neg1        (uint128_t)(-1LL)
-#define i128_hi64( x )   (uint64_t)( (uint128_t)(x) >> 64 )
-#define i128_lo64( x )   (uint64_t)( (uint128_t)(x) << 64 >> 64 )
+
+// Extract selected 64 bit half of 128 bit integer.
+// A generic macro with a selector argument can't be encoded as a statement
+// function and would require a branch.
+#define i128_hi64( x )    (uint64_t)( (uint128_t)(x) >> 64 )
+#define i128_lo64( x )    (uint64_t)( (uint128_t)(x) << 64 >> 64 )
+
+// Not much need for this but it fills a gap.
+#define ror_128( x, c ) \
+       ( ( (uint128_t)(x) >> (c) ) | ( (uint128_t)(x) << (128-(c)) ) )
+#define rol_128( x, c ) \
+       ( ( (uint128_t)(x) << (c) ) | ( (uint128_t)(x) >> (128-(c)) ) )
+
+#endif  // GCC_INT128

 ////////////////////////////////////////////////////////////////
 //
-//         64 bit MMX vectors.
+//               64 bit MMX vectors.
 //
 // There are rumours MMX wil be removed. Although casting with int64
 // works there is likely some overhead to move the data to An MMX register
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.2.3.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.2.4.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.9.2.3'
-PACKAGE_STRING='cpuminer-opt 3.9.2.3'
+PACKAGE_VERSION='3.9.2.4'
+PACKAGE_STRING='cpuminer-opt 3.9.2.4'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.9.2.3 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.9.2.4 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.9.2.3:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.9.2.4:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.9.2.3
+cpuminer-opt configure 3.9.2.4
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.9.2.3, which was
+It was created by cpuminer-opt $as_me 3.9.2.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.9.2.3'
+ VERSION='3.9.2.4'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.9.2.3, which was
+This file was extended by cpuminer-opt $as_me 3.9.2.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.9.2.3
+cpuminer-opt config.status 3.9.2.4
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.9.2.3])
+AC_INIT([cpuminer-opt], [3.9.2.4])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -105,11 +105,11 @@ enum algos opt_algo = ALGO_NULL;
 int opt_scrypt_n = 0;
 int opt_pluck_n = 128;
 int opt_n_threads = 0;
-#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
+// Windows doesn't support 128 bit affinity mask.
+#if defined(__linux) && defined(GCC_INT128)  
 #define AFFINITY_USES_UINT128 1
-uint128_t opt_affinity = i128_neg1;
+uint128_t opt_affinity = -1LL;
 #else
-#define AFFINITY_USES_UINT128 0
 uint64_t opt_affinity = -1LL;
 #endif
 int opt_priority = 0;
@@ -205,7 +205,8 @@ static inline void drop_policy(void)
 #define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */
 #endif

-#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
+// Linux affinity can use int128.
+#if AFFINITY_USES_UINT128
 static void affine_to_cpu_mask( int id, unsigned __int128 mask )
 #else
 static void affine_to_cpu_mask( int id, unsigned long long mask )
@@ -218,7 +219,7 @@ static void affine_to_cpu_mask( int id, unsigned long long mask )
   for ( uint8_t i = 0; i < ncpus; i++ ) 
   {
      // cpu mask
-#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
+#if AFFINITY_USES_UINT128
      if( ( mask & ( (unsigned __int128)1ULL << i ) ) )  CPU_SET( i, &set );
 #else
      if( (ncpus > 64) || ( mask & (1ULL << i) ) )  CPU_SET( i, &set );
@@ -239,6 +240,7 @@ static void affine_to_cpu_mask( int id, unsigned long long mask )
 #elif defined(WIN32) /* Windows */
 static inline void drop_policy(void) { }

+// Windows uses CPU groups to manage more than 64 CPUs.
 static void affine_to_cpu_mask( int id, unsigned long mask )
 {
   bool success;
@@ -247,12 +249,12 @@ static void affine_to_cpu_mask( int id, unsigned long mask )
 //   DWORD last_error;

   if ( id == -1 )
-	success = SetProcessAffinityMask( GetCurrentProcess(), (DWORD_PTR)&mask );
+	success = SetProcessAffinityMask( GetCurrentProcess(), mask );

 // Are Windows CPU Groups supported?
 #if _WIN32_WINNT==0x0601
   else if ( num_cpugroups == 1 )
-	success = SetThreadAffinityMask( GetCurrentThread(), (DWORD_PTR)&mask );
+	success = SetThreadAffinityMask( GetCurrentThread(), mask );
   else
   {
 	// Find the correct cpu group
@@ -265,7 +267,7 @@ static void affine_to_cpu_mask( int id, unsigned long mask )
 	      break;

  	   cpu -= cpus;
-         }
+   }

 	if (opt_debug)
 	applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)", id, cpu, group, (1ULL << cpu));
@@ -277,7 +279,7 @@ static void affine_to_cpu_mask( int id, unsigned long mask )
   }
 #else
   else 
-        success = SetThreadAffinityMask( GetCurrentThread(), (DWORD_PTR)&mask );
+        success = SetThreadAffinityMask( GetCurrentThread(), mask );
 #endif

   if (!success)
@@ -1848,40 +1850,36 @@ static void *miner_thread( void *userdata )
   if ( num_cpus > 1 )
   {
 #if AFFINITY_USES_UINT128
+       // Default affinity
       if ( (opt_affinity == i128_neg1 ) && opt_n_threads > 1 )
       {  
         if ( opt_debug )
-            applog( LOG_DEBUG,
-	  	      "Binding thread %d to cpu %d (mask %016llx %016llx)",
-                      thr_id, thr_id % num_cpus,
-	       	      i128_hi64( i128_neg1 << (thr_id % num_cpus) ),
-		      i128_lo64( i128_neg1 << (thr_id % num_cpus) ) );
-         affine_to_cpu_mask( thr_id,
-                             (uint128_t)1LL << (thr_id % num_cpus) );
-
+            applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
+                    thr_id, thr_id % num_cpus,
+	                 i128_hi64( (uint128_t)1ULL << (thr_id % num_cpus) ),
+		              i128_lo64( (uint128_t)1ULL << (thr_id % num_cpus) ) );
+         affine_to_cpu_mask( thr_id, (uint128_t)1ULL << (thr_id % num_cpus) );
       }
 #else
       if ( (opt_affinity == -1LL) && opt_n_threads > 1 ) 
       {
         if (opt_debug)
-            applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
-                thr_id, thr_id % num_cpus, L << (thr_id % num_cpus)) ;
+            applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
+                thr_id, thr_id % num_cpus, 1LL << (thr_id % num_cpus)) ;
         affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
       }
 #endif
-      else 
+      else   // Custom affinity
      {
 #if AFFINITY_USES_UINT128
         if (opt_debug)
-             applog( LOG_DEBUG,
-                      "Binding thread %d to cpu mask %016llx %016llx",
-                      thr_id, i128_hi64( i128_neg1 << (thr_id % num_cpus) ), 
-                              i128_lo64( i128_neg1 << (thr_id % num_cpus) ) );
+             applog( LOG_DEBUG, "Binding thread %d to mask %016llx %016llx",
+                                thr_id, i128_hi64( opt_affinity ), 
+                                        i128_lo64( opt_affinity ) );
 #else
         if (opt_debug)
-             applog( LOG_DEBUG,
-                      "Binding thread %d to cpu mask %016llx %016llx",
-                      thr_id, opt_affinity );
+             applog( LOG_DEBUG, "Binding thread %d to mask %016llx",
+                                 thr_id, opt_affinity );
 #endif
      affine_to_cpu_mask( thr_id, opt_affinity );
      }
@@ -2926,7 +2924,7 @@ void parse_arg(int key, char *arg )
 //		if ( ul > ( 1ULL << num_cpus ) - 1ULL )
 //			ul = -1LL;
 #if AFFINITY_USES_UINT128
-// replicate the low 64 bits to make a full 128 bit maski if there are more
+// replicate the low 64 bits to make a full 128 bit mask if there are more
 // than 64 CPUs, otherwise zero extend the upper half.
                opt_affinity = (uint128_t)ul;
                if ( num_cpus > 64 )
@@ -3332,20 +3330,18 @@ int main(int argc, char *argv[])
 	}

 	if (!rpc_userpass)
-        {
+   {
 		rpc_userpass = (char*) malloc(strlen(rpc_user) + strlen(rpc_pass) + 2);
-                if (rpc_userpass)
-	           sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
-                else
-                   return 1;
+      if (rpc_userpass)
+          sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
+       else
+         return 1;
 	}

-        // All options must be set before starting the gate
-        if ( !register_algo_gate( opt_algo, &algo_gate ) )
-           exit(1);
+   // All options must be set before starting the gate
+   if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);

-        if ( !check_cpu_capability() )
-           exit(1);
+   if ( !check_cpu_capability() ) exit(1);

 	pthread_mutex_init(&stats_lock, NULL);
 	pthread_mutex_init(&g_work_lock, NULL);
@@ -3358,7 +3354,7 @@ int main(int argc, char *argv[])
 	        ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL)
 	        : CURL_GLOBAL_ALL;
 	if (curl_global_init(flags))
-        {
+   {
 		applog(LOG_ERR, "CURL initialization failed");
 		return 1;
 	}