v3.9.2.3

v3.9.2.2
v3.9.2.1
2025-09-17 23:44:27 +00:00 · 2019-06-05 12:20:04 -04:00 · 2019-06-04 17:14:03 -04:00 · 2019-06-04 16:56:44 -04:00
15 changed files with 1012 additions and 87 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -163,6 +163,7 @@ cpuminer_SOURCES = \
  algo/sha/sph_sha2.c \
  algo/sha/sph_sha2big.c \
  algo/sha/sha2-hash-4way.c \
+  algo/sha/sha256_hash_11way.c \
  algo/sha/sha2.c \
  algo/sha/sha256t-gate.c \
  algo/sha/sha256t-4way.c \
--- a/10
+++ b/10
@@ -38,6 +38,16 @@ supported.
 Change Log
 ----------

+v3.9.2.3
+
+Another cpu-affinity fix.
+Disabled test code that fails to compile on some CPUs with limited
+AVX512 capabilities.
+
+v3.9.2.2
+
+Fixed some day one cpu-affinity issues.
+
 v3.9.2

 Added sha256q algo.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -345,9 +345,9 @@ const char* const algo_alias_map[][2] =
  { NULL,                NULL           }   
 };

-// if arg is a valid alias for a known algo it is updated with the proper name.
-// No validation of the algo or alias is done, It is the responsinility of the
-// calling function to validate the algo after return.
+// if arg is a valid alias for a known algo it is updated with the proper
+// name. No validation of the algo or alias is done, It is the responsinility
+// of the calling function to validate the algo after return.
 void get_algo_alias( char** algo_or_alias )
 {
  int i;
@@ -362,3 +362,21 @@ void get_algo_alias( char** algo_or_alias )

 #undef ALIAS
 #undef PROPER
+
+bool submit_solution( struct work *work, void *hash,
+                      struct thr_info *thr, int lane )
+{
+     work_set_target_ratio( work, hash );
+     if ( submit_work( thr, work ) )
+     {
+         applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
+                 accepted_share_count + rejected_share_count + 1,
+                 thr->id, lane );
+         return true;
+     }
+     else
+          applog( LOG_WARNING, "Failed to submit share." );
+     return false;
+}
+
+
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -196,8 +196,9 @@ void four_way_not_tested();
 int null_scanhash();

 // The one and only, a callback for scanhash.
-
-
+bool submit_solution( struct work *work, void *hash,
+                      struct thr_info *thr, int lane );
+ 
 bool submit_work( struct thr_info *thr, const struct work *work_in );

 // displays warning
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -1,6 +1,39 @@
 #include "lyra2-gate.h"


+// huge pages
+//
+// Use MAP_PRIVATE instead
+// In register algo:
+// replace thread safe whole matrix with a char**
+// alloc huge pages matrixsize * threads
+// make pointers to each thread to each thread, creating an 
+// array[thread][matrix].
+// Each thread can create its own matrix pointer:
+//  my_matrix = the matrix + ( thread_id * matrix_size  )
+//
+// Compiler version check?
+// Fallback?
+//
+// create a generic utility to map & unmap huge pages.
+// ptr = malloc_huge( size );
+// Yespower wrapper checks for 64 byte alignment, seems unnecessary as
+// it should be aligned to the page boundary. It may be desireable to
+// have the matrix size rounded up if necessary to something bigger
+// than 64 byte, say 4 kbytes a small page size.
+
+// Define some constants for indivual parameters and matrix size for
+// each algo. Use the parameter constants where apropriate.
+// Convert algos that don't yet do so to use dynamic alllocation.
+// Alloc huge pages globally. If ok each thread will create a pointer to
+// its chunk. If fail each thread will use use _mm_alloc for itself. 
+
+#define LYRA2REV3_NROWS 4
+#define LYRA2REV3_NCOLS 4
+//#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
+//                                                 (LYRA2REV3_NROWS)*8)
+#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)<<4)
+
 __thread uint64_t* l2v3_wholeMatrix;

 bool lyra2rev3_thread_init()
--- a/algo/sha/sha2-hash-4way.h
+++ b/algo/sha/sha2-hash-4way.h
@@ -61,6 +61,26 @@ void sha256_4way_init( sha256_4way_context *sc );
 void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
 void sha256_4way_close( sha256_4way_context *sc, void *dst );

+/*
+// SHA-256 7 way hybrid
+// Combines SSE, MMX and scalar data to do 8 + 2 + 1 parallel.
+typedef struct {
+   __m128i  bufx[64>>2];
+   __m128i  valx[8];
+   __m64    bufy[64>>2];
+   __m64    valy[8];
+   uint32_t bufz[64>>2];
+   uint32_t valz[8];
+   uint32_t count_high, count_low;
+} sha256_7way_context;
+
+void sha256_7way_init( sha256_7way_context *ctx );
+void sha256_7way( sha256_7way_context *ctx, const void *datax,
+                         void *datay, void *dataz, size_t len );
+void sha256_7way_close( sha256_7way_context *ctx, void *dstx, void *dstyx,
+                         void *dstz  );
+*/
+
 #if defined (__AVX2__)

 // SHA-256 8 way
@@ -89,6 +109,24 @@ void sha512_4way_init( sha512_4way_context *sc);
 void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
 void sha512_4way_close( sha512_4way_context *sc, void *dst );

-#endif
-#endif
-#endif
+// SHA-256 11 way hybrid
+// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
+typedef struct {
+   __m256i  bufx[64>>2];
+   __m256i  valx[8];
+   __m64    bufy[64>>2];
+   __m64    valy[8];
+   uint32_t bufz[64>>2];
+   uint32_t valz[8];
+   uint32_t count_high, count_low;
+} sha256_11way_context;
+
+void sha256_11way_init( sha256_11way_context *ctx );
+void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
+	                 const void *datay, const void *dataz, size_t len );
+void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
+	                 void *dstz  );
+
+#endif  // __AVX2__
+#endif  // __SSE2__
+#endif  // SHA256_4WAY_H__
--- a/algo/sha/sha256_hash_11way.c
+++ b/algo/sha/sha256_hash_11way.c
@@ -0,0 +1,527 @@
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sha2-hash-4way.h"
+
+#if defined(__AVX2__)
+
+// naming convention for variables and macros
+// VARx: AVX2 8 way 32 bit
+// VARy: MMX 2 way 32 bit
+// VARz: scalar integer 32 bit
+
+
+static const uint32_t H256[8] =
+{
+        0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+        0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+};
+
+static const uint32_t K256[64] = 
+{
+        0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
+        0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+        0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
+        0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+        0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
+        0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+        0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
+        0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+        0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
+        0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+        0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
+        0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+        0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
+        0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+        0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
+        0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
+};
+
+#define CHx(X, Y, Z) \
+   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 
+
+#define CHy(X, Y, Z) \
+   _mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z )
+
+#define CHz(X, Y, Z) ((( (Y) ^ (Z) ) & (X) ) ^ (Z) )
+
+
+#define MAJx(X, Y, Z) \
+   _mm256_or_si256( _mm256_and_si256( X, Y ), \
+                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
+
+#define MAJy(X, Y, Z) \
+   _mm_or_si64( _mm_and_si64( X, Y ), \
+                    _mm_and_si64( _mm_or_si64( X, Y ), Z ) )
+
+#define MAJz(X, Y, Z)  ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) )
+
+#define BSG2_0x(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_ror_32(x,2), mm256_ror_32(x,13) ), _mm256_srli_epi32(x,22) )
+
+#define BSG2_0y(x) \
+   _mm_xor_si64( _mm_xor_si64( \
+       mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) )
+
+#define BSG2_0z(x)  ( ror_32(x,2) ^ ror_32(x,13)  ^ ((x)>>22) )
+
+#define BSG2_1x(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_ror_32(x,6), mm256_ror_32(x,11) ), _mm256_srli_epi32(x,25) )
+
+#define BSG2_1y(x) \
+   _mm_xor_si64( _mm_xor_si64( \
+       mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) )
+
+#define BSG2_1z(x)   ( ror_32(x,6) ^ ror_32(x,11) ^ ((x)>>25) )
+
+#define SSG2_0x(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) ) 
+
+#define SSG2_0y(x) \
+   _mm_xor_si64( _mm_xor_si64( \
+       mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )
+
+#define SSG2_0z(x)  (( ror_32(x,7) ^ ror_32(x,18) ) ^ ((x)>>3) )
+
+#define SSG2_1x(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) )
+
+#define SSG2_1y(x) \
+   _mm_xor_si64( _mm_xor_si64( \
+       mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )
+
+#define SSG2_1z(x)   ( ror_32(x,17) ^ ror_32(x,19)  ^ ((x)>>10) )
+
+#define SHA2x_MEXP( a, b, c, d ) \
+     _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
+                 SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] )
+
+#define SHA2y_MEXP( a, b, c, d ) \
+     _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
+                 SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] )
+
+#define SHA2z_MEXP( a, b, c, d ) \
+               ( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] )
+
+
+#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
+	                  Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
+		          Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
+do { \
+  __m256i T1x, T2x; \
+  __m64 T1y, T2y; \
+  uint32_t T1z, T2z; \
+  T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
+        _mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
+                          _mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \
+  T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
+        _mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
+                          _mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \
+  T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \
+  T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
+  T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
+  T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
+  Dx  = _mm256_add_epi32( Dx,  T1x ); \
+  Dy  = _mm_add_pi32( Dy, T1y ); \
+  Dz  = Dz + T1z; \
+  Hx  = _mm256_add_epi32( T1x, T2x ); \
+  Hy  = _mm_add_pi32( T1y, T2y ); \
+  Hz  = T1z + T2z; \
+} while (0)
+	
+void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
+                         uint32_t *inz, uint32_t rz[8] )
+{
+   __m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
+   __m256i Wx[16];
+   __m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy;
+   __m64 Wy[16];
+   uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz;
+   uint32_t Wz[16];
+
+   Wx[ 0] = mm256_bswap_32( inx[ 0] );
+   Wy[ 0] =  mm64_bswap_32( iny[ 0] );
+   Wz[ 0] =       bswap_32( inz[ 0] );
+
+   Wx[ 1] = mm256_bswap_32( inx[ 1] );
+   Wy[ 1] =  mm64_bswap_32( iny[ 1] );
+   Wz[ 1] =       bswap_32( inz[ 1] );
+
+   Wx[ 2] = mm256_bswap_32( inx[ 2] );
+   Wy[ 2] =  mm64_bswap_32( iny[ 2] );
+   Wz[ 2] =       bswap_32( inz[ 2] );
+
+   Wx[ 3] = mm256_bswap_32( inx[ 3] );
+   Wy[ 3] =  mm64_bswap_32( iny[ 3] );
+   Wz[ 3] =       bswap_32( inz[ 3] );
+
+   Wx[ 4] = mm256_bswap_32( inx[ 4] );
+   Wy[ 4] =  mm64_bswap_32( iny[ 4] );
+   Wz[ 4] =       bswap_32( inz[ 4] );
+
+   Wx[ 5] = mm256_bswap_32( inx[ 5] );
+   Wy[ 5] =  mm64_bswap_32( iny[ 5] );
+   Wz[ 5] =       bswap_32( inz[ 5] );
+
+   Wx[ 6] = mm256_bswap_32( inx[ 6] );
+   Wy[ 6] =  mm64_bswap_32( iny[ 6] );
+   Wz[ 6] =       bswap_32( inz[ 6] );
+
+   Wx[ 7] = mm256_bswap_32( inx[ 7] );
+   Wy[ 7] =  mm64_bswap_32( iny[ 7] );
+   Wz[ 7] =       bswap_32( inz[ 7] );
+
+   Wx[ 8] = mm256_bswap_32( inx[ 8] );
+   Wy[ 8] =  mm64_bswap_32( iny[ 8] );
+   Wz[ 8] =       bswap_32( inz[ 8] );
+
+   Wx[ 9] = mm256_bswap_32( inx[ 9] );
+   Wy[ 9] =  mm64_bswap_32( iny[ 9] );
+   Wz[ 9] =       bswap_32( inz[ 9] );
+
+   Wx[10] = mm256_bswap_32( inx[10] );
+   Wy[10] =  mm64_bswap_32( iny[10] );
+   Wz[10] =       bswap_32( inz[10] );
+
+   Wx[11] = mm256_bswap_32( inx[11] );
+   Wy[11] =  mm64_bswap_32( iny[11] );
+   Wz[11] =       bswap_32( inz[11] );
+
+   Wx[12] = mm256_bswap_32( inx[12] );
+   Wy[12] =  mm64_bswap_32( iny[12] );
+   Wz[12] =       bswap_32( inz[12] );
+
+   Wx[13] = mm256_bswap_32( inx[13] );
+   Wy[13] =  mm64_bswap_32( iny[13] );
+   Wz[13] =       bswap_32( inz[13] );
+
+   Wx[14] = mm256_bswap_32( inx[14] );
+   Wy[14] =  mm64_bswap_32( iny[14] );
+   Wz[14] =       bswap_32( inz[14] );
+
+   Wx[15] = mm256_bswap_32( inx[15] );
+   Wy[15] =  mm64_bswap_32( iny[15] );
+   Wz[15] =       bswap_32( inz[15] );
+
+   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
+                     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
+                     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  0, 0 );
+   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
+		     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
+		     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  1, 0 );
+   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
+		     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
+		     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz,  2, 0 );
+   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
+		     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
+		     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez,  3, 0 );
+   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
+		     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
+		     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz,  4, 0 );
+   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
+		     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
+		     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz,  5, 0 );
+   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
+		     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
+		     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz,  6, 0 );
+   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
+		     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
+		     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az,  7, 0 );
+   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
+		     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
+		     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  8, 0 );
+   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
+		     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
+		     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  9, 0 );
+   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
+		     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
+		     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, 0 );
+   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
+		     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
+		     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, 0 );
+   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
+		     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
+		     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, 0 );
+   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
+		     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
+		     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, 0 );
+   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
+		     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
+		     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, 0 );
+   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
+		     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
+		     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, 0 );
+
+   for ( int j = 16; j < 64; j += 16 )
+   {
+      Wx[ 0] = SHA2x_MEXP( 14,  9,  1,  0 );
+      Wy[ 0] = SHA2y_MEXP( 14,  9,  1,  0 );
+      Wz[ 0] = SHA2z_MEXP( 14,  9,  1,  0 );
+
+      Wx[ 1] = SHA2x_MEXP( 15, 10,  2,  1 );
+      Wy[ 1] = SHA2y_MEXP( 15, 10,  2,  1 );
+      Wz[ 1] = SHA2z_MEXP( 15, 10,  2,  1 );
+
+      Wx[ 2] = SHA2x_MEXP(  0, 11,  3,  2 );
+      Wy[ 2] = SHA2y_MEXP(  0, 11,  3,  2 );
+      Wz[ 2] = SHA2z_MEXP(  0, 11,  3,  2 );
+
+      Wx[ 3] = SHA2x_MEXP(  1, 12,  4,  3 );
+      Wy[ 3] = SHA2y_MEXP(  1, 12,  4,  3 );
+      Wz[ 3] = SHA2z_MEXP(  1, 12,  4,  3 );
+
+      Wx[ 4] = SHA2x_MEXP(  2, 13,  5,  4 );
+      Wy[ 4] = SHA2y_MEXP(  2, 13,  5,  4 );
+      Wz[ 4] = SHA2z_MEXP(  2, 13,  5,  4 );
+
+      Wx[ 5] = SHA2x_MEXP(  3, 14,  6,  5 );
+      Wy[ 5] = SHA2y_MEXP(  3, 14,  6,  5 );
+      Wz[ 5] = SHA2z_MEXP(  3, 14,  6,  5 );
+
+      Wx[ 6] = SHA2x_MEXP(  4, 15,  7,  6 );
+      Wy[ 6] = SHA2y_MEXP(  4, 15,  7,  6 );
+      Wz[ 6] = SHA2z_MEXP(  4, 15,  7,  6 );
+
+      Wx[ 7] = SHA2x_MEXP(  5,  0,  8,  7);
+      Wy[ 7] = SHA2y_MEXP(  5,  0,  8,  7);
+      Wz[ 7] = SHA2z_MEXP(  5,  0,  8,  7);
+
+      Wx[ 8] = SHA2x_MEXP(  6,  1,  9,  8);
+      Wy[ 8] = SHA2y_MEXP(  6,  1,  9,  8);
+      Wz[ 8] = SHA2z_MEXP(  6,  1,  9,  8);
+
+      Wx[ 9] = SHA2x_MEXP(  7,  2, 10,  9 );
+      Wy[ 9] = SHA2y_MEXP(  7,  2, 10,  9);
+      Wz[ 9] = SHA2z_MEXP(  7,  2, 10,  9);
+
+      Wx[10] = SHA2x_MEXP(  8,  3, 11, 10 );
+      Wy[10] = SHA2y_MEXP(  8,  3, 11, 10);
+      Wz[10] = SHA2z_MEXP(  8,  3, 11, 10);
+
+      Wx[11] = SHA2x_MEXP(  9,  4, 12, 11);
+      Wy[11] = SHA2y_MEXP(  9,  4, 12, 11);
+      Wz[11] = SHA2z_MEXP(  9,  4, 12, 11 );
+
+      Wx[12] = SHA2x_MEXP( 10,  5, 13, 12 );
+      Wy[12] = SHA2y_MEXP( 10,  5, 13, 12 );
+      Wz[12] = SHA2z_MEXP( 10,  5, 13, 12 );
+
+      Wx[13] = SHA2x_MEXP( 11,  6, 14, 13 );
+      Wy[13] = SHA2y_MEXP( 11,  6, 14, 13 );
+      Wz[13] = SHA2z_MEXP( 11,  6, 14, 13 );
+
+      Wx[14] = SHA2x_MEXP( 12,  7, 15, 14 );
+      Wy[14] = SHA2y_MEXP( 12,  7, 15, 14 );
+      Wz[14] = SHA2z_MEXP( 12,  7, 15, 14 );
+
+      Wx[15] = SHA2x_MEXP( 13,  8,  0, 15 );
+      Wy[15] = SHA2y_MEXP( 13,  8,  0, 15 );
+      Wz[15] = SHA2z_MEXP( 13,  8,  0, 15 );
+
+
+      SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
+                        Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
+			Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,	 0, j );
+      SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
+		        Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
+		       	Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  1, j );
+      SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
+		        Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
+		       	Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz,  2, j );
+      SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
+		        Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
+		       	Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez,  3, j );
+      SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
+		        Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
+		       	Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz,  4, j );
+      SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
+		        Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
+		       	Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz,  5, j );
+      SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
+		        Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
+		       	Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz,  6, j );
+      SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
+		        By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
+		       	Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az,  7, j );
+      SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
+                        Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
+                        Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  8, j );
+      SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, 
+                        Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, 
+                        Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  9, j );
+      SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, 
+                        Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, 
+                        Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j );
+      SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, 
+                        Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, 
+                        Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j );
+      SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, 
+                        Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, 
+                        Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j );
+      SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, 
+                        Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, 
+                        Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j );
+      SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, 
+                        Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, 
+                        Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j );
+      SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, 
+                        By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, 
+                        Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j );
+   }
+
+   rx[0] = _mm256_add_epi32( rx[0], Ax );
+   ry[0] =     _mm_add_pi32( ry[0], Ay );
+   rz[0] =                   rz[0]+ Az;
+   rx[1] = _mm256_add_epi32( rx[1], Bx );
+   ry[1] =     _mm_add_pi32( ry[1], By );
+   rz[1] =                   rz[1]+ Bz;
+   rx[2] = _mm256_add_epi32( rx[2], Cx );
+   ry[2] =     _mm_add_pi32( ry[2], Cy );
+   rz[3] =                   rz[3]+ Dz;
+   rx[4] = _mm256_add_epi32( rx[4], Ex );
+   ry[4] =     _mm_add_pi32( ry[4], Ey );
+   rz[4] =                   rz[4]+ Ez;
+   rx[5] = _mm256_add_epi32( rx[5], Fx );
+   ry[5] =     _mm_add_pi32( ry[5], Fy );
+   rz[5] =                   rz[5]+ Fz;
+   rx[6] = _mm256_add_epi32( rx[6], Gx );
+   ry[6] =     _mm_add_pi32( ry[6], Gy );
+   rz[6] =                   rz[6]+ Gz;
+   rx[7] = _mm256_add_epi32( rx[7], Hx );
+   ry[7] =     _mm_add_pi32( ry[7], Hy );
+   rz[7] =                   rz[7]+ Hz;
+
+}
+
+void sha256_11way_init( sha256_11way_context *ctx )
+{
+   ctx->count_high = ctx->count_low = 0;
+   ctx->valx[0] = _mm256_set1_epi32( H256[0] );
+   ctx->valy[0] =     _mm_set1_pi32( H256[0] );
+   ctx->valx[1] = _mm256_set1_epi32( H256[0] );
+   ctx->valy[1] =     _mm_set1_pi32( H256[0] );
+   ctx->valx[2] = _mm256_set1_epi32( H256[0] );
+   ctx->valy[2] =     _mm_set1_pi32( H256[0] );
+   ctx->valx[3] = _mm256_set1_epi32( H256[0] );
+   ctx->valy[3] =     _mm_set1_pi32( H256[0] );
+   ctx->valx[4] = _mm256_set1_epi32( H256[0] );
+   ctx->valy[4] =     _mm_set1_pi32( H256[0] );
+   ctx->valx[5] = _mm256_set1_epi32( H256[0] );
+   ctx->valy[5] =     _mm_set1_pi32( H256[0] );
+   ctx->valx[6] = _mm256_set1_epi32( H256[0] );
+   ctx->valy[6] =     _mm_set1_pi32( H256[0] );
+   ctx->valx[7] = _mm256_set1_epi32( H256[0] );
+   ctx->valy[7] =     _mm_set1_pi32( H256[0] );
+   memcpy( ctx->valz, H256, 32 );
+}
+
+
+void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
+	                  const void *datay, const void *dataz, size_t len )
+{
+   __m256i  *vdatax = (__m256i*) datax;
+    __m64   *vdatay = (__m64*)   datay;
+   uint32_t *idataz = (uint32_t*)dataz;
+   size_t ptr;
+   const int buf_size = 64;
+
+   ptr = (unsigned)ctx->count_low & (buf_size - 1U);
+   while ( len > 0 )
+   {
+      size_t clen;
+      uint32_t clow, clow2;
+
+      clen = buf_size - ptr;
+      if ( clen > len )
+         clen = len;
+      memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 );
+      memcpy_64 ( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
+      memcpy    ( ctx->bufz +  ptr,     idataz +  ptr,     clen    );
+      ptr += clen;
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+         sha256_11way_round( ctx->bufx, ctx->valx,
+			     ctx->bufy, ctx->valy,
+			     ctx->bufz, ctx->valz );
+         ptr = 0;
+      }
+      clow = ctx->count_low;
+      clow2 = clow + clen;
+      ctx->count_low = clow2;
+      if ( clow2 < clow )
+         ctx->count_high++;
+   }
+}
+
+
+void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
+	                                            void *dstz)
+{
+    unsigned ptr, u;
+    uint32_t low, high;
+    const int buf_size = 64;
+    const int pad = buf_size - 8;
+
+    ptr = (unsigned)ctx->count_low & (buf_size - 1U);
+    ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
+    ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 );
+    ctx->bufz[ ptr>>2 ] = 0x80;
+    ptr += 4;
+
+    if ( ptr > pad )
+    {
+         memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
+         memset_zero_64(  ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
+         memset(      ctx->bufz + (ptr>>2), 0,  (buf_size - ptr) >> 2 );
+         sha256_11way_round( ctx->bufx, ctx->valx,
+			     ctx->bufy, ctx->valy,
+			     ctx->bufz, ctx->valz );
+         memset_zero_256( ctx->bufx, pad >> 2 );
+         memset_zero_64(  ctx->bufy, pad >> 2 );
+         memset(      ctx->bufz, 0,  pad >> 2 );
+    }
+    else
+    {
+        memset_zero_256( ctx->bufx + (ptr>>2),    (pad - ptr) >> 2 );
+        memset_zero_64(  ctx->bufy + (ptr>>2),    (pad - ptr) >> 2 );
+        memset(          ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 );
+    }
+
+    low = ctx->count_low;
+    high = (ctx->count_high << 3) | (low >> 29);
+    low = low << 3;
+
+    ctx->bufx[ pad >> 2 ] =
+                 mm256_bswap_32( _mm256_set1_epi32( high ) );
+    ctx->bufy[ pad >> 2 ] =
+                 mm64_bswap_32( _mm_set1_pi32( high ) );
+    ctx->bufz[ pad >> 2 ] =
+                 bswap_32( high );
+
+
+    ctx->bufx[ ( pad+4 ) >> 2 ] =
+                 mm256_bswap_32( _mm256_set1_epi32( low ) );
+    ctx->bufy[ ( pad+4 ) >> 2 ] =
+                 mm64_bswap_32( _mm_set1_pi32( low ) );
+    ctx->bufz[ ( pad+4 ) >> 2 ] =
+                 bswap_32( low );
+
+    sha256_11way_round( ctx->bufx, ctx->valx,
+		       ctx->bufy, ctx->valy,
+		       ctx->bufz, ctx->valz  );
+
+    for ( u = 0; u < 8; u ++ )
+    {
+       casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] );
+       casti_m64  ( dsty, u ) =  mm64_bswap_32( ctx->valy[u] );
+       ((uint32_t*)dstz)[u] = bswap_32( ctx->valz[u] );
+   }
+}
+
+#endif
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -5,6 +5,136 @@
 #include <stdio.h>
 #include "sha2-hash-4way.h"

+#if defined(SHA256T_11WAY)
+
+static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64)));
+
+void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
+	                 const void *inpy, const void*inpz )
+{
+   uint32_t hashx[8*8] __attribute__ ((aligned (64)));
+   uint32_t hashy[8*2] __attribute__ ((aligned (64)));
+   uint32_t hashz[8]   __attribute__ ((aligned (64)));
+   sha256_11way_context ctx;
+   const void *inpx64 = inpx+(64<<3);
+   const void *inpy64 = inpy+(64<<1);
+   const void *inpz64 = inpz+ 64;
+
+   memcpy( &ctx, &sha256_ctx11, sizeof ctx );
+   sha256_11way_update( &ctx, inpx64, inpy64, inpz64,  16 );
+   sha256_11way_close( &ctx, hashx, hashy, hashz );
+
+   sha256_11way_init( &ctx );
+   sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
+   sha256_11way_close( &ctx, hashx, hashy, hashz );
+
+   sha256_11way_init( &ctx );
+   sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
+   sha256_11way_close( &ctx, outx, outy, outz );
+}
+
+int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
+	                    uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t datax[20*8]  __attribute__ ((aligned (64)));
+   uint32_t datay[20*2]  __attribute__ ((aligned (32)));
+   uint32_t dataz[20]    __attribute__ ((aligned (32)));
+   uint32_t hashx[8*8]   __attribute__ ((aligned (32)));
+   uint32_t hashy[8*2]   __attribute__ ((aligned (32)));
+   uint32_t hashz[8]     __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7;
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   __m256i  *noncex = (__m256i*) datax + 19;
+   __m64    *noncey = (__m64*)   datay + 19;
+   uint32_t *noncez = (uint32_t*)dataz + 19;
+   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int i;
+   const uint64_t htmax[] = {           0,
+                                      0xF,
+                                     0xFF,
+                                    0xFFF,
+                                   0xFFFF,
+                               0x10000000 };
+   const uint32_t masks[] = {  0xFFFFFFFF,
+                               0xFFFFFFF0,
+                               0xFFFFFF00,
+                               0xFFFFF000,
+                               0xFFFF0000,
+                                        0 };
+
+   // Use dataz (scalar) to stage bswapped data for the vectors.
+   casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+   casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   mm256_interleave_8x32( datax, dataz, dataz, dataz, dataz,
+                                 dataz, dataz, dataz, dataz, 640 );
+   mm64_interleave_2x32( datay, dataz, dataz, 640 );
+
+   sha256_11way_init( &sha256_ctx11 );
+   sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 );
+
+   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+   {
+      uint32_t mask = masks[m];
+      do
+      {
+        *noncex = mm256_bswap_32(
+		 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
+        *noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
+        *noncez = bswap_32( n+10 );
+
+       	pdata[19] = n;
+
+        sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );
+
+        if ( opt_benchmark ) { n += 11; continue; }
+
+        hash7 = &(hashx[7<<3]); 
+        for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
+        { 
+            // deinterleave hash for lane
+            mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
+            if ( fulltest( lane_hash, ptarget ) )
+            {
+	       pdata[19] = n + i;
+               submit_solution( work, lane_hash, mythr, i );
+            }
+	}
+
+	hash7 = &(hashy[7<<1]);
+        for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
+        {
+            mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
+  	    if ( fulltest( lane_hash, ptarget ) )
+            {
+               pdata[19] = n + 8 + i;
+               submit_solution( work, lane_hash, mythr, i+8 );
+            }
+	 }
+
+	 if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
+         {
+            pdata[19] = n+10;
+            submit_solution( work, hashz, mythr, 10 );
+         }
+         n += 11;
+
+      } while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
+      break;
+   }
+    
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#endif
+
 #if defined(SHA256T_8WAY)

 static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
@@ -29,7 +159,7 @@ void sha256t_8way_hash( void* output, const void* input )
 }

 int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
-	                   uint64_t *hashes_done, struct thr_info *mythr )
+                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
@@ -71,39 +201,31 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
      do
      {
        *noncev = mm256_bswap_32(
-		 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
-
-	 pdata[19] = n;
+                 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
+         pdata[19] = n;

         sha256t_8way_hash( hash, vdata );

-         uint32_t *hash7 = &(hash[7<<3]); 
-	 
+         uint32_t *hash7 = &(hash[7<<3]);
+
         for ( int lane = 0; lane < 8; lane++ )
         if ( !( hash7[ lane ] & mask ) )
-         { 
+         {
            // deinterleave hash for lane
-	    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-	    mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
+            uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+            mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );

-	    if ( fulltest( lane_hash, ptarget ) )
+            if ( fulltest( lane_hash, ptarget ) )
            {
-	      pdata[19] = n + lane;
-              work_set_target_ratio( work, lane_hash );
-              if ( submit_work( mythr, work ) )
-                applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                             accepted_share_count + rejected_share_count + 1,
-                             thr_id, lane );
-              else
-                applog( LOG_WARNING, "Failed to submit share." );
+              pdata[19] = n + lane;
+              submit_solution( work, lane_hash, mythr, lane );
 	    }
-	 }
+         }
         n += 8;

      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
      break;
   }
-    
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
@@ -189,22 +311,14 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
            if ( fulltest( lane_hash, ptarget ) )
            {
              pdata[19] = n + lane;
-              work_set_target_ratio( work, lane_hash );
-              if ( submit_work( mythr, work ) )
-                applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                             accepted_share_count + rejected_share_count + 1,
-                             thr_id, lane );
-              else
-                applog( LOG_WARNING, "Failed to submit share." );
-            }
+              submit_solution( work, lane_hash, mythr, lane );
+	    }
         }
-
 	 n += 4;

      } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
      break;
   }
-
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
--- a/algo/sha/sha256t-gate.c
+++ b/algo/sha/sha256t-gate.c
@@ -2,7 +2,11 @@

 bool register_sha256t_algo( algo_gate_t* gate )
 {
-#if defined(SHA256T_8WAY)
+#if defined(SHA256T_11WAY)
+    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
+    gate->scanhash   = (void*)&scanhash_sha256t_11way;
+    gate->hash       = (void*)&sha256t_11way_hash;
+#elif defined(SHA256T_8WAY)
    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t_8way;
    gate->hash       = (void*)&sha256t_8way_hash;
--- a/algo/sha/sha256t-gate.h
+++ b/algo/sha/sha256t-gate.h
@@ -6,18 +6,29 @@

 // Override multi way on ryzen, SHA is better.
 #if !defined(RYZEN_)
-//#if defined(__SSE4_2__)
 #if defined(__SSE2__)
  #define SHA256T_4WAY
 #endif
 #if defined(__AVX2__)
  #define SHA256T_8WAY
+//  #define SHA256T_11WAY
 #endif
 #endif

 bool register_sha256t_algo( algo_gate_t* gate );
 bool register_sha256q_algo( algo_gate_t* gate );

+#if defined(SHA256T_11WAY)
+
+void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
+	                 const void *inpy, const void *inpz );
+int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
+                            uint64_t *hashes_done, struct thr_info *mythr );
+//void sha256q_8way_hash( void *output, const void *input );
+//int scanhash_sha256q_11way( int thr_id, struct work *work, uint32_t max_nonce,
+//                            uint64_t *hashes_done, struct thr_info *mythr );
+#endif
+
 #if defined(SHA256T_8WAY)

 void sha256t_8way_hash( void *output, const void *input );
@@ -26,8 +37,9 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
 void sha256q_8way_hash( void *output, const void *input );
 int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
+#endif

-#elif defined(SHA256T_4WAY)
+#if defined(SHA256T_4WAY)

 void sha256t_4way_hash( void *output, const void *input );
 int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -35,7 +47,7 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
 void sha256q_4way_hash( void *output, const void *input );
 int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
-#else
+#endif

 void sha256t_hash( void *output, const void *input );
 int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
@@ -46,5 +58,3 @@ int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,

 #endif

-#endif
-
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -99,7 +99,22 @@
 #include <memory.h>
 #include <stdbool.h>

-// 64 bit seems completely useless
+// First some integer stuff that mirrors the SIMD utilities
+
+#define ror_64( x, c ) (((x)>>(c)) | ((x)<<(64-(c))))
+#define rol_64( x, c ) (((x)<<(c)) | ((x)>>(64-(c))))
+#define ror_32( x, c ) (((x)>>(c)) | ((x)<<(32-(c))))
+#define rol_32( x, c ) (((x)<<(c)) | ((x)>>(32-(c))))
+#define bswap_64( x )  __builtin_bswap64(x)
+#define bswap_32( x )  __builtin_bswap32(x)
+
+// 128 bit integer
+
+typedef unsigned __int128 uint128_t;
+
+#define i128_neg1        (uint128_t)(-1LL)
+#define i128_hi64( x )   (uint64_t)( (uint128_t)(x) >> 64 )
+#define i128_lo64( x )   (uint64_t)( (uint128_t)(x) << 64 >> 64 )

 ////////////////////////////////////////////////////////////////
 //
@@ -108,11 +123,7 @@
 // There are rumours MMX wil be removed. Although casting with int64
 // works there is likely some overhead to move the data to An MMX register
 // and back.
-// Byte swap and rotation may be more efficient using an MMX shuffle
-// except that it won't compile due to a "target specific option mismatch"
-// with "inlining failed in call to always inline". MMX was designed for
-// 32 bit CPUs and might not work on 64 bit CPUs where the CPU has full
-// support for 64 bit operations without vectoring.  
+// Byte swap and rotation may be more efficient using an MMX shuffle.
 //
 // Universal 64 bit overlay
 union _m64v
@@ -165,6 +176,7 @@ typedef union _m64_v16 m64_v16;
 #define casti_m64(p,i) (((__m64*)(p))[(i)])


+
 // cast all arguments as the're likely uint64_t

 // Bitwise not: ~(a)
@@ -255,6 +267,12 @@ static inline void memset_zero_64( __m64 *src, int n )
 static inline void memset_64( __m64 *dst, const __m64 a,  int n )
 {   for ( int i = 0; i < n; i++ ) dst[i] = a; }

+// The b is for broadcast, don't use in hybrid hash, interleave.
+static inline void mem_bcpy_32( __m64 *dst, const uint32_t src, int n )
+{
+   for ( int i = 0; i < n; i++ ) dst[i] = _mm_set1_pi32( src );
+}
+

 //////////////////////////////////////////////////////////////////
 //
@@ -1917,7 +1935,7 @@ do { \

 #endif   // AVX512F

-#if 1
+#if 0
 //////////////////////////////////////////////////
 //
 //   Compile test.
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.2.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.2.3.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.9.2'
-PACKAGE_STRING='cpuminer-opt 3.9.2'
+PACKAGE_VERSION='3.9.2.3'
+PACKAGE_STRING='cpuminer-opt 3.9.2.3'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.9.2 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.9.2.3 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.9.2:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.9.2.3:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.9.2
+cpuminer-opt configure 3.9.2.3
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.9.2, which was
+It was created by cpuminer-opt $as_me 3.9.2.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.9.2'
+ VERSION='3.9.2.3'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.9.2, which was
+This file was extended by cpuminer-opt $as_me 3.9.2.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.9.2
+cpuminer-opt config.status 3.9.2.3
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.9.2])
+AC_INIT([cpuminer-opt], [3.9.2.3])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -106,9 +106,11 @@ int opt_scrypt_n = 0;
 int opt_pluck_n = 128;
 int opt_n_threads = 0;
 #if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
-__int128_t opt_affinity = -1LL;
+#define AFFINITY_USES_UINT128 1
+uint128_t opt_affinity = i128_neg1;
 #else
-int64_t opt_affinity = -1LL;
+#define AFFINITY_USES_UINT128 0
+uint64_t opt_affinity = -1LL;
 #endif
 int opt_priority = 0;
 int num_cpus = 1;
@@ -245,12 +247,12 @@ static void affine_to_cpu_mask( int id, unsigned long mask )
 //   DWORD last_error;

   if ( id == -1 )
-	success = SetProcessAffinityMask( GetCurrentProcess(), mask );
+	success = SetProcessAffinityMask( GetCurrentProcess(), (DWORD_PTR)&mask );

 // Are Windows CPU Groups supported?
 #if _WIN32_WINNT==0x0601
   else if ( num_cpugroups == 1 )
-	success = SetThreadAffinityMask( GetCurrentThread(), mask );
+	success = SetThreadAffinityMask( GetCurrentThread(), (DWORD_PTR)&mask );
   else
   {
 	// Find the correct cpu group
@@ -275,7 +277,7 @@ static void affine_to_cpu_mask( int id, unsigned long mask )
   }
 #else
   else 
-        success = SetThreadAffinityMask( GetCurrentThread(), mask );
+        success = SetThreadAffinityMask( GetCurrentThread(), (DWORD_PTR)&mask );
 #endif

   if (!success)
@@ -1842,26 +1844,46 @@ static void *miner_thread( void *userdata )
   }
   else
 */
+
   if ( num_cpus > 1 )
   {
-      if ( (opt_affinity == -1LL) && (opt_n_threads) > 1 ) 
-      {
+#if AFFINITY_USES_UINT128
+       if ( (opt_affinity == i128_neg1 ) && opt_n_threads > 1 )
+       {
+         if ( opt_debug )
+            applog( LOG_DEBUG,
+	  	      "Binding thread %d to cpu %d (mask %016llx %016llx)",
+                      thr_id, thr_id % num_cpus,
+	       	      i128_hi64( i128_neg1 << (thr_id % num_cpus) ),
+		      i128_lo64( i128_neg1 << (thr_id % num_cpus) ) );
+         affine_to_cpu_mask( thr_id,
+                             (uint128_t)1LL << (thr_id % num_cpus) );
+
+       }
+#else
+       if ( (opt_affinity == -1LL) && opt_n_threads > 1 ) 
+       {
         if (opt_debug)
            applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
-                   thr_id, thr_id % num_cpus, ( 1ULL << (thr_id % num_cpus) ) );
-#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
-         affine_to_cpu_mask( thr_id,
-                             (unsigned __int128)1LL << (thr_id % num_cpus) );
-#else
+                thr_id, thr_id % num_cpus, L << (thr_id % num_cpus)) ;
         affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
+       }
 #endif
-      }
-      else if (opt_affinity != -1)
+      else 
      {
+#if AFFINITY_USES_UINT128
         if (opt_debug)
-             applog( LOG_DEBUG, "Binding thread %d to cpu mask %x",
-                                 thr_id, opt_affinity);
-         affine_to_cpu_mask( thr_id, opt_affinity );
+             applog( LOG_DEBUG,
+                      "Binding thread %d to cpu mask %016llx %016llx",
+                      thr_id, i128_hi64( i128_neg1 << (thr_id % num_cpus) ), 
+                              i128_lo64( i128_neg1 << (thr_id % num_cpus) ) );
+#else
+         if (opt_debug)
+             applog( LOG_DEBUG,
+                      "Binding thread %d to cpu mask %016llx %016llx",
+                      thr_id, opt_affinity );
+#endif
+      affine_to_cpu_mask( thr_id, opt_affinity );
      }
   }

@@ -2897,13 +2919,21 @@ void parse_arg(int key, char *arg )
 		break;
 	case 1020:
 		p = strstr(arg, "0x");
-		if (p)
-			ul = strtoul(p, NULL, 16);
+		if ( p )
+			ul = strtoull( p, NULL, 16 );
 		else
-			ul = atol(arg);
-		if (ul > (1UL<<num_cpus)-1)
-			ul = -1;
-		opt_affinity = ul;
+			ul = atoll( arg );
+//		if ( ul > ( 1ULL << num_cpus ) - 1ULL )
+//			ul = -1LL;
+#if AFFINITY_USES_UINT128
+// replicate the low 64 bits to make a full 128 bit maski if there are more
+// than 64 CPUs, otherwise zero extend the upper half.
+                opt_affinity = (uint128_t)ul;
+                if ( num_cpus > 64 )
+                   opt_affinity = (opt_affinity << 64 ) | (uint128_t)ul;
+#else
+                   opt_affinity = ul;
+#endif
 		break;
 	case 1021:
 		v = atoi(arg);
@@ -3387,6 +3417,8 @@ int main(int argc, char *argv[])
   if ( num_cpus != opt_n_threads )   
     applog( LOG_INFO,"%u CPU cores available, %u miner threads selected.",
             num_cpus, opt_n_threads );
+
+// To be reviewed
   if ( opt_affinity != -1 )
   {
      if ( num_cpus > 64 )
--- a/interleave.h
+++ b/interleave.h
@@ -43,8 +43,127 @@
 //
 // AVX512: 4x128, 8x64, 16x32
 //
-// Interleaving and deinterleaving is done in blocks of 16*16, 32*32,
-// or 64*64 bytes for SSE2, AVX2 and AVX512 vectors respectively.
+// Interleaving and deinterleaving is done in blocks of 8*8, 16*16, 32*32,
+// or 64*64 bytes for MMX, SSE2, AVX2 and AVX512 vectors respectively.
+
+//////////////////////////////////////////////////////
+// 
+//          MMX 64 bit vectors
+
+#define mm64_put_32( s0, s1 ) \
+  _mm_set_pi32( *((const uint32_t*)(s1)), *((const uint32_t*)(s0)) )
+
+#define mm64_get_32( s, i0, i1 ) \
+  _mm_set_pi32( ((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
+
+// 1 MMX block, 8 bytes * 2 lanes
+static inline void mm64_interleave_2x32( void *d, const void *s0,
+                                         const void *s1, int len )
+{
+  casti_m64( d, 0 ) = mm64_put_32( s0    , s1     );
+  casti_m64( d, 1 ) = mm64_put_32( s0+  4, s1+  4 );
+  casti_m64( d, 2 ) = mm64_put_32( s0+  8, s1+  8 );
+  casti_m64( d, 3 ) = mm64_put_32( s0+ 12, s1+ 12 );
+  casti_m64( d, 4 ) = mm64_put_32( s0+ 16, s1+ 16 );
+  casti_m64( d, 5 ) = mm64_put_32( s0+ 20, s1+ 20 );
+  casti_m64( d, 6 ) = mm64_put_32( s0+ 24, s1+ 24 );
+  casti_m64( d, 7 ) = mm64_put_32( s0+ 28, s1+ 28 );
+
+  if ( len <= 256 ) return;
+
+  casti_m64( d, 8 ) = mm64_put_32( s0+ 32, s1+ 32 );
+  casti_m64( d, 9 ) = mm64_put_32( s0+ 36, s1+ 36 );
+  casti_m64( d,10 ) = mm64_put_32( s0+ 40, s1+ 40 );
+  casti_m64( d,11 ) = mm64_put_32( s0+ 44, s1+ 44 );
+  casti_m64( d,12 ) = mm64_put_32( s0+ 48, s1+ 48 );
+  casti_m64( d,13 ) = mm64_put_32( s0+ 52, s1+ 52 );
+  casti_m64( d,14 ) = mm64_put_32( s0+ 56, s1+ 56 );
+  casti_m64( d,15 ) = mm64_put_32( s0+ 60, s1+ 60 );
+
+  if ( len <= 512 ) return;
+
+  casti_m64( d,16 ) = mm64_put_32( s0+ 64, s1+ 64 );
+  casti_m64( d,17 ) = mm64_put_32( s0+ 68, s1+ 68 );
+  casti_m64( d,18 ) = mm64_put_32( s0+ 72, s1+ 72 );
+  casti_m64( d,19 ) = mm64_put_32( s0+ 76, s1+ 76 );
+
+  if ( len <= 640 ) return;
+
+  casti_m64( d,20 ) = mm64_put_32( s0+ 80, s1+ 80 );
+  casti_m64( d,21 ) = mm64_put_32( s0+ 84, s1+ 84 );
+  casti_m64( d,22 ) = mm64_put_32( s0+ 88, s1+ 88 );
+  casti_m64( d,23 ) = mm64_put_32( s0+ 92, s1+ 92 );
+  casti_m64( d,24 ) = mm64_put_32( s0+ 96, s1+ 96 );
+  casti_m64( d,25 ) = mm64_put_32( s0+100, s1+100 );
+  casti_m64( d,26 ) = mm64_put_32( s0+104, s1+104 );
+  casti_m64( d,27 ) = mm64_put_32( s0+108, s1+108 );
+  casti_m64( d,28 ) = mm64_put_32( s0+112, s1+112 );
+  casti_m64( d,29 ) = mm64_put_32( s0+116, s1+116 );
+  casti_m64( d,30 ) = mm64_put_32( s0+120, s1+120 );
+  casti_m64( d,31 ) = mm64_put_32( s0+124, s1+124 );
+}
+
+static inline void mm64_deinterleave_2x32( void *d00, void *d01,
+                                    const int n, const void *s, int len )
+{
+   casti_m64( d00,0 ) = mm64_get_32( s,  0,  2 );
+   casti_m64( d01,0 ) = mm64_get_32( s,  1,  3 );
+   casti_m64( d00,1 ) = mm64_get_32( s,  4,  6 );
+   casti_m64( d01,1 ) = mm64_get_32( s,  5,  7 );
+   casti_m64( d00,2 ) = mm64_get_32( s,  8, 10 );
+   casti_m64( d01,2 ) = mm64_get_32( s,  9, 11 );
+   casti_m64( d00,3 ) = mm64_get_32( s, 12, 14 );
+   casti_m64( d01,3 ) = mm64_get_32( s, 13, 15 );
+
+   if ( len <= 256 ) return;
+
+   casti_m64( d00,4 ) = mm64_get_32( s, 16, 18 );
+   casti_m64( d01,4 ) = mm64_get_32( s, 17, 19 );
+   casti_m64( d00,5 ) = mm64_get_32( s, 20, 22 );
+   casti_m64( d01,5 ) = mm64_get_32( s, 21, 23 );
+   casti_m64( d00,6 ) = mm64_get_32( s, 24, 26 );
+   casti_m64( d01,6 ) = mm64_get_32( s, 25, 27 );
+   casti_m64( d00,7 ) = mm64_get_32( s, 28, 30 );
+   casti_m64( d01,7 ) = mm64_get_32( s, 29, 31 );
+
+   if ( len <= 512 ) return;
+
+   casti_m64( d00,8 ) = mm64_get_32( s, 32, 34 );
+   casti_m64( d01,8 ) = mm64_get_32( s, 33, 35 );
+   casti_m64( d00,9 ) = mm64_get_32( s, 36, 38 );
+   casti_m64( d01,9 ) = mm64_get_32( s, 37, 39 );
+
+   if ( len <= 640 ) return;
+
+   casti_m64( d00,10 ) = mm64_get_32( s, 40, 42 );
+   casti_m64( d01,10 ) = mm64_get_32( s, 41, 43 );
+   casti_m64( d00,11 ) = mm64_get_32( s, 44, 46 );
+   casti_m64( d01,11 ) = mm64_get_32( s, 45, 47 );
+   casti_m64( d00,12 ) = mm64_get_32( s, 48, 50 );
+   casti_m64( d01,12 ) = mm64_get_32( s, 49, 51 );
+   casti_m64( d00,13 ) = mm64_get_32( s, 52, 54 );
+   casti_m64( d01,13 ) = mm64_get_32( s, 53, 55 );
+   casti_m64( d00,14 ) = mm64_get_32( s, 56, 58 );
+   casti_m64( d01,14 ) = mm64_get_32( s, 57, 59 );
+   casti_m64( d00,15 ) = mm64_get_32( s, 60, 62 );
+   casti_m64( d01,15 ) = mm64_get_32( s, 61, 63 );
+}
+
+static inline void mm64_extract_lane_2x32( void *d, const void *s,
+                                         const int lane, const int bit_len )
+{
+  casti_m64( d, 0 ) = mm64_get_32( s, lane   , lane+ 4 );
+  casti_m64( d, 1 ) = mm64_get_32( s, lane+ 8, lane+12 );
+  casti_m64( d, 2 ) = mm64_get_32( s, lane+16, lane+20 );
+  casti_m64( d, 3 ) = mm64_get_32( s, lane+24, lane+28 );
+
+  if ( bit_len <= 256 ) return;
+  casti_m64( d, 4 ) = mm64_get_32( s, lane+32, lane+36 );
+  casti_m64( d, 5 ) = mm64_get_32( s, lane+40, lane+44 );
+  casti_m64( d, 6 ) = mm64_get_32( s, lane+48, lane+52 );
+  casti_m64( d, 7 ) = mm64_get_32( s, lane+56, lane+60 );
+  // bit_len == 512
+}


 ///////////////////////////////////////////////////////////////
Author	SHA1	Message	Date
Jay D Dee	1b0a5aadf6	v3.9.2.3	2019-06-05 12:20:04 -04:00
Jay D Dee	0a3c52810e	v3.9.2.2	2019-06-04 17:14:03 -04:00
Jay D Dee	4d4386a374	v3.9.2.1	2019-06-04 16:56:44 -04:00