This commit is contained in:
Jay D Dee
2019-06-05 12:20:04 -04:00
parent 0a3c52810e
commit 1b0a5aadf6
14 changed files with 419 additions and 148 deletions

View File

@@ -163,6 +163,7 @@ cpuminer_SOURCES = \
algo/sha/sph_sha2.c \ algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \ algo/sha/sph_sha2big.c \
algo/sha/sha2-hash-4way.c \ algo/sha/sha2-hash-4way.c \
algo/sha/sha256_hash_11way.c \
algo/sha/sha2.c \ algo/sha/sha2.c \
algo/sha/sha256t-gate.c \ algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \ algo/sha/sha256t-4way.c \

View File

@@ -38,7 +38,13 @@ supported.
Change Log Change Log
---------- ----------
v3.9.2.1 v3.9.2.3
Another cpu-affinity fix.
Disabled test code that fails to compile on some CPUs with limited
AVX512 capabilities.
v3.9.2.2
Fixed some day one cpu-affinity issues. Fixed some day one cpu-affinity issues.

View File

@@ -345,9 +345,9 @@ const char* const algo_alias_map[][2] =
{ NULL, NULL } { NULL, NULL }
}; };
// if arg is a valid alias for a known algo it is updated with the proper name. // if arg is a valid alias for a known algo it is updated with the proper
// No validation of the algo or alias is done, It is the responsinility of the // name. No validation of the algo or alias is done; it is the responsibility
// calling function to validate the algo after return. // of the calling function to validate the algo after return.
void get_algo_alias( char** algo_or_alias ) void get_algo_alias( char** algo_or_alias )
{ {
int i; int i;
@@ -362,3 +362,21 @@ void get_algo_alias( char** algo_or_alias )
#undef ALIAS #undef ALIAS
#undef PROPER #undef PROPER
bool submit_solution( struct work *work, void *hash,
struct thr_info *thr, int lane )
{
work_set_target_ratio( work, hash );
if ( submit_work( thr, work ) )
{
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr->id, lane );
return true;
}
else
applog( LOG_WARNING, "Failed to submit share." );
return false;
}
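The intended call pattern, as seen in the scanhash routines rewritten later in this commit: quick-mask the lane's top hash word, confirm with fulltest(), store the winning nonce, then hand off to this helper. A caller sketch:

   if ( fulltest( lane_hash, ptarget ) )
   {
      pdata[19] = n + lane;   // nonce that produced this lane's hash
      submit_solution( work, lane_hash, mythr, lane );
   }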

View File

@@ -196,8 +196,9 @@ void four_way_not_tested();
int null_scanhash(); int null_scanhash();
// The one and only, a callback for scanhash. // The one and only, a callback for scanhash.
bool submit_solution( struct work *work, void *hash,
struct thr_info *thr, int lane );
bool submit_work( struct thr_info *thr, const struct work *work_in ); bool submit_work( struct thr_info *thr, const struct work *work_in );
// displays warning // displays warning

View File

@@ -122,12 +122,11 @@ typedef struct {
} sha256_11way_context; } sha256_11way_context;
void sha256_11way_init( sha256_11way_context *ctx ); void sha256_11way_init( sha256_11way_context *ctx );
void sha256_11way( sha256_11way_context *ctx, const void *datax, void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
void *datay, void *dataz, size_t len ); const void *datay, const void *dataz, size_t len );
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx, void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
void *dstz ); void *dstz );
#endif // __AVX2__
#endif #endif // __SSE2__
#endif #endif // SHA256_4WAY_H__
#endif
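For orientation, "11 way" means each call hashes 8 AVX2 lanes (x), 2 MMX lanes (y) and 1 scalar lane (z) in parallel, so a caller carries three buffers. A minimal sketch of the shapes, matching the scanhash code later in this commit:

   uint32_t datax[20*8] __attribute__ ((aligned (64))); // 8 interleaved headers (x)
   uint32_t datay[20*2] __attribute__ ((aligned (32))); // 2 interleaved headers (y)
   uint32_t dataz[20]   __attribute__ ((aligned (32))); // 1 plain header        (z)
   sha256_11way_context ctx;
   sha256_11way_init( &ctx );
   sha256_11way_update( &ctx, datax, datay, dataz, 64 ); // shared 64-byte midstate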

View File

@@ -9,7 +9,7 @@
// naming convention for variables and macros // naming convention for variables and macros
// VARx: AVX2 8 way 32 bit // VARx: AVX2 8 way 32 bit
// VARy: MMX 2 way 32 bit // VARy: MMX 2 way 32 bit
// VARz: 32 bit integer // VARz: scalar integer 32 bit
static const uint32_t H256[8] = static const uint32_t H256[8] =
@@ -18,7 +18,7 @@ static const uint32_t H256[8] =
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
}; };
static const uont32_t K256[64] = static const uint32_t K256[64] =
{ {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
@@ -57,29 +57,25 @@ static const uont32_t K256[64] =
#define MAJz(X, Y, Z) ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) ) #define MAJz(X, Y, Z) ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) )
#define BSG2_0x(x) \ #define BSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \ _mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,2), mm256_ror_32(x,13) ), mm256_ror_32( x,22) ) mm256_ror_32(x,2), mm256_ror_32(x,13) ), mm256_ror_32(x,22) )
#define BSG2_0y(x) \ #define BSG2_0y(x) \
_mm_xor_si64( _mm_xor_si64( \ _mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,2), mm64_ror_32(x,13) ), mm64_ror_32( x,22) ) mm64_ror_32(x,2), mm64_ror_32(x,13) ), mm64_ror_32(x,22) )
#define BSG2_0z(x) ( ( ror_32(x,2) ^ ror_32(x,13) ) ^ ror_32(x,22) )
#define BSG2_0z(x) ( ror_32(x,2) ^ ror_32(x,13) ^ ror_32(x,22) )
#define BSG2_1x(x) \ #define BSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \ _mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,6), mm256_ror_32(x,11) ), mm256_ror_32( x,25) ) mm256_ror_32(x,6), mm256_ror_32(x,11) ), mm256_ror_32(x,25) )
#define BSG2_1y(x) \ #define BSG2_1y(x) \
_mm_xor_si64( _mm_xor_si64( \ _mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,6), mm64_ror_32(x,11) ), mm64_ror_32( x,25) ) mm64_ror_32(x,6), mm64_ror_32(x,11) ), mm64_ror_32(x,25) )
#define BSG2_1z(x) \
(mm256_ror_32(x,6) ^ mm256_ror_32(x,11) ^ mm256_ror_32( x,25) )
#define BSG2_1z(x) ( ror_32(x,6) ^ ror_32(x,11) ^ ror_32(x,25) )
#define SSG2_0x(x) \ #define SSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \ _mm256_xor_si256( _mm256_xor_si256( \
@@ -87,30 +83,35 @@ static const uont32_t K256[64] =
#define SSG2_0y(x) \ #define SSG2_0y(x) \
_mm_xor_si64( _mm_xor_si64( \ _mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm64_srli_pi32(x,3) ) mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )
#define SSG2_0z(x) ( ror_32(x,7) ^ ror_32(x,18) ^ ((x)>>3) )
#define SSG2_0z(x) (( ror_32(x,7) ^ ror_32(x,18) ) ^ ((x)>>3) )
#define SSG2_1x(x) \ #define SSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \ _mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) ) mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) )
#define SSG2_1y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )
#define SSG2_1z(x) ( ror_32(x,17) ^ ror_32(x,19) ^ ((x)>>10) )
#define SHA2x_MEXP( a, b, c, d ) \ #define SHA2x_MEXP( a, b, c, d ) \
_mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \ _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] ); SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] )
#define SHA2y_MEXP( a, b, c, d ) \ #define SHA2y_MEXP( a, b, c, d ) \
_mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \ _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] ); SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] )
#define SHA2z_MEXP( a, b, c, d ) \ #define SHA2z_MEXP( a, b, c, d ) \
( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] ); ( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] )
#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \ #define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \ Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
Ax, Bx, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \ Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
do { \ do { \
__m256i T1x, T2x; \ __m256i T1x, T2x; \
__m64 T1y, T2y; \ __m64 T1y, T2y; \
@@ -119,22 +120,22 @@ do { \
_mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \ _mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
_mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \ _mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \
T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \ T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
_mm_add_pi32( H, BSG2_1x(Ey) ), CHx(Ey, Fy, Gy) ), \ _mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
_mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \ _mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \
T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \ T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \
T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \ T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
T2y = _mm256_add_epi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \ T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \ \ T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
Dx = _mm256_add_epi32( Dx, T1x ); \ Dx = _mm256_add_epi32( Dx, T1x ); \
Dy = _mm256_add_epi32( Dy, T1y ); \ Dy = _mm_add_pi32( Dy, T1y ); \
Dz = Dz + T1z; \ Dz = Dz + T1z; \
Hx = _mm256_add_epi32( T1x, T2x ); \ Hx = _mm256_add_epi32( T1x, T2x ); \
Hy = _mm256_add_epi32( T1y, T2y ); \ Hy = _mm_add_pi32( T1y, T2y ); \
Hz = T1z + T2z; \ Hz = T1z + T2z; \
} while (0) } while (0)
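As a cross-check of what the step macro computes per lane, here is the scalar (z) form of one round plus one message-expansion step; this is standard FIPS 180-4 SHA-256 written with the macros above, with the 16-word schedule kept as a circular buffer (i = t & 15, as the code uses per 16-round block):

   // schedule refresh, rounds t >= 16:
   Wz[i] = SSG2_1z( Wz[(i+14)&15] ) + Wz[(i+9)&15]
         + SSG2_0z( Wz[(i+1)&15] )  + Wz[i];
   // one round:
   T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[t] + Wz[i];
   T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz );
   Dz = Dz + T1z;     // becomes the new e after register rotation
   Hz = T1z + T2z;    // becomes the new a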
sha256_8way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 *ry[8], void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
uint32_t inz, uint32_t *rz[8] ) uint32_t *inz, uint32_t rz[8] )
{ {
__m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx; __m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
__m256i Wx[16]; __m256i Wx[16];
@@ -169,43 +170,43 @@ sha256_8way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 *ry[8],
Wx[ 6] = mm256_bswap_32( inx[ 6] ); Wx[ 6] = mm256_bswap_32( inx[ 6] );
Wy[ 6] = mm64_bswap_32( iny[ 6] ); Wy[ 6] = mm64_bswap_32( iny[ 6] );
Wz[ 6] = bswap_32( inx[ 6] ); Wz[ 6] = bswap_32( inz[ 6] );
Wx[ 7] = mm256_bswap_32( inx[ 7] ); Wx[ 7] = mm256_bswap_32( inx[ 7] );
Wy[ 7] = mm64_bswap_32( iny[ 7] ); Wy[ 7] = mm64_bswap_32( iny[ 7] );
Wz[ 7] = bswap_32( inx[ 7] ); Wz[ 7] = bswap_32( inz[ 7] );
Wx[ 8] = mm256_bswap_32( inx[ 8] ); Wx[ 8] = mm256_bswap_32( inx[ 8] );
Wy[ 8] = mm64_bswap_32( iny[ 8] ); Wy[ 8] = mm64_bswap_32( iny[ 8] );
Wz[ 8] = bswap_32( inx[ 8] ); Wz[ 8] = bswap_32( inz[ 8] );
Wx[ 9] = mm256_bswap_32( inx[ 9] ); Wx[ 9] = mm256_bswap_32( inx[ 9] );
Wy[ 9] = mm64_bswap_32( iny[ 9] ); Wy[ 9] = mm64_bswap_32( iny[ 9] );
Wz[ 9] = bswap_32( inx[ 9] ); Wz[ 9] = bswap_32( inz[ 9] );
Wx[10] = mm256_bswap_32( inx[10] ); Wx[10] = mm256_bswap_32( inx[10] );
Wy[10] = mm64_bswap_32( iny[10] ); Wy[10] = mm64_bswap_32( iny[10] );
Wz[10] = bswap_32( inx[10] ); Wz[10] = bswap_32( inz[10] );
Wx[11] = mm256_bswap_32( inx[11] ); Wx[11] = mm256_bswap_32( inx[11] );
Wy[11] = mm64_bswap_32( iny[11] ); Wy[11] = mm64_bswap_32( iny[11] );
Wz[11] = bswap_32( inx[11] ); Wz[11] = bswap_32( inz[11] );
Wx[12] = mm256_bswap_32( inx[12] ); Wx[12] = mm256_bswap_32( inx[12] );
Wy[12] = mm64_bswap_32( iny[12] ); Wy[12] = mm64_bswap_32( iny[12] );
Wz[12] = bswap_32( inx[12] ); Wz[12] = bswap_32( inz[12] );
Wx[13] = mm256_bswap_32( inx[13] ); Wx[13] = mm256_bswap_32( inx[13] );
Wy[13] = mm64_bswap_32( iny[13] ); Wy[13] = mm64_bswap_32( iny[13] );
Wz[13] = bswap_32( inx[13] ); Wz[13] = bswap_32( inz[13] );
Wx[14] = mm256_bswap_32( inx[14] ); Wx[14] = mm256_bswap_32( inx[14] );
Wy[14] = mm64_bswap_32( iny[14] ); Wy[14] = mm64_bswap_32( iny[14] );
Wz[14] = bswap_32( inx[14] ); Wz[14] = bswap_32( inz[14] );
Wx[15] = mm256_bswap_32( inx[15] ); Wx[15] = mm256_bswap_32( inx[15] );
Wy[15] = mm64_bswap_32( iny[15] ); Wy[15] = mm64_bswap_32( iny[15] );
Wz[15] = bswap_32( inx[15] ); Wz[15] = bswap_32( inz[15] );
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
@@ -325,52 +326,52 @@ sha256_8way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 *ry[8],
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, By, Cz, Dz, Ez, Fy, Gz, Hz, 0, j ); Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, j );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
HZ, Az, By, Cz, Dz, Ez, Fy, Gz, 1, j ); Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, j );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, HZ, Az, By, Cz, Dz, Ez, Fy, 2, j ); Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, j );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, HZ, Az, By, Cz, Dz, Ez, 3, j ); Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, j );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, HZ, Az, By, Cz, Dz, 4, j ); Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, j );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, HZ, Az, By, Cz, 5, j ); Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, j );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, HZ, Az, By, 6, j ); Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, j );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, HZ, Az, 7, j ); Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, j );
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, By, Cz, Dz, Ez, Fy, Gz, Hz, 8, j ); Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, j );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
HZ, Az, By, Cz, Dz, Ez, Fy, Gz, 9, j ); Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, j );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, HZ, Az, By, Cz, Dz, Ez, Fy, 10, j ); Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, HZ, Az, By, Cz, Dz, Ez, 11, j ); Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, HZ, Az, By, Cz, Dz, 12, j ); Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, HZ, Az, By, Cz, 13, j ); Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, HZ, Az, By, 14, j ); Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, HZ, Az, 15, j ); Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j );
} }
rx[0] = _mm256_add_epi32( rx[0], Ax ); rx[0] = _mm256_add_epi32( rx[0], Ax );
@@ -384,7 +385,7 @@ sha256_8way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 *ry[8],
rz[3] = rz[3]+ Dz; rz[3] = rz[3]+ Dz;
rx[4] = _mm256_add_epi32( rx[4], Ex ); rx[4] = _mm256_add_epi32( rx[4], Ex );
ry[4] = _mm_add_pi32( ry[4], Ey ); ry[4] = _mm_add_pi32( ry[4], Ey );
rz[4] = rz[4], Ez; rz[4] = rz[4]+ Ez;
rx[5] = _mm256_add_epi32( rx[5], Fx ); rx[5] = _mm256_add_epi32( rx[5], Fx );
ry[5] = _mm_add_pi32( ry[5], Fy ); ry[5] = _mm_add_pi32( ry[5], Fy );
rz[5] = rz[5]+ Fz; rz[5] = rz[5]+ Fz;
@@ -397,7 +398,7 @@ sha256_8way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 *ry[8],
} }
void sha256_8way_init( sha256_11way_context *ctx ) void sha256_11way_init( sha256_11way_context *ctx )
{ {
ctx->count_high = ctx->count_low = 0; ctx->count_high = ctx->count_low = 0;
ctx->valx[0] = _mm256_set1_epi32( H256[0] ); ctx->valx[0] = _mm256_set1_epi32( H256[0] );
@@ -416,12 +417,12 @@ void sha256_8way_init( sha256_11way_context *ctx )
ctx->valy[6] = _mm_set1_pi32( H256[6] ); ctx->valy[6] = _mm_set1_pi32( H256[6] );
ctx->valx[7] = _mm256_set1_epi32( H256[7] ); ctx->valx[7] = _mm256_set1_epi32( H256[7] );
ctx->valy[7] = _mm_set1_pi32( H256[7] ); ctx->valy[7] = _mm_set1_pi32( H256[7] );
memscpy( ctx->valz, H256, 32 ); memcpy( ctx->valz, H256, 32 );
} }
void sha256_11way( sha256_11way_context *ctx, const void *datax, void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
const void *datay, const void *dataz, size_t len ) const void *datay, const void *dataz, size_t len )
{ {
__m256i *vdatax = (__m256i*) datax; __m256i *vdatax = (__m256i*) datax;
__m64 *vdatay = (__m64*) datay; __m64 *vdatay = (__m64*) datay;
@@ -440,26 +441,26 @@ void sha256_11way( sha256_11way_context *ctx, const void *datax,
clen = len; clen = len;
memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 ); memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 );
memcpy_64 ( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 ); memcpy_64 ( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
memcpy ( ctx->bufz + ptr, sdataz + ptr, clen ); memcpy ( ctx->bufz + ptr, idataz + ptr, clen );
ptr += clen; ptr += clen;
len -= clen; len -= clen;
if ( ptr == buf_size ) if ( ptr == buf_size )
{ {
sha256_11way_round( ctx->bufx, ctx->valx, sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy, ctx->bufy, ctx->valy,
ctx->bufz, ctx->valzx, ); ctx->bufz, ctx->valz );
ptr = 0; ptr = 0;
} }
clow = sc->count_low; clow = ctx->count_low;
clow2 = clow + clen; clow2 = clow + clen;
sc->count_low = clow2; ctx->count_low = clow2;
if ( clow2 < clow ) if ( clow2 < clow )
sc->count_high++; ctx->count_high++;
} }
} }
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void dsty, void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
void *dstz) void *dstz)
{ {
unsigned ptr, u; unsigned ptr, u;
@@ -487,9 +488,9 @@ void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void dsty,
} }
else else
{ {
memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 ); memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 );
memset_zero_64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 ); memset_zero_64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 );
memset( ctx->bufz + (ptr>>2), 0 (pad - ptr) >> 2 ); memset( ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 );
} }
low = ctx->count_low; low = ctx->count_low;
@@ -511,9 +512,9 @@ void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void dsty,
ctx->bufz[ ( pad+4 ) >> 2 ] = ctx->bufz[ ( pad+4 ) >> 2 ] =
bswap_32( low ); bswap_32( low );
sha256_8way_round( ctx->bufx, ctx->valx, sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy, ctx->bufy, ctx->valy,
ctx->bufz, ctx->valz, ); ctx->bufz, ctx->valz );
for ( u = 0; u < 8; u ++ ) for ( u = 0; u < 8; u ++ )
{ {
@@ -523,4 +524,4 @@ void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void dsty,
} }
} }
#endif
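For reference, close() performs standard SHA-256 finalization on all three lane sets at once; the two words stored just before the final round are the message length in bits, big endian. Assuming the same byte-count bookkeeping as the 4-way code in this file:

   uint32_t low  = ctx->count_low;                     // bytes processed
   uint32_t high = ( ctx->count_high << 3 ) | ( low >> 29 );
   low <<= 3;                                          // bytes -> bits
   ctx->bufz[   pad       >> 2 ] = bswap_32( high );   // big-endian length,
   ctx->bufz[ ( pad + 4 ) >> 2 ] = bswap_32( low );    // as stored above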

View File

@@ -5,6 +5,136 @@
#include <stdio.h> #include <stdio.h>
#include "sha2-hash-4way.h" #include "sha2-hash-4way.h"
#if defined(SHA256T_11WAY)
static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64)));
void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
const void *inpy, const void *inpz )
{
uint32_t hashx[8*8] __attribute__ ((aligned (64)));
uint32_t hashy[8*2] __attribute__ ((aligned (64)));
uint32_t hashz[8] __attribute__ ((aligned (64)));
sha256_11way_context ctx;
const void *inpx64 = inpx+(64<<3);
const void *inpy64 = inpy+(64<<1);
const void *inpz64 = inpz+ 64;
memcpy( &ctx, &sha256_ctx11, sizeof ctx );
sha256_11way_update( &ctx, inpx64, inpy64, inpz64, 16 );
sha256_11way_close( &ctx, hashx, hashy, hashz );
sha256_11way_init( &ctx );
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
sha256_11way_close( &ctx, hashx, hashy, hashz );
sha256_11way_init( &ctx );
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
sha256_11way_close( &ctx, outx, outy, outz );
}
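Structurally this is triple SHA-256 over the 80-byte header, with the first 64 bytes pre-hashed once into sha256_ctx11 as a midstate. A scalar equivalent, using a hypothetical one-shot helper sha256_full( out, data, len ) for clarity:

   uint32_t h1[8], h2[8];
   sha256_full( h1, header, 80 );   // pass 1: the block header
   sha256_full( h2, h1, 32 );       // pass 2: 32-byte digest
   sha256_full( out, h2, 32 );      // pass 3
   // The 11-way version does this for 11 nonces at once, re-hashing only
   // the last 16 header bytes per nonce since bytes 0..63 never change.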
int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t datax[20*8] __attribute__ ((aligned (64)));
uint32_t datay[20*2] __attribute__ ((aligned (32)));
uint32_t dataz[20] __attribute__ ((aligned (32)));
uint32_t hashx[8*8] __attribute__ ((aligned (32)));
uint32_t hashy[8*2] __attribute__ ((aligned (32)));
uint32_t hashz[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncex = (__m256i*) datax + 19;
__m64 *noncey = (__m64*) datay + 19;
uint32_t *noncez = (uint32_t*)dataz + 19;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
int i;
const uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
// Use dataz (scalar) to stage bswapped data for the vectors.
casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_8x32( datax, dataz, dataz, dataz, dataz,
dataz, dataz, dataz, dataz, 640 );
mm64_interleave_2x32( datay, dataz, dataz, 640 );
sha256_11way_init( &sha256_ctx11 );
sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncex = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
*noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
*noncez = bswap_32( n+10 );
pdata[19] = n;
sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );
if ( opt_benchmark ) { n += 11; continue; }
hash7 = &(hashx[7<<3]);
for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
{
// deinterleave hash for lane
mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + i;
submit_solution( work, lane_hash, mythr, i );
}
}
hash7 = &(hashy[7<<1]);
for ( i = 0; i < 2; i++ ) if ( !( hash7[ i ] & mask ) )
{
mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + 8 + i;
submit_solution( work, lane_hash, mythr, i+8 );
}
}
if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
{
pdata[19] = n+10;
submit_solution( work, hashz, mythr, 10 );
}
n += 11;
} while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
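Worth spelling out: the htmax/masks pair above is a cheap pre-filter. When the target's high word Htarg is at most htmax[m], a lane can only win if its hash's high word has no bit set in masks[m], so the expensive fulltest() runs rarely. For example:

   // Htarg = 0xFF selects m = 2: htmax[2] = 0xFF, masks[2] = 0xFFFFFF00.
   // Only lanes with ( hash7[lane] & 0xFFFFFF00 ) == 0 reach fulltest(),
   // i.e. roughly one lane hash in 2^24.
   uint32_t mask = masks[m];   // chosen once, before the nonce loop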
#if defined(SHA256T_8WAY) #if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
@@ -29,7 +159,7 @@ void sha256t_8way_hash( void* output, const void* input )
} }
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ) uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t vdata[20*8] __attribute__ ((aligned (64))); uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32))); uint32_t hash[8*8] __attribute__ ((aligned (32)));
@@ -71,39 +201,31 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
do do
{ {
*noncev = mm256_bswap_32( *noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) ); _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
pdata[19] = n;
pdata[19] = n;
sha256t_8way_hash( hash, vdata ); sha256t_8way_hash( hash, vdata );
uint32_t *hash7 = &(hash[7<<3]); uint32_t *hash7 = &(hash[7<<3]);
for ( int lane = 0; lane < 8; lane++ ) for ( int lane = 0; lane < 8; lane++ )
if ( !( hash7[ lane ] & mask ) ) if ( !( hash7[ lane ] & mask ) )
{ {
// deinterleave hash for lane // deinterleave hash for lane
uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64)));
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 ); mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) ) if ( fulltest( lane_hash, ptarget ) )
{ {
pdata[19] = n + lane; pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash ); submit_solution( work, lane_hash, mythr, lane );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
} }
} }
n += 8; n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart ); } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break; break;
} }
*hashes_done = n - first_nonce + 1; *hashes_done = n - first_nonce + 1;
return 0; return 0;
} }
@@ -189,22 +311,14 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( lane_hash, ptarget ) ) if ( fulltest( lane_hash, ptarget ) )
{ {
pdata[19] = n + lane; pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash ); submit_solution( work, lane_hash, mythr, lane );
if ( submit_work( mythr, work ) ) }
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
} }
n += 4; n += 4;
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart ); } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
break; break;
} }
*hashes_done = n - first_nonce + 1; *hashes_done = n - first_nonce + 1;
return 0; return 0;
} }

View File

@@ -2,7 +2,11 @@
bool register_sha256t_algo( algo_gate_t* gate ) bool register_sha256t_algo( algo_gate_t* gate )
{ {
#if defined(SHA256T_8WAY) #if defined(SHA256T_11WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_11way;
gate->hash = (void*)&sha256t_11way_hash;
#elif defined(SHA256T_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_8way; gate->scanhash = (void*)&scanhash_sha256t_8way;
gate->hash = (void*)&sha256t_8way_hash; gate->hash = (void*)&sha256t_8way_hash;

View File

@@ -6,18 +6,29 @@
// Override multi way on ryzen, SHA is better. // Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_) #if !defined(RYZEN_)
//#if defined(__SSE4_2__)
#if defined(__SSE2__) #if defined(__SSE2__)
#define SHA256T_4WAY #define SHA256T_4WAY
#endif #endif
#if defined(__AVX2__) #if defined(__AVX2__)
#define SHA256T_8WAY #define SHA256T_8WAY
// #define SHA256T_11WAY
#endif #endif
#endif #endif
bool register_sha256t_algo( algo_gate_t* gate ); bool register_sha256t_algo( algo_gate_t* gate );
bool register_sha256q_algo( algo_gate_t* gate ); bool register_sha256q_algo( algo_gate_t* gate );
#if defined(SHA256T_11WAY)
void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
const void *inpy, const void *inpz );
int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
//void sha256q_8way_hash( void *output, const void *input );
//int scanhash_sha256q_11way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256T_8WAY) #if defined(SHA256T_8WAY)
void sha256t_8way_hash( void *output, const void *input ); void sha256t_8way_hash( void *output, const void *input );
@@ -26,8 +37,9 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
void sha256q_8way_hash( void *output, const void *input ); void sha256q_8way_hash( void *output, const void *input );
int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ); uint64_t *hashes_done, struct thr_info *mythr );
#endif
#elif defined(SHA256T_4WAY) #if defined(SHA256T_4WAY)
void sha256t_4way_hash( void *output, const void *input ); void sha256t_4way_hash( void *output, const void *input );
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -35,7 +47,7 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
void sha256q_4way_hash( void *output, const void *input ); void sha256q_4way_hash( void *output, const void *input );
int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ); uint64_t *hashes_done, struct thr_info *mythr );
#else #endif
void sha256t_hash( void *output, const void *input ); void sha256t_hash( void *output, const void *input );
int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
@@ -46,5 +58,3 @@ int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,
#endif #endif
#endif

View File

@@ -101,14 +101,14 @@
// First some integer stuff that mirrors the SIMD utilities // First some integer stuff that mirrors the SIMD utilities
#define ROR_64( x, c ) ((x)>>(c) | ((x)<<(64-(c)))) #define ror_64( x, c ) (((x)>>(c)) | ((x)<<(64-(c))))
#define ROL_64( x, c ) ((x)<<(c) | ((x)>>(64-(c)))) #define rol_64( x, c ) (((x)<<(c)) | ((x)>>(64-(c))))
#define ROR_32( x, c ) ((x)>>(c) | ((x)<<(32-(c)))) #define ror_32( x, c ) (((x)>>(c)) | ((x)<<(32-(c))))
#define ROL_32( x, c ) ((x)<<(c) | ((x)>>(32-(c)))) #define rol_32( x, c ) (((x)<<(c)) | ((x)>>(32-(c))))
#define BSWAP_64( x ) __builtin_bswap64(x) #define bswap_64( x ) __builtin_bswap64(x)
#define BSWAP_32( x ) __builtin_bswap32(x) #define bswap_32( x ) __builtin_bswap32(x)
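A quick sanity check of the renamed scalar helpers, with values worked out by hand:

   assert( ror_32( 0x80000001u, 1 ) == 0xC0000000u ); // low bit wraps to bit 31
   assert( rol_32( 0xC0000000u, 1 ) == 0x80000001u ); // rol undoes ror
   assert( bswap_32( 0x11223344u ) == 0x44332211u );  // byte order reversed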
// __int128 // 128 bit integer
typedef unsigned __int128 uint128_t; typedef unsigned __int128 uint128_t;
@@ -123,11 +123,7 @@ typedef unsigned __int128 uint128_t;
// There are rumours MMX will be removed. Although casting with int64 // There are rumours MMX will be removed. Although casting with int64
// works, there is likely some overhead to move the data to an MMX register // works, there is likely some overhead to move the data to an MMX register
// and back. // and back.
// Byte swap and rotation may be more efficient using an MMX shuffle // Byte swap and rotation may be more efficient using an MMX shuffle.
// except that it won't compile due to a "target specific option mismatch"
// with "inlining failed in call to always inline". MMX was designed for
// 32 bit CPUs and might not work on 64 bit CPUs where the CPU has full
// support for 64 bit operations without vectoring.
// //
// Universal 64 bit overlay // Universal 64 bit overlay
union _m64v union _m64v
@@ -1939,7 +1935,7 @@ do { \
#endif // AVX512F #endif // AVX512F
#if 1 #if 0
////////////////////////////////////////////////// //////////////////////////////////////////////////
// //
// Compile test. // Compile test.

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.2.2. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.2.3.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.9.2.2' PACKAGE_VERSION='3.9.2.3'
PACKAGE_STRING='cpuminer-opt 3.9.2.2' PACKAGE_STRING='cpuminer-opt 3.9.2.3'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.9.2.2 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.9.2.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.9.2.2:";; short | recursive ) echo "Configuration of cpuminer-opt 3.9.2.3:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.9.2.2 cpuminer-opt configure 3.9.2.3
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.9.2.2, which was It was created by cpuminer-opt $as_me 3.9.2.3, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.9.2.2' VERSION='3.9.2.3'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.9.2.2, which was This file was extended by cpuminer-opt $as_me 3.9.2.3, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.9.2.2 cpuminer-opt config.status 3.9.2.3
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.9.2.2]) AC_INIT([cpuminer-opt], [3.9.2.3])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -2926,11 +2926,13 @@ void parse_arg(int key, char *arg )
// if ( ul > ( 1ULL << num_cpus ) - 1ULL ) // if ( ul > ( 1ULL << num_cpus ) - 1ULL )
// ul = -1LL; // ul = -1LL;
#if AFFINITY_USES_UINT128 #if AFFINITY_USES_UINT128
// replicate the low 64 bits to make a full 128 bit mask // replicate the low 64 bits to make a full 128 bit mask if there are more
opt_affinity = (uint128_t)(ul); // than 64 CPUs, otherwise zero extend the upper half.
opt_affinity = (opt_affinity << 64 ) | (uint128_t)ul; opt_affinity = (uint128_t)ul;
if ( num_cpus > 64 )
opt_affinity = (opt_affinity << 64 ) | (uint128_t)ul;
#else #else
opt_affinity = ul; opt_affinity = ul;
#endif #endif
break; break;
case 1021: case 1021:

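An illustration of the corrected mask handling, with hypothetical values (96 CPUs, --cpu-affinity 0x5555555555555555); num_cpus and opt_affinity are the variables used above:

   uint128_t mask = (uint128_t)0x5555555555555555ULL;
   if ( num_cpus > 64 )          // widen only when CPUs 64+ actually exist
      mask = ( mask << 64 ) | (uint128_t)0x5555555555555555ULL;
   // with num_cpus <= 64 the upper half now stays zero instead of
   // duplicating the low word, which is the bug being fixed here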
View File

@@ -43,8 +43,127 @@
// //
// AVX512: 4x128, 8x64, 16x32 // AVX512: 4x128, 8x64, 16x32
// //
// Interleaving and deinterleaving is done in blocks of 16*16, 32*32, // Interleaving and deinterleaving is done in blocks of 8*8, 16*16, 32*32,
// or 64*64 bytes for SSE2, AVX2 and AVX512 vectors respectively. // or 64*64 bytes for MMX, SSE2, AVX2 and AVX512 vectors respectively.
//////////////////////////////////////////////////////
//
// MMX 64 bit vectors
#define mm64_put_32( s0, s1 ) \
_mm_set_pi32( *((const uint32_t*)(s1)), *((const uint32_t*)(s0)) )
#define mm64_get_32( s, i0, i1 ) \
_mm_set_pi32( ((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
// 1 MMX block, 8 bytes * 2 lanes
static inline void mm64_interleave_2x32( void *d, const void *s0,
const void *s1, int len )
{
casti_m64( d, 0 ) = mm64_put_32( s0 , s1 );
casti_m64( d, 1 ) = mm64_put_32( s0+ 4, s1+ 4 );
casti_m64( d, 2 ) = mm64_put_32( s0+ 8, s1+ 8 );
casti_m64( d, 3 ) = mm64_put_32( s0+ 12, s1+ 12 );
casti_m64( d, 4 ) = mm64_put_32( s0+ 16, s1+ 16 );
casti_m64( d, 5 ) = mm64_put_32( s0+ 20, s1+ 20 );
casti_m64( d, 6 ) = mm64_put_32( s0+ 24, s1+ 24 );
casti_m64( d, 7 ) = mm64_put_32( s0+ 28, s1+ 28 );
if ( len <= 256 ) return;
casti_m64( d, 8 ) = mm64_put_32( s0+ 32, s1+ 32 );
casti_m64( d, 9 ) = mm64_put_32( s0+ 36, s1+ 36 );
casti_m64( d,10 ) = mm64_put_32( s0+ 40, s1+ 40 );
casti_m64( d,11 ) = mm64_put_32( s0+ 44, s1+ 44 );
casti_m64( d,12 ) = mm64_put_32( s0+ 48, s1+ 48 );
casti_m64( d,13 ) = mm64_put_32( s0+ 52, s1+ 52 );
casti_m64( d,14 ) = mm64_put_32( s0+ 56, s1+ 56 );
casti_m64( d,15 ) = mm64_put_32( s0+ 60, s1+ 60 );
if ( len <= 512 ) return;
casti_m64( d,16 ) = mm64_put_32( s0+ 64, s1+ 64 );
casti_m64( d,17 ) = mm64_put_32( s0+ 68, s1+ 68 );
casti_m64( d,18 ) = mm64_put_32( s0+ 72, s1+ 72 );
casti_m64( d,19 ) = mm64_put_32( s0+ 76, s1+ 76 );
if ( len <= 640 ) return;
casti_m64( d,20 ) = mm64_put_32( s0+ 80, s1+ 80 );
casti_m64( d,21 ) = mm64_put_32( s0+ 84, s1+ 84 );
casti_m64( d,22 ) = mm64_put_32( s0+ 88, s1+ 88 );
casti_m64( d,23 ) = mm64_put_32( s0+ 92, s1+ 92 );
casti_m64( d,24 ) = mm64_put_32( s0+ 96, s1+ 96 );
casti_m64( d,25 ) = mm64_put_32( s0+100, s1+100 );
casti_m64( d,26 ) = mm64_put_32( s0+104, s1+104 );
casti_m64( d,27 ) = mm64_put_32( s0+108, s1+108 );
casti_m64( d,28 ) = mm64_put_32( s0+112, s1+112 );
casti_m64( d,29 ) = mm64_put_32( s0+116, s1+116 );
casti_m64( d,30 ) = mm64_put_32( s0+120, s1+120 );
casti_m64( d,31 ) = mm64_put_32( s0+124, s1+124 );
}
static inline void mm64_deinterleave_2x32( void *d00, void *d01,
const int n, const void *s, int len )
{
casti_m64( d00,0 ) = mm64_get_32( s, 0, 2 );
casti_m64( d01,0 ) = mm64_get_32( s, 1, 3 );
casti_m64( d00,1 ) = mm64_get_32( s, 4, 6 );
casti_m64( d01,1 ) = mm64_get_32( s, 5, 7 );
casti_m64( d00,2 ) = mm64_get_32( s, 8, 10 );
casti_m64( d01,2 ) = mm64_get_32( s, 9, 11 );
casti_m64( d00,3 ) = mm64_get_32( s, 12, 14 );
casti_m64( d01,3 ) = mm64_get_32( s, 13, 15 );
if ( len <= 256 ) return;
casti_m64( d00,4 ) = mm64_get_32( s, 16, 18 );
casti_m64( d01,4 ) = mm64_get_32( s, 17, 19 );
casti_m64( d00,5 ) = mm64_get_32( s, 20, 22 );
casti_m64( d01,5 ) = mm64_get_32( s, 21, 23 );
casti_m64( d00,6 ) = mm64_get_32( s, 24, 26 );
casti_m64( d01,6 ) = mm64_get_32( s, 25, 27 );
casti_m64( d00,7 ) = mm64_get_32( s, 28, 30 );
casti_m64( d01,7 ) = mm64_get_32( s, 29, 31 );
if ( len <= 512 ) return;
casti_m64( d00,8 ) = mm64_get_32( s, 32, 34 );
casti_m64( d01,8 ) = mm64_get_32( s, 33, 35 );
casti_m64( d00,9 ) = mm64_get_32( s, 36, 38 );
casti_m64( d01,9 ) = mm64_get_32( s, 37, 39 );
if ( len <= 640 ) return;
casti_m64( d00,10 ) = mm64_get_32( s, 40, 42 );
casti_m64( d01,10 ) = mm64_get_32( s, 41, 43 );
casti_m64( d00,11 ) = mm64_get_32( s, 44, 46 );
casti_m64( d01,11 ) = mm64_get_32( s, 45, 47 );
casti_m64( d00,12 ) = mm64_get_32( s, 48, 50 );
casti_m64( d01,12 ) = mm64_get_32( s, 49, 51 );
casti_m64( d00,13 ) = mm64_get_32( s, 52, 54 );
casti_m64( d01,13 ) = mm64_get_32( s, 53, 55 );
casti_m64( d00,14 ) = mm64_get_32( s, 56, 58 );
casti_m64( d01,14 ) = mm64_get_32( s, 57, 59 );
casti_m64( d00,15 ) = mm64_get_32( s, 60, 62 );
casti_m64( d01,15 ) = mm64_get_32( s, 61, 63 );
}
static inline void mm64_extract_lane_2x32( void *d, const void *s,
const int lane, const int bit_len )
{
// word k of lane L sits at 32-bit index 2k+L in the interleaved buffer
casti_m64( d, 0 ) = mm64_get_32( s, lane   , lane+ 2 );
casti_m64( d, 1 ) = mm64_get_32( s, lane+ 4, lane+ 6 );
casti_m64( d, 2 ) = mm64_get_32( s, lane+ 8, lane+10 );
casti_m64( d, 3 ) = mm64_get_32( s, lane+12, lane+14 );
if ( bit_len <= 256 ) return;
casti_m64( d, 4 ) = mm64_get_32( s, lane+16, lane+18 );
casti_m64( d, 5 ) = mm64_get_32( s, lane+20, lane+22 );
casti_m64( d, 6 ) = mm64_get_32( s, lane+24, lane+26 );
casti_m64( d, 7 ) = mm64_get_32( s, lane+28, lane+30 );
// bit_len == 512
}
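A usage sketch of the 2x32 helpers above. Note that len and bit_len count bits per lane (640 bits = one 80-byte block header), matching the calls in scanhash_sha256t_11way earlier in this commit:

   uint32_t lane0[20], lane1[20];                       // two 80-byte headers
   uint32_t vdata[20*2] __attribute__ ((aligned (32))); // interleaved 2x32
   uint32_t hash1[8]    __attribute__ ((aligned (32))); // one lane's result
   mm64_interleave_2x32( vdata, lane0, lane1, 640 );    // pack both headers
   // ... run a 2-way (__m64) hash over vdata ...
   mm64_extract_lane_2x32( hash1, vdata, 1, 256 );      // unpack lane 1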
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////