mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.9.2.1
This commit is contained in:
@@ -38,6 +38,10 @@ supported.
|
|||||||
Change Log
|
Change Log
|
||||||
----------
|
----------
|
||||||
|
|
||||||
|
v3.9.2.1
|
||||||
|
|
||||||
|
Fixed some day one cpu-affinity issues.
|
||||||
|
|
||||||
v3.9.2
|
v3.9.2
|
||||||
|
|
||||||
Added sha256q algo.
|
Added sha256q algo.
|
||||||
|
@@ -1,6 +1,39 @@
|
|||||||
#include "lyra2-gate.h"
|
#include "lyra2-gate.h"
|
||||||
|
|
||||||
|
|
||||||
|
// huge pages
|
||||||
|
//
|
||||||
|
// Use MAP_PRIVATE instead
|
||||||
|
// In register algo:
|
||||||
|
// replace thread safe whole matrix with a char**
|
||||||
|
// alloc huge pages matrixsize * threads
|
||||||
|
// make a pointer for each thread into the allocation, creating an
|
||||||
|
// array[thread][matrix].
|
||||||
|
// Each thread can create its own matrix pointer:
|
||||||
|
// my_matrix = the matrix + ( thread_id * matrix_size )
|
||||||
|
//
|
||||||
|
// Compiler version check?
|
||||||
|
// Fallback?
|
||||||
|
//
|
||||||
|
// create a generic utility to map & unmap huge pages.
|
||||||
|
// ptr = malloc_huge( size );
|
||||||
|
// Yespower wrapper checks for 64 byte alignment, seems unnecessary as
|
||||||
|
// it should be aligned to the page boundary. It may be desirable to
|
||||||
|
// have the matrix size rounded up if necessary to something bigger
|
||||||
|
// than 64 byte, say 4 kbytes a small page size.
|
||||||
|
|
||||||
|
// Define some constants for individual parameters and matrix size for
|
||||||
|
// each algo. Use the parameter constants where appropriate.
|
||||||
|
// Convert algos that don't yet do so to use dynamic allocation.
|
||||||
|
// Alloc huge pages globally. If ok each thread will create a pointer to
|
||||||
|
// its chunk. If fail each thread will use _mm_alloc for itself.
|
||||||
|
|
||||||
|
// Lyra2REv3 matrix geometry.
#define LYRA2REV3_NROWS 4
#define LYRA2REV3_NCOLS 4

// Whole-matrix size in bytes, expressed through the geometry constants.
// For the 4 x 4 geometry above this equals (BLOCK_LEN_BYTES) << 4.
#define LYRA2REV3_MATRIX_SIZE \
   ( (BLOCK_LEN_BYTES) * (LYRA2REV3_NROWS) * (LYRA2REV3_NCOLS) )
|
||||||
|
|
||||||
__thread uint64_t* l2v3_wholeMatrix;
|
__thread uint64_t* l2v3_wholeMatrix;
|
||||||
|
|
||||||
bool lyra2rev3_thread_init()
|
bool lyra2rev3_thread_init()
|
||||||
|
@@ -61,6 +61,26 @@ void sha256_4way_init( sha256_4way_context *sc );
|
|||||||
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
|
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
|
||||||
void sha256_4way_close( sha256_4way_context *sc, void *dst );
|
void sha256_4way_close( sha256_4way_context *sc, void *dst );
|
||||||
|
|
||||||
|
/*
|
||||||
|
// SHA-256 7 way hybrid
|
||||||
|
// Combines SSE, MMX and scalar data to do 4 + 2 + 1 parallel.
|
||||||
|
typedef struct {
|
||||||
|
__m128i bufx[64>>2];
|
||||||
|
__m128i valx[8];
|
||||||
|
__m64 bufy[64>>2];
|
||||||
|
__m64 valy[8];
|
||||||
|
uint32_t bufz[64>>2];
|
||||||
|
uint32_t valz[8];
|
||||||
|
uint32_t count_high, count_low;
|
||||||
|
} sha256_7way_context;
|
||||||
|
|
||||||
|
void sha256_7way_init( sha256_7way_context *ctx );
|
||||||
|
void sha256_7way( sha256_7way_context *ctx, const void *datax,
|
||||||
|
void *datay, void *dataz, size_t len );
|
||||||
|
void sha256_7way_close( sha256_7way_context *ctx, void *dstx, void *dstyx,
|
||||||
|
void *dstz );
|
||||||
|
*/
|
||||||
|
|
||||||
#if defined (__AVX2__)
|
#if defined (__AVX2__)
|
||||||
|
|
||||||
// SHA-256 8 way
|
// SHA-256 8 way
|
||||||
@@ -89,6 +109,25 @@ void sha512_4way_init( sha512_4way_context *sc);
|
|||||||
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
|
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
|
||||||
void sha512_4way_close( sha512_4way_context *sc, void *dst );
|
void sha512_4way_close( sha512_4way_context *sc, void *dst );
|
||||||
|
|
||||||
|
// SHA-256 11 way hybrid
|
||||||
|
// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
|
||||||
|
typedef struct {
|
||||||
|
__m256i bufx[64>>2];
|
||||||
|
__m256i valx[8];
|
||||||
|
__m64 bufy[64>>2];
|
||||||
|
__m64 valy[8];
|
||||||
|
uint32_t bufz[64>>2];
|
||||||
|
uint32_t valz[8];
|
||||||
|
uint32_t count_high, count_low;
|
||||||
|
} sha256_11way_context;
|
||||||
|
|
||||||
|
void sha256_11way_init( sha256_11way_context *ctx );
|
||||||
|
void sha256_11way( sha256_11way_context *ctx, const void *datax,
|
||||||
|
void *datay, void *dataz, size_t len );
|
||||||
|
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
|
||||||
|
void *dstz );
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
526
algo/sha/sha256_hash_11wway.c
Normal file
526
algo/sha/sha256_hash_11wway.c
Normal file
@@ -0,0 +1,526 @@
|
|||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "sha2-hash-4way.h"
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
|
// naming convention for variables and macros
|
||||||
|
// VARx: AVX2 8 way 32 bit
|
||||||
|
// VARy: MMX 2 way 32 bit
|
||||||
|
// VARz: 32 bit integer
|
||||||
|
|
||||||
|
|
||||||
|
// SHA-256 initial hash value H(0) (FIPS 180-4, section 5.3.3).
static const uint32_t H256[8] =
{
   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};

// SHA-256 round constants K[0..63] (FIPS 180-4, section 4.2.2).
static const uint32_t K256[64] =
{
   0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
   0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
   0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
   0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
   0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
   0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
   0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
   0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
   0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
   0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
   0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
   0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
   0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
   0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
   0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
   0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
|
||||||
|
|
||||||
|
// SHA-256 logical functions (FIPS 180-4, section 4.1.2), one flavour per
// lane type: x = __m256i (8 x 32 bit), y = __m64 (2 x 32 bit),
// z = uint32_t.
//
// The scalar flavours rotate with ROR_32 from avxdefs.h. Every ROR_32 use
// is wrapped in its own parentheses so the result does not depend on how
// that macro parenthesizes its expansion.
// NOTE(review): mm256_ror_32 and mm64_ror_32 are expected to come from the
// project's SIMD utility header - confirm mm64_ror_32 exists.

// Ch(X,Y,Z): take Y where X is set, Z elsewhere; ((Y ^ Z) & X) ^ Z.
#define CHx(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )

#define CHy(X, Y, Z) \
   _mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z )

#define CHz(X, Y, Z) ((( (Y) ^ (Z) ) & (X) ) ^ (Z) )


// Maj(X,Y,Z): bitwise majority; (X & Y) | ((X | Y) & Z).
#define MAJx(X, Y, Z) \
   _mm256_or_si256( _mm256_and_si256( X, Y ), \
                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )

#define MAJy(X, Y, Z) \
   _mm_or_si64( _mm_and_si64( X, Y ), \
                _mm_and_si64( _mm_or_si64( X, Y ), Z ) )

#define MAJz(X, Y, Z) ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) )


// Big sigma 0: ror 2 ^ ror 13 ^ ror 22.
#define BSG2_0x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
        mm256_ror_32(x,2), mm256_ror_32(x,13) ), mm256_ror_32( x,22) )

#define BSG2_0y(x) \
   _mm_xor_si64( _mm_xor_si64( \
        mm64_ror_32(x,2), mm64_ror_32(x,13) ), mm64_ror_32( x,22) )

#define BSG2_0z(x) \
   ( ( ROR_32(x,2) ) ^ ( ROR_32(x,13) ) ^ ( ROR_32(x,22) ) )


// Big sigma 1: ror 6 ^ ror 11 ^ ror 25.
#define BSG2_1x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
        mm256_ror_32(x,6), mm256_ror_32(x,11) ), mm256_ror_32( x,25) )

#define BSG2_1y(x) \
   _mm_xor_si64( _mm_xor_si64( \
        mm64_ror_32(x,6), mm64_ror_32(x,11) ), mm64_ror_32( x,25) )

// Scalar lane: must use the scalar rotate, not the AVX2 one.
#define BSG2_1z(x) \
   ( ( ROR_32(x,6) ) ^ ( ROR_32(x,11) ) ^ ( ROR_32(x,25) ) )


// Small sigma 0: ror 7 ^ ror 18 ^ shr 3.
#define SSG2_0x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
        mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) )

#define SSG2_0y(x) \
   _mm_xor_si64( _mm_xor_si64( \
        mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )

#define SSG2_0z(x) \
   ( ( ROR_32(x,7) ) ^ ( ROR_32(x,18) ) ^ ( (x) >> 3 ) )


// Small sigma 1: ror 17 ^ ror 19 ^ shr 10.
#define SSG2_1x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
        mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) )

#define SSG2_1y(x) \
   _mm_xor_si64( _mm_xor_si64( \
        mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )

#define SSG2_1z(x) \
   ( ( ROR_32(x,17) ) ^ ( ROR_32(x,19) ) ^ ( (x) >> 10 ) )


// Message schedule expansion on the rolling 16 word window Wx/Wy/Wz of
// the enclosing scope: ssg1( W[a] ) + W[b] + ssg0( W[c] ) + W[d].
// These are expressions; the caller supplies the terminating semicolon.
#define SHA2x_MEXP( a, b, c, d ) \
   _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
        SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] )

#define SHA2y_MEXP( a, b, c, d ) \
   _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
        SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] )

#define SHA2z_MEXP( a, b, c, d ) \
   ( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] )
|
||||||
|
|
||||||
|
|
||||||
|
// One SHA-256 round applied to all three lane groups at once.
//
// A..H are the eight working variables of each group (x = __m256i,
// y = __m64, z = uint32_t); callers rotate the argument order between
// rounds instead of moving values. i is the index into the 16 word schedule
// window Wx/Wy/Wz of the enclosing scope, and j + i is the absolute round
// number used to index K256.
//
//    T1 = H + BSG2_1(E) + CH(E,F,G) + K[j+i] + W[i]
//    T2 = BSG2_0(A) + MAJ(A,B,C)
//    D += T1;  H = T1 + T2
#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
                          Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
                          Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
do { \
  __m256i T1x, T2x; \
  __m64 T1y, T2y; \
  uint32_t T1z, T2z; \
  T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
       _mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
       _mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \
  T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
       _mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
       _mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \
  T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \
  T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
  T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
  T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
  Dx  = _mm256_add_epi32( Dx, T1x ); \
  Dy  = _mm_add_pi32( Dy, T1y ); \
  Dz  = Dz + T1z; \
  Hx  = _mm256_add_epi32( T1x, T2x ); \
  Hy  = _mm_add_pi32( T1y, T2y ); \
  Hz  = T1z + T2z; \
} while (0)
|
||||||
|
|
||||||
|
// Sixteen consecutive rounds on the working variables of the enclosing
// scope. The argument order rotates by one position per round, so after 8
// rounds every variable is back in its original slot. Keeping the rotation
// in one place prevents the x, y and z rows from drifting apart.
#define SHA2s_11WAY_16STEPS( j ) \
do { \
   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
                     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
                     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  0, j ); \
   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, \
                     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, \
                     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  1, j ); \
   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, \
                     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, \
                     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz,  2, j ); \
   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, \
                     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, \
                     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez,  3, j ); \
   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, \
                     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, \
                     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz,  4, j ); \
   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, \
                     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, \
                     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz,  5, j ); \
   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, \
                     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, \
                     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz,  6, j ); \
   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, \
                     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, \
                     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az,  7, j ); \
   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
                     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
                     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  8, j ); \
   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, \
                     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, \
                     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  9, j ); \
   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, \
                     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, \
                     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j ); \
   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, \
                     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, \
                     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j ); \
   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, \
                     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, \
                     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j ); \
   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, \
                     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, \
                     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j ); \
   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, \
                     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, \
                     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j ); \
   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, \
                     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, \
                     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j ); \
} while (0)

// SHA-256 compression of one 64 byte block per lane, for all 11 lanes.
//
// inx / iny / inz: the 16 message words of the x, y and z lane groups,
//                  byte-swapped here before use.
// rx / ry / rz:    the 8 word hash state of each group, updated in place.
//
// Named sha256_11way_round to match its call sites in sha256_11way() and
// sha256_11way_close(); file local, so it is static.
// NOTE(review): mm256_bswap_32 and mm64_bswap_32 are assumed to come from
// the project's SIMD utility header; BSWAP_32 is the scalar helper defined
// in avxdefs.h.
static void
sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
                    uint32_t *inz, uint32_t rz[8] )
{
   __m256i  Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
   __m256i  Wx[16];
   __m64    Ay, By, Cy, Dy, Ey, Fy, Gy, Hy;
   __m64    Wy[16];
   uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz;
   uint32_t Wz[16];

   // Load the first 16 schedule words; each lane group reads its own input.
   for ( int i = 0; i < 16; i++ )
   {
      Wx[i] = mm256_bswap_32( inx[i] );
      Wy[i] = mm64_bswap_32( iny[i] );
      Wz[i] = BSWAP_32( inz[i] );
   }

   // Working variables start from the current hash state.
   Ax = rx[0];   Ay = ry[0];   Az = rz[0];
   Bx = rx[1];   By = ry[1];   Bz = rz[1];
   Cx = rx[2];   Cy = ry[2];   Cz = rz[2];
   Dx = rx[3];   Dy = ry[3];   Dz = rz[3];
   Ex = rx[4];   Ey = ry[4];   Ez = rz[4];
   Fx = rx[5];   Fy = ry[5];   Fz = rz[5];
   Gx = rx[6];   Gy = ry[6];   Gz = rz[6];
   Hx = rx[7];   Hy = ry[7];   Hz = rz[7];

   // Rounds 0..15 use the message words directly.
   SHA2s_11WAY_16STEPS( 0 );

   // Rounds 16..63: expand the schedule in place, then run 16 more rounds.
   for ( int j = 16; j < 64; j += 16 )
   {
      // W[i] = ssg1( W[i-2] ) + W[i-7] + ssg0( W[i-15] ) + W[i-16], with
      // all indices taken modulo 16 on the rolling window. The updates are
      // sequential: later words use the entries rewritten just before.
      for ( int i = 0; i < 16; i++ )
      {
         const int i2  = ( i + 14 ) & 15;
         const int i7  = ( i +  9 ) & 15;
         const int i15 = ( i +  1 ) & 15;

         Wx[i] = SHA2x_MEXP( i2, i7, i15, i );
         Wy[i] = SHA2y_MEXP( i2, i7, i15, i );
         Wz[i] = SHA2z_MEXP( i2, i7, i15, i );
      }

      SHA2s_11WAY_16STEPS( j );
   }

   // Feed forward: add the working variables back into the state.
   rx[0] = _mm256_add_epi32( rx[0], Ax );
   ry[0] = _mm_add_pi32( ry[0], Ay );
   rz[0] = rz[0] + Az;
   rx[1] = _mm256_add_epi32( rx[1], Bx );
   ry[1] = _mm_add_pi32( ry[1], By );
   rz[1] = rz[1] + Bz;
   rx[2] = _mm256_add_epi32( rx[2], Cx );
   ry[2] = _mm_add_pi32( ry[2], Cy );
   rz[2] = rz[2] + Cz;
   rx[3] = _mm256_add_epi32( rx[3], Dx );
   ry[3] = _mm_add_pi32( ry[3], Dy );
   rz[3] = rz[3] + Dz;
   rx[4] = _mm256_add_epi32( rx[4], Ex );
   ry[4] = _mm_add_pi32( ry[4], Ey );
   rz[4] = rz[4] + Ez;
   rx[5] = _mm256_add_epi32( rx[5], Fx );
   ry[5] = _mm_add_pi32( ry[5], Fy );
   rz[5] = rz[5] + Fz;
   rx[6] = _mm256_add_epi32( rx[6], Gx );
   ry[6] = _mm_add_pi32( ry[6], Gy );
   rz[6] = rz[6] + Gz;
   rx[7] = _mm256_add_epi32( rx[7], Hx );
   ry[7] = _mm_add_pi32( ry[7], Hy );
   rz[7] = rz[7] + Hz;
}

#undef SHA2s_11WAY_16STEPS
|
||||||
|
|
||||||
|
// Reset an 11 way context: clear the byte counter and load the SHA-256
// initial hash value into every lane.
//
// Named sha256_11way_init to match the prototype in sha2-hash-4way.h.
// Word i of each lane's state is seeded from H256[i]; the x and y groups
// broadcast it across their lanes, the scalar z lane stores it directly.
void sha256_11way_init( sha256_11way_context *ctx )
{
   ctx->count_high = ctx->count_low = 0;
   for ( int i = 0; i < 8; i++ )
   {
      ctx->valx[i] = _mm256_set1_epi32( H256[i] );
      ctx->valy[i] = _mm_set1_pi32( H256[i] );
      ctx->valz[i] = H256[i];
   }
}
|
||||||
|
|
||||||
|
|
||||||
|
// Absorb len bytes of message per lane into the context.
//
// datax, datay and dataz are the input streams of the x (8 lane), y
// (2 lane) and z (scalar) groups. One 32 bit word per lane is one vector
// element in the x and y streams, so byte offsets are converted to element
// counts with >>2.
// NOTE(review): as in the other N-way hashes this assumes len and the
// buffered byte count are multiples of 4 - confirm against callers.
void sha256_11way( sha256_11way_context *ctx, const void *datax,
                   const void *datay, const void *dataz, size_t len )
{
   __m256i  *vdatax = (__m256i*) datax;
   __m64    *vdatay = (__m64*)   datay;
   uint32_t *idataz = (uint32_t*)dataz;
   size_t ptr;
   const int buf_size = 64;

   // Bytes already waiting in the block buffers.
   ptr = (unsigned)ctx->count_low & (buf_size - 1U);
   while ( len > 0 )
   {
      size_t clen;
      uint32_t clow, clow2;

      // Copy at most what is needed to complete the current block.
      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
      memcpy_256( ctx->bufx + (ptr>>2), vdatax, clen>>2 );
      memcpy_64 ( ctx->bufy + (ptr>>2), vdatay, clen>>2 );
      memcpy    ( ctx->bufz + (ptr>>2), idataz, clen );

      // Advance the input streams by what was consumed, so the next pass
      // continues where this one stopped.
      vdatax += clen>>2;
      vdatay += clen>>2;
      idataz += clen>>2;
      ptr += clen;
      len -= clen;
      if ( ptr == buf_size )
      {
         sha256_11way_round( ctx->bufx, ctx->valx,
                             ctx->bufy, ctx->valy,
                             ctx->bufz, ctx->valz );
         ptr = 0;
      }

      // 64 bit byte counter kept in two halves; carry on wrap-around.
      clow = ctx->count_low;
      clow2 = clow + clen;
      ctx->count_low = clow2;
      if ( clow2 < clow )
         ctx->count_high++;
   }
}
|
||||||
|
|
||||||
|
|
||||||
|
// Finish the hash: append the padding and the 64 bit message length, run
// the final compression(s) and write the byte-swapped digest of each lane
// group to dstx, dsty and dstz.
//
// bufx/bufy are indexed in vector elements (one 32 bit word per lane), so
// their zero-fill counts are word counts. bufz is a plain uint32_t array
// filled with memset, whose length is in bytes.
// NOTE(review): BSWAP_32 is the scalar helper from avxdefs.h; the mm256_,
// mm64_ and memset_zero_ helpers are assumed to come from the same header.
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
                         void *dstz )
{
   unsigned ptr, u;
   uint32_t low, high;
   const int buf_size = 64;
   const int pad = buf_size - 8;    // offset of the 8 byte length field

   // First padding byte 0x80 goes right after the buffered message bytes.
   ptr = (unsigned)ctx->count_low & (buf_size - 1U);
   ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
   ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 );
   ctx->bufz[ ptr>>2 ] = 0x80;
   ptr += 4;

   if ( ptr > pad )
   {
      // No room left for the length: zero-fill and compress this block,
      // then start an all-zero block that only carries the length.
      memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
      memset_zero_64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
      memset( ctx->bufz + (ptr>>2), 0, buf_size - ptr );
      sha256_11way_round( ctx->bufx, ctx->valx,
                          ctx->bufy, ctx->valy,
                          ctx->bufz, ctx->valz );
      memset_zero_256( ctx->bufx, pad >> 2 );
      memset_zero_64( ctx->bufy, pad >> 2 );
      memset( ctx->bufz, 0, pad );
   }
   else
   {
      memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 );
      memset_zero_64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 );
      memset( ctx->bufz + (ptr>>2), 0, pad - ptr );
   }

   // Message length in bits, split into two 32 bit halves.
   low = ctx->count_low;
   high = (ctx->count_high << 3) | (low >> 29);
   low = low << 3;

   ctx->bufx[ pad >> 2 ] = mm256_bswap_32( _mm256_set1_epi32( high ) );
   ctx->bufy[ pad >> 2 ] = mm64_bswap_32( _mm_set1_pi32( high ) );
   ctx->bufz[ pad >> 2 ] = BSWAP_32( high );

   ctx->bufx[ ( pad+4 ) >> 2 ] = mm256_bswap_32( _mm256_set1_epi32( low ) );
   ctx->bufy[ ( pad+4 ) >> 2 ] = mm64_bswap_32( _mm_set1_pi32( low ) );
   ctx->bufz[ ( pad+4 ) >> 2 ] = BSWAP_32( low );

   sha256_11way_round( ctx->bufx, ctx->valx,
                       ctx->bufy, ctx->valy,
                       ctx->bufz, ctx->valz );

   for ( u = 0; u < 8; u++ )
   {
      casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] );
      casti_m64  ( dsty, u ) = mm64_bswap_32( ctx->valy[u] );
      ((uint32_t*)dstz)[u]   = BSWAP_32( ctx->valz[u] );
   }
}
|
||||||
|
|
||||||
|
|
24
avxdefs.h
24
avxdefs.h
@@ -99,7 +99,22 @@
|
|||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
|
||||||
// 64 bit seems completely useless
|
// First some integer stuff that mirrors the SIMD utilities
|
||||||
|
|
||||||
|
// Scalar rotate and byte-swap helpers that mirror the SIMD utilities.
//
// Every expansion is fully parenthesized so it combines correctly with
// neighbouring operators, e.g. ROR_32(a,2) ^ ROR_32(a,13) or
// ROR_32(a,4) & mask. The operand is cast to the exact width and the
// complementary shift count is masked, so a rotate by 0 returns the value
// unchanged instead of shifting by the full width (undefined in C).
#define ROR_64( x, c ) \
   ( ( (uint64_t)(x) >> ( (c) & 63 ) ) | \
     ( (uint64_t)(x) << ( ( 64 - (c) ) & 63 ) ) )
#define ROL_64( x, c ) \
   ( ( (uint64_t)(x) << ( (c) & 63 ) ) | \
     ( (uint64_t)(x) >> ( ( 64 - (c) ) & 63 ) ) )
#define ROR_32( x, c ) \
   ( ( (uint32_t)(x) >> ( (c) & 31 ) ) | \
     ( (uint32_t)(x) << ( ( 32 - (c) ) & 31 ) ) )
#define ROL_32( x, c ) \
   ( ( (uint32_t)(x) << ( (c) & 31 ) ) | \
     ( (uint32_t)(x) >> ( ( 32 - (c) ) & 31 ) ) )
#define BSWAP_64( x ) __builtin_bswap64(x)
#define BSWAP_32( x ) __builtin_bswap32(x)
|
||||||
|
|
||||||
|
// __int128
|
||||||
|
|
||||||
|
// 128 bit unsigned integer built on the compiler's __int128 extension.
typedef unsigned __int128 uint128_t;

// All 128 bits set: -1 converted to the unsigned 128 bit type.
#define i128_neg1      ( (uint128_t)(-1LL) )
// Upper and lower 64 bit halves of a 128 bit value. The lower half is a
// plain truncating conversion. Expansions are parenthesized so they can be
// used inside larger expressions.
#define i128_hi64( x ) ( (uint64_t)( (uint128_t)(x) >> 64 ) )
#define i128_lo64( x ) ( (uint64_t)( (uint128_t)(x) ) )
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
//
|
//
|
||||||
@@ -165,6 +180,7 @@ typedef union _m64_v16 m64_v16;
|
|||||||
#define casti_m64(p,i) (((__m64*)(p))[(i)])
|
#define casti_m64(p,i) (((__m64*)(p))[(i)])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// cast all arguments as the're likely uint64_t
|
// cast all arguments as the're likely uint64_t
|
||||||
|
|
||||||
// Bitwise not: ~(a)
|
// Bitwise not: ~(a)
|
||||||
@@ -255,6 +271,12 @@ static inline void memset_zero_64( __m64 *src, int n )
|
|||||||
static inline void memset_64( __m64 *dst, const __m64 a, int n )
|
static inline void memset_64( __m64 *dst, const __m64 a, int n )
|
||||||
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
|
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
|
||||||
|
|
||||||
|
// The b is for broadcast, don't use in hybrid hash, interleave.
|
||||||
|
static inline void mem_bcpy_32( __m64 *dst, const uint32_t src, int n )
|
||||||
|
{
|
||||||
|
for ( int i = 0; i < n; i++ ) dst[i] = _mm_set1_pi32( src );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
//
|
//
|
||||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
|||||||
#! /bin/sh
|
#! /bin/sh
|
||||||
# Guess values for system-dependent variables and create Makefiles.
|
# Guess values for system-dependent variables and create Makefiles.
|
||||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.2.
|
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.2.1.
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
|||||||
# Identity of this package.
|
# Identity of this package.
|
||||||
PACKAGE_NAME='cpuminer-opt'
|
PACKAGE_NAME='cpuminer-opt'
|
||||||
PACKAGE_TARNAME='cpuminer-opt'
|
PACKAGE_TARNAME='cpuminer-opt'
|
||||||
PACKAGE_VERSION='3.9.2'
|
PACKAGE_VERSION='3.9.2.1'
|
||||||
PACKAGE_STRING='cpuminer-opt 3.9.2'
|
PACKAGE_STRING='cpuminer-opt 3.9.2.1'
|
||||||
PACKAGE_BUGREPORT=''
|
PACKAGE_BUGREPORT=''
|
||||||
PACKAGE_URL=''
|
PACKAGE_URL=''
|
||||||
|
|
||||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
|||||||
# Omit some internal or obsolete options to make the list less imposing.
|
# Omit some internal or obsolete options to make the list less imposing.
|
||||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||||
cat <<_ACEOF
|
cat <<_ACEOF
|
||||||
\`configure' configures cpuminer-opt 3.9.2 to adapt to many kinds of systems.
|
\`configure' configures cpuminer-opt 3.9.2.1 to adapt to many kinds of systems.
|
||||||
|
|
||||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||||
|
|
||||||
@@ -1404,7 +1404,7 @@ fi
|
|||||||
|
|
||||||
if test -n "$ac_init_help"; then
|
if test -n "$ac_init_help"; then
|
||||||
case $ac_init_help in
|
case $ac_init_help in
|
||||||
short | recursive ) echo "Configuration of cpuminer-opt 3.9.2:";;
|
short | recursive ) echo "Configuration of cpuminer-opt 3.9.2.1:";;
|
||||||
esac
|
esac
|
||||||
cat <<\_ACEOF
|
cat <<\_ACEOF
|
||||||
|
|
||||||
@@ -1509,7 +1509,7 @@ fi
|
|||||||
test -n "$ac_init_help" && exit $ac_status
|
test -n "$ac_init_help" && exit $ac_status
|
||||||
if $ac_init_version; then
|
if $ac_init_version; then
|
||||||
cat <<\_ACEOF
|
cat <<\_ACEOF
|
||||||
cpuminer-opt configure 3.9.2
|
cpuminer-opt configure 3.9.2.1
|
||||||
generated by GNU Autoconf 2.69
|
generated by GNU Autoconf 2.69
|
||||||
|
|
||||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
|||||||
This file contains any messages produced by compilers while
|
This file contains any messages produced by compilers while
|
||||||
running configure, to aid debugging if configure makes a mistake.
|
running configure, to aid debugging if configure makes a mistake.
|
||||||
|
|
||||||
It was created by cpuminer-opt $as_me 3.9.2, which was
|
It was created by cpuminer-opt $as_me 3.9.2.1, which was
|
||||||
generated by GNU Autoconf 2.69. Invocation command line was
|
generated by GNU Autoconf 2.69. Invocation command line was
|
||||||
|
|
||||||
$ $0 $@
|
$ $0 $@
|
||||||
@@ -2993,7 +2993,7 @@ fi
|
|||||||
|
|
||||||
# Define the identity of the package.
|
# Define the identity of the package.
|
||||||
PACKAGE='cpuminer-opt'
|
PACKAGE='cpuminer-opt'
|
||||||
VERSION='3.9.2'
|
VERSION='3.9.2.1'
|
||||||
|
|
||||||
|
|
||||||
cat >>confdefs.h <<_ACEOF
|
cat >>confdefs.h <<_ACEOF
|
||||||
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
|||||||
# report actual input values of CONFIG_FILES etc. instead of their
|
# report actual input values of CONFIG_FILES etc. instead of their
|
||||||
# values after options handling.
|
# values after options handling.
|
||||||
ac_log="
|
ac_log="
|
||||||
This file was extended by cpuminer-opt $as_me 3.9.2, which was
|
This file was extended by cpuminer-opt $as_me 3.9.2.1, which was
|
||||||
generated by GNU Autoconf 2.69. Invocation command line was
|
generated by GNU Autoconf 2.69. Invocation command line was
|
||||||
|
|
||||||
CONFIG_FILES = $CONFIG_FILES
|
CONFIG_FILES = $CONFIG_FILES
|
||||||
@@ -6756,7 +6756,7 @@ _ACEOF
|
|||||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||||
ac_cs_version="\\
|
ac_cs_version="\\
|
||||||
cpuminer-opt config.status 3.9.2
|
cpuminer-opt config.status 3.9.2.1
|
||||||
configured by $0, generated by GNU Autoconf 2.69,
|
configured by $0, generated by GNU Autoconf 2.69,
|
||||||
with options \\"\$ac_cs_config\\"
|
with options \\"\$ac_cs_config\\"
|
||||||
|
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
AC_INIT([cpuminer-opt], [3.9.2])
|
AC_INIT([cpuminer-opt], [3.9.2.1])
|
||||||
|
|
||||||
AC_PREREQ([2.59c])
|
AC_PREREQ([2.59c])
|
||||||
AC_CANONICAL_SYSTEM
|
AC_CANONICAL_SYSTEM
|
||||||
|
76
cpu-miner.c
76
cpu-miner.c
@@ -106,9 +106,11 @@ int opt_scrypt_n = 0;
|
|||||||
int opt_pluck_n = 128;
|
int opt_pluck_n = 128;
|
||||||
int opt_n_threads = 0;
|
int opt_n_threads = 0;
|
||||||
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
|
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
|
||||||
__int128_t opt_affinity = -1LL;
|
#define AFFINITY_USES_UINT128 1
|
||||||
|
uint128_t opt_affinity = i128_neg1;
|
||||||
#else
|
#else
|
||||||
int64_t opt_affinity = -1LL;
|
#define AFFINITY_USES_UINT128 0
|
||||||
|
uint64_t opt_affinity = -1LL;
|
||||||
#endif
|
#endif
|
||||||
int opt_priority = 0;
|
int opt_priority = 0;
|
||||||
int num_cpus = 1;
|
int num_cpus = 1;
|
||||||
@@ -245,12 +247,12 @@ static void affine_to_cpu_mask( int id, unsigned long mask )
|
|||||||
// DWORD last_error;
|
// DWORD last_error;
|
||||||
|
|
||||||
if ( id == -1 )
|
if ( id == -1 )
|
||||||
success = SetProcessAffinityMask( GetCurrentProcess(), mask );
|
success = SetProcessAffinityMask( GetCurrentProcess(), &mask );
|
||||||
|
|
||||||
// Are Windows CPU Groups supported?
|
// Are Windows CPU Groups supported?
|
||||||
#if _WIN32_WINNT==0x0601
|
#if _WIN32_WINNT==0x0601
|
||||||
else if ( num_cpugroups == 1 )
|
else if ( num_cpugroups == 1 )
|
||||||
success = SetThreadAffinityMask( GetCurrentThread(), mask );
|
success = SetThreadAffinityMask( GetCurrentThread(), &mask );
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Find the correct cpu group
|
// Find the correct cpu group
|
||||||
@@ -275,7 +277,7 @@ static void affine_to_cpu_mask( int id, unsigned long mask )
|
|||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
else
|
else
|
||||||
success = SetThreadAffinityMask( GetCurrentThread(), mask );
|
success = SetThreadAffinityMask( GetCurrentThread(), &mask );
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!success)
|
if (!success)
|
||||||
@@ -1842,26 +1844,46 @@ static void *miner_thread( void *userdata )
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if ( num_cpus > 1 )
|
if ( num_cpus > 1 )
|
||||||
{
|
{
|
||||||
if ( (opt_affinity == -1LL) && (opt_n_threads) > 1 )
|
#if AFFINITY_USES_UINT128
|
||||||
{
|
if ( (opt_affinity == i128_neg1 ) && opt_n_threads > 1 )
|
||||||
|
{
|
||||||
|
if ( opt_debug )
|
||||||
|
applog( LOG_DEBUG,
|
||||||
|
"Binding thread %d to cpu %d (mask %016llx %016llx)",
|
||||||
|
thr_id, thr_id % num_cpus,
|
||||||
|
i128_hi64( i128_neg1 << (thr_id % num_cpus) ),
|
||||||
|
i128_lo64( i128_neg1 << (thr_id % num_cpus) ) );
|
||||||
|
affine_to_cpu_mask( thr_id,
|
||||||
|
(uint128_t)1LL << (thr_id % num_cpus) );
|
||||||
|
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if ( (opt_affinity == -1LL) && opt_n_threads > 1 )
|
||||||
|
{
|
||||||
if (opt_debug)
|
if (opt_debug)
|
||||||
applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
|
applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
|
||||||
thr_id, thr_id % num_cpus, ( 1ULL << (thr_id % num_cpus) ) );
|
thr_id, thr_id % num_cpus, L << (thr_id % num_cpus)) ;
|
||||||
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
|
|
||||||
affine_to_cpu_mask( thr_id,
|
|
||||||
(unsigned __int128)1LL << (thr_id % num_cpus) );
|
|
||||||
#else
|
|
||||||
affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
|
affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
else
|
||||||
else if (opt_affinity != -1)
|
|
||||||
{
|
{
|
||||||
|
#if AFFINITY_USES_UINT128
|
||||||
if (opt_debug)
|
if (opt_debug)
|
||||||
applog( LOG_DEBUG, "Binding thread %d to cpu mask %x",
|
applog( LOG_DEBUG,
|
||||||
thr_id, opt_affinity);
|
"Binding thread %d to cpu mask %016llx %016llx",
|
||||||
affine_to_cpu_mask( thr_id, opt_affinity );
|
thr_id, i128_hi64( i128_neg1 << (thr_id % num_cpus) ),
|
||||||
|
i128_lo64( i128_neg1 << (thr_id % num_cpus) ) );
|
||||||
|
#else
|
||||||
|
if (opt_debug)
|
||||||
|
applog( LOG_DEBUG,
|
||||||
|
"Binding thread %d to cpu mask %016llx %016llx",
|
||||||
|
thr_id, opt_affinity );
|
||||||
|
#endif
|
||||||
|
affine_to_cpu_mask( thr_id, opt_affinity );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2897,13 +2919,19 @@ void parse_arg(int key, char *arg )
|
|||||||
break;
|
break;
|
||||||
case 1020:
|
case 1020:
|
||||||
p = strstr(arg, "0x");
|
p = strstr(arg, "0x");
|
||||||
if (p)
|
if ( p )
|
||||||
ul = strtoul(p, NULL, 16);
|
ul = strtoull( p, NULL, 16 );
|
||||||
else
|
else
|
||||||
ul = atol(arg);
|
ul = atoll( arg );
|
||||||
if (ul > (1UL<<num_cpus)-1)
|
// if ( ul > ( 1ULL << num_cpus ) - 1ULL )
|
||||||
ul = -1;
|
// ul = -1LL;
|
||||||
opt_affinity = ul;
|
#if AFFINITY_USES_UINT128
|
||||||
|
// replicate the low 64 bits to make a full 128 bit mask
|
||||||
|
opt_affinity = (uint128_t)(ul);
|
||||||
|
opt_affinity = (opt_affinity << 64 ) | (uint128_t)ul;
|
||||||
|
#else
|
||||||
|
opt_affinity = ul;
|
||||||
|
#endif
|
||||||
break;
|
break;
|
||||||
case 1021:
|
case 1021:
|
||||||
v = atoi(arg);
|
v = atoi(arg);
|
||||||
@@ -3387,6 +3415,8 @@ int main(int argc, char *argv[])
|
|||||||
if ( num_cpus != opt_n_threads )
|
if ( num_cpus != opt_n_threads )
|
||||||
applog( LOG_INFO,"%u CPU cores available, %u miner threads selected.",
|
applog( LOG_INFO,"%u CPU cores available, %u miner threads selected.",
|
||||||
num_cpus, opt_n_threads );
|
num_cpus, opt_n_threads );
|
||||||
|
|
||||||
|
// To be reviewed
|
||||||
if ( opt_affinity != -1 )
|
if ( opt_affinity != -1 )
|
||||||
{
|
{
|
||||||
if ( num_cpus > 64 )
|
if ( num_cpus > 64 )
|
||||||
|
Reference in New Issue
Block a user