mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.10.0
This commit is contained in:
@@ -59,7 +59,6 @@ extern "C"{
|
||||
typedef struct {
|
||||
unsigned char buf[64<<2];
|
||||
uint32_t H[8<<2];
|
||||
uint32_t S[4<<2];
|
||||
// __m128i buf[16] __attribute__ ((aligned (64)));
|
||||
// __m128i H[8];
|
||||
// __m128i S[4];
|
||||
@@ -93,7 +92,6 @@ void blake256r8_4way_close(void *cc, void *dst);
|
||||
typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
sph_u32 T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
|
||||
@@ -304,16 +304,17 @@ static const sph_u32 CS[16] = {
|
||||
|
||||
#endif
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
|
||||
_mm_set1_epi32( c1 ), m0 ), b ), a ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
|
||||
_mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
|
||||
_mm_set1_epi32( c0 ), m1 ), b ), a ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
|
||||
_mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
|
||||
@@ -321,7 +322,8 @@ do { \
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
|
||||
// Blake-256 4 way
|
||||
// Not used
|
||||
#if 0
|
||||
|
||||
#define ROUND_S_4WAY(r) do { \
|
||||
GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
|
||||
@@ -342,6 +344,8 @@ do { \
|
||||
CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define ROUND_S_4WAY(r) do { \
|
||||
@@ -359,7 +363,6 @@ do { \
|
||||
|
||||
#define DECL_STATE32_4WAY \
|
||||
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m128i S0, S1, S2, S3; \
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32_4WAY(state) do { \
|
||||
@@ -371,10 +374,6 @@ do { \
|
||||
H5 = casti_m128i( state->H, 5 ); \
|
||||
H6 = casti_m128i( state->H, 6 ); \
|
||||
H7 = casti_m128i( state->H, 7 ); \
|
||||
S0 = casti_m128i( state->S, 0 ); \
|
||||
S1 = casti_m128i( state->S, 1 ); \
|
||||
S2 = casti_m128i( state->S, 2 ); \
|
||||
S3 = casti_m128i( state->S, 3 ); \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
@@ -388,17 +387,13 @@ do { \
|
||||
casti_m128i( state->H, 5 ) = H5; \
|
||||
casti_m128i( state->H, 6 ) = H6; \
|
||||
casti_m128i( state->H, 7 ) = H7; \
|
||||
casti_m128i( state->S, 0 ) = S0; \
|
||||
casti_m128i( state->S, 1 ) = S1; \
|
||||
casti_m128i( state->S, 2 ) = S2; \
|
||||
casti_m128i( state->S, 3 ) = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
// not used
|
||||
|
||||
#if 0
|
||||
#define COMPRESS32_4WAY( rounds ) do { \
|
||||
__m128i M[16]; \
|
||||
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
@@ -441,6 +436,7 @@ do { \
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S3, V7 ), VF ), H7 ); \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@@ -508,10 +504,10 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm_xor_si128( S0, m128_const1_64( 0x243F6A88243F6A88 ) ); \
|
||||
V9 = _mm_xor_si128( S1, m128_const1_64( 0x85A308D385A308D3 ) ); \
|
||||
VA = _mm_xor_si128( S2, m128_const1_64( 0x13198A2E13198A2E ) ); \
|
||||
VB = _mm_xor_si128( S3, m128_const1_64( 0x0370734403707344 ) ); \
|
||||
V8 = m128_const1_64( 0x243F6A88243F6A88 ); \
|
||||
V9 = m128_const1_64( 0x85A308D385A308D3 ); \
|
||||
VA = m128_const1_64( 0x13198A2E13198A2E ); \
|
||||
VB = m128_const1_64( 0x0370734403707344 ); \
|
||||
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
|
||||
m128_const1_64( 0xA4093822A4093822 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
|
||||
@@ -538,14 +534,14 @@ do { \
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
} \
|
||||
H0 = mm128_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm128_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm128_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm128_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm128_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm128_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm128_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm128_xor4( VF, V7, S3, H7 ); \
|
||||
H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \
|
||||
H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \
|
||||
H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \
|
||||
H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \
|
||||
H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \
|
||||
H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
|
||||
H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
@@ -556,13 +552,13 @@ do { \
|
||||
|
||||
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set1_epi32( c1 ), m0 ), b ), a ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
_mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set1_epi32( c0 ), m1 ), b ), a ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
_mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
@@ -581,7 +577,6 @@ do { \
|
||||
|
||||
#define DECL_STATE32_8WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
sph_u32 T0, T1;
|
||||
|
||||
#define READ_STATE32_8WAY(state) \
|
||||
@@ -594,10 +589,6 @@ do { \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
@@ -612,10 +603,6 @@ do { \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
@@ -635,10 +622,10 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, m256_const1_64( 0x243F6A88243F6A88 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, m256_const1_64( 0x85A308D385A308D3 ) ); \
|
||||
VA = _mm256_xor_si256( S2, m256_const1_64( 0x13198A2E13198A2E ) ); \
|
||||
VB = _mm256_xor_si256( S3, m256_const1_64( 0x0370734403707344 ) ); \
|
||||
V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
|
||||
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
|
||||
VA = m256_const1_64( 0x13198A2E13198A2E ); \
|
||||
VB = m256_const1_64( 0x0370734403707344 ); \
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
|
||||
m256_const1_64( 0xA4093822A4093822 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
|
||||
@@ -682,14 +669,14 @@ do { \
|
||||
ROUND_S_8WAY(2); \
|
||||
ROUND_S_8WAY(3); \
|
||||
} \
|
||||
H0 = mm256_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm256_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm256_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm256_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm256_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm256_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm256_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm256_xor4( VF, V7, S3, H7 ); \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
|
||||
H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
|
||||
H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
|
||||
H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
|
||||
H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
@@ -703,7 +690,6 @@ static void
|
||||
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
__m128i zero = m128_zero;
|
||||
casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||
@@ -712,11 +698,6 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
|
||||
casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
casti_m128i( ctx->S, 0 ) = zero;
|
||||
casti_m128i( ctx->S, 1 ) = zero;
|
||||
casti_m128i( ctx->S, 2 ) = zero;
|
||||
casti_m128i( ctx->S, 3 ) = zero;
|
||||
ctx->T0 = ctx->T1 = 0;
|
||||
ctx->ptr = 0;
|
||||
ctx->rounds = rounds;
|
||||
@@ -824,7 +805,6 @@ static void
|
||||
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
|
||||
const sph_u32 *salt, int rounds )
|
||||
{
|
||||
__m256i zero = m256_zero;
|
||||
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
|
||||
@@ -833,10 +813,6 @@ blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
|
||||
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
|
||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
casti_m256i( sc->S, 0 ) = zero;
|
||||
casti_m256i( sc->S, 1 ) = zero;
|
||||
casti_m256i( sc->S, 2 ) = zero;
|
||||
casti_m256i( sc->S, 3 ) = zero;
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
sc->rounds = rounds;
|
||||
|
||||
@@ -4,13 +4,59 @@
|
||||
*/
|
||||
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
#if defined(BLAKE2B_4WAY)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
blake2b_8way_init( &ctx );
|
||||
blake2b_8way_update( &ctx, vdata, 80 );
|
||||
blake2b_8way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
// Function not used, code inlined.
|
||||
void blake2b_4way_hash(void *output, const void *input)
|
||||
{
|
||||
|
||||
@@ -1,15 +1,19 @@
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2B_4WAY)
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_8way;
|
||||
// gate->hash = (void*)&blake2b_8way_hash;
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_4way;
|
||||
gate->hash = (void*)&blake2b_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2b;
|
||||
gate->hash = (void*)&blake2b_hash;
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
@@ -4,13 +4,21 @@
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2B_8WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2B_4WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2B_4WAY)
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
//void blake2b_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
void blake2b_4way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
@@ -33,6 +33,178 @@
|
||||
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
// Message word schedule shared by the blake2b compress functions:
// sigma[r][k] is the index of the message word consumed at position k of
// round r. Twelve rounds are listed; rows 10 and 11 repeat rows 0 and 1.
static const uint8_t sigma[12][16] =
{
   /*  0 */ {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
   /*  1 */ { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
   /*  2 */ { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
   /*  3 */ {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
   /*  4 */ {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
   /*  5 */ {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
   /*  6 */ { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
   /*  7 */ { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
   /*  8 */ {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
   /*  9 */ { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
   /* 10 */ {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
   /* 11 */ { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
};
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Blake2b G mixing step on 8 interleaved 64 bit lanes.
// a, b, c, d are indexes into the work vector v[] that must exist in the
// invoking scope; x and y are the two message words for this step.
// The body is wrapped in do/while(0) so that "B2B8W_G(...);" is exactly one
// statement and stays safe inside unbraced if/else.
#define B2B8W_G(a, b, c, d, x, y) \
do { \
   v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), (x) ); \
   v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 32 ); \
   v[c] = _mm512_add_epi64( v[c], v[d] ); \
   v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 24 ); \
   v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), (y) ); \
   v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 16 ); \
   v[c] = _mm512_add_epi64( v[c], v[d] ); \
   v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
} while (0)
|
||||
|
||||
// Run one blake2b compression over the block buffered in ctx->b for all
// 8 interleaved lanes and fold the result back into ctx->h.
// A non-zero "last" inverts v[14] before the rounds (final block flag).
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
{
   __m512i v[16], m[16];
   int k;

   // Lower half of the work vector is the current chained state.
   for ( k = 0; k < 8; k++ )
      v[k] = ctx->h[k];

   // Upper half starts from the fixed constants.
   v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );
   v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
   v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
   v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
   v[12] = m512_const1_64( 0x510E527FADE682D1 );
   v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
   v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
   v[15] = m512_const1_64( 0x5BE0CD19137E2179 );

   // Mix in the byte counter, identical for every lane.
   v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
   v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );

   if ( last )
      v[14] = mm512_not( v[14] );

   // Local copy of the 16 message words.
   for ( k = 0; k < 16; k++ )
      m[k] = ctx->b[k];

   for ( int round = 0; round < 12; round++ )
   {
      const uint8_t *s = sigma[round];

      // Columns.
      B2B8W_G( 0, 4,  8, 12, m[ s[ 0] ], m[ s[ 1] ] );
      B2B8W_G( 1, 5,  9, 13, m[ s[ 2] ], m[ s[ 3] ] );
      B2B8W_G( 2, 6, 10, 14, m[ s[ 4] ], m[ s[ 5] ] );
      B2B8W_G( 3, 7, 11, 15, m[ s[ 6] ], m[ s[ 7] ] );
      // Diagonals.
      B2B8W_G( 0, 5, 10, 15, m[ s[ 8] ], m[ s[ 9] ] );
      B2B8W_G( 1, 6, 11, 12, m[ s[10] ], m[ s[11] ] );
      B2B8W_G( 2, 7,  8, 13, m[ s[12] ], m[ s[13] ] );
      B2B8W_G( 3, 4,  9, 14, m[ s[14] ], m[ s[15] ] );
   }

   // h[k] ^= v[k] ^ v[k+8]
   for ( k = 0; k < 8; k++ )
      ctx->h[k] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[k], v[k] ),
                                    v[k+8] );
}
|
||||
|
||||
// Reset an 8 way blake2b context for a 32 byte digest: load the initial
// chained state, clear the byte counter and empty the input buffer.
// Always returns 0.
int blake2b_8way_init( blake2b_8way_ctx *ctx )
{
   int w;

   // Nothing buffered, nothing counted yet.
   ctx->c = 0;
   ctx->t[0] = 0;
   ctx->t[1] = 0;
   ctx->outlen = 32;

   for ( w = 0; w < 16; w++ )
      ctx->b[w] = m512_zero;

   // Initial state constants, broadcast to all lanes.
   ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
   ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
   ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
   ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
   ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
   ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
   ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
   ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );

   // NOTE(review): 0x01010020 looks like the BLAKE2 parameter word
   // (fanout 1, depth 1, no key, digest length 0x20 = outlen) - confirm
   // against RFC 7693 before changing outlen.
   ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );

   return 0;
}
|
||||
|
||||
|
||||
// Absorb 8 way interleaved input into the context.
//
// ctx   - hashing context.
// input - interleaved data, read as consecutive __m512i (one 64 bit word of
//         every lane per vector).
// inlen - number of bytes per lane; only whole 64 bit words are consumed
//         (inlen >> 3 vectors are read).
//
// Whenever the 128 byte block buffer is full and more data arrives, the byte
// counter is advanced and the buffered block is compressed.
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
                          size_t inlen )
{
   const __m512i *in = (const __m512i*)input;
   size_t c = ctx->c >> 3;   // index of the next free word in ctx->b[]

   for ( size_t i = 0; i < (inlen >> 3); i++ )
   {
      if ( ctx->c == 128 )
      {
         ctx->t[0] += ctx->c;
         if ( ctx->t[0] < ctx->c )   // carry into the high counter word
            ctx->t[1]++;
         blake2b_8way_compress( ctx, 0 );
         ctx->c = 0;
         // The buffer is empty again: restart the word index as well,
         // otherwise the next store lands past the end of ctx->b[].
         c = 0;
      }
      ctx->b[ c++ ] = in[i];
      ctx->c += 8;
   }
}
|
||||
|
||||
// Finish the hash: account for the buffered bytes, zero pad the block,
// compress it with the final block flag and store the first four state
// vectors (4 x 64 bit words per lane, still interleaved) to out.
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
{
   // Add the bytes still sitting in the buffer to the 128 bit counter.
   ctx->t[0] += ctx->c;
   if ( ctx->t[0] < ctx->c )
      ctx->t[1]++;

   // Zero fill the unused tail of the block buffer.
   for ( size_t word = ctx->c >> 3; ctx->c < 128; ctx->c += 8 )
      ctx->b[ word++ ] = m512_zero;

   blake2b_8way_compress( ctx, 1 );    // final block flag = 1

   casti_m512i( out, 0 ) = ctx->h[0];
   casti_m512i( out, 1 ) = ctx->h[1];
   casti_m512i( out, 2 ) = ctx->h[2];
   casti_m512i( out, 3 ) = ctx->h[3];
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// G Mixing function.
|
||||
@@ -61,21 +233,6 @@ static const uint64_t blake2b_iv[8] = {
|
||||
|
||||
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
{
|
||||
const uint8_t sigma[12][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
|
||||
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
|
||||
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
|
||||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
|
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
|
||||
};
|
||||
int i;
|
||||
__m256i v[16], m[16];
|
||||
|
||||
v[ 0] = ctx->h[0];
|
||||
@@ -118,7 +275,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
m[14] = ctx->b[14];
|
||||
m[15] = ctx->b[15];
|
||||
|
||||
for ( i = 0; i < 12; i++ )
|
||||
for ( int i = 0; i < 12; i++ )
|
||||
{
|
||||
B2B_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
|
||||
B2B_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
|
||||
|
||||
@@ -2,8 +2,6 @@
|
||||
#ifndef __BLAKE2B_HASH_4WAY_H__
|
||||
#define __BLAKE2B_HASH_4WAY_H__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
@@ -16,14 +14,34 @@
|
||||
#define ALIGN(x) __attribute__((aligned(x)))
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Context for 8 blake2b hashes computed in parallel. The vector members
// carry one 64 bit word of each lane per __m512i; the scalar members are
// shared by all lanes.
ALIGN(128) typedef struct {
   __m512i  b[16];    // input block buffer
   __m512i  h[8];     // chained state
   uint64_t t[2];     // total number of bytes hashed, low and high word
   size_t   c;        // number of bytes currently buffered in b[], per lane
   size_t   outlen;   // digest size in bytes
} blake2b_8way_ctx;
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx );
|
||||
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
||||
size_t inlen );
|
||||
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// state context
|
||||
ALIGN(64) typedef struct {
|
||||
ALIGN(128) typedef struct {
|
||||
__m256i b[16]; // input buffer
|
||||
__m256i h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
size_t c; // pointer for b[]
|
||||
size_t outlen; // digest size
|
||||
} blake2b_4way_ctx __attribute__((aligned(64)));
|
||||
} blake2b_4way_ctx;
|
||||
|
||||
int blake2b_4way_init( blake2b_4way_ctx *ctx );
|
||||
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
||||
|
||||
@@ -3,22 +3,72 @@
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
|
||||
blake2s_16way_update( &ctx, input + (64<<4), 16 );
|
||||
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<4]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_16way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 16;
|
||||
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
blake2s_8way_state ctx;
|
||||
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
|
||||
|
||||
blake2s_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
|
||||
|
||||
dintrlv_8x32( output, output+ 32, output+ 64, output+ 96,
|
||||
output+128, output+160, output+192, output+224,
|
||||
vhash, 256 );
|
||||
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
@@ -26,13 +76,15 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
@@ -45,16 +97,17 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
blake2s_8way_hash( hash, vdata );
|
||||
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
@@ -67,15 +120,10 @@ static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake2s_4way_state ctx;
|
||||
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
|
||||
|
||||
blake2s_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
|
||||
|
||||
dintrlv_4x32( output, output+32, output+64, output+96,
|
||||
vhash, 256 );
|
||||
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
@@ -83,13 +131,15 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
@@ -101,15 +151,16 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
blake2s_4way_hash( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
|
||||
@@ -2,7 +2,11 @@
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_16way;
|
||||
gate->hash = (void*)&blake2s_16way_hash;
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_8way;
|
||||
gate->hash = (void*)&blake2s_8way_hash;
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
@@ -12,7 +16,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
@@ -8,13 +8,26 @@
|
||||
#if defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define BLAKE2S_8WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2S_16WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
void blake2s_16way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKE2S_8WAY)
|
||||
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
|
||||
void blake2s_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
@@ -165,13 +165,13 @@ do { \
|
||||
//
|
||||
// Supported:
|
||||
// 64 + 16 bytes (blake2s with midstate optimization)
|
||||
// 80 bytes without midstate (blake2s without midstate optimization)
|
||||
// 80 bytes (blake2s without midstate optimization)
|
||||
// Any multiple of 64 bytes in one shot (x25x)
|
||||
//
|
||||
// Unsupported:
|
||||
// Stream of 64 byte blocks one at a time.
|
||||
//
|
||||
// use for part blocks or when streaming more data
|
||||
// Stream of full 64 byte blocks one at a time.
|
||||
|
||||
// use only when streaming more data or final block not full.
|
||||
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
|
||||
uint64_t inlen )
|
||||
{
|
||||
@@ -466,6 +466,168 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
|
||||
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Blake2s-256 16 way
|
||||
|
||||
// Blake2s compression function, 16 lanes in parallel.
//
// Mixes one message block into the chaining value S->h using the ten
// Blake2s rounds. Every __m512i holds the same 32 bit word position for
// all 16 lanes. The counter S->t and the finalization flags S->f are
// broadcast, so all lanes share them.
//
// S     : hashing state; only S->h is modified.
// block : 16 message vectors (one per 32 bit message word).
// Always returns 0.
int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
{
   __m512i msg[16];
   __m512i work[16];

   // Local copies of the message words and of the current chaining value.
   memcpy_512( msg, block, 16 );
   memcpy_512( work, S->h, 8 );

   // Upper half of the work vector: the Blake2s IV. The last four words
   // additionally absorb the counter (t) and the finalization flags (f).
   work[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
   work[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
   work[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
   work[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
   work[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
                                m512_const1_64( 0x510E527F510E527FULL ) );
   work[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
                                m512_const1_64( 0x9B05688C9B05688CULL ) );
   work[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
                                m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );
   work[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
                                m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );

// Quarter round: mixes message words i0 and i1 into (a,b,c,d) using the
// rotation amounts 16, 12, 8 and 7.
#define G16W( i0, i1, a, b, c, d ) \
do { \
   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), msg[ i0 ] ); \
   d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
   c = _mm512_add_epi32( c, d ); \
   b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), msg[ i1 ] ); \
   d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
   c = _mm512_add_epi32( c, d ); \
   b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
} while(0)

// Full round r: four column mixes followed by four diagonal mixes, with the
// message word order taken from row r of blake2s_sigma.
#define ROUND16W( r ) \
do { \
   const uint8_t *perm = (const uint8_t*)&blake2s_sigma[r]; \
   G16W( perm[ 0], perm[ 1], work[ 0], work[ 4], work[ 8], work[12] ); \
   G16W( perm[ 2], perm[ 3], work[ 1], work[ 5], work[ 9], work[13] ); \
   G16W( perm[ 4], perm[ 5], work[ 2], work[ 6], work[10], work[14] ); \
   G16W( perm[ 6], perm[ 7], work[ 3], work[ 7], work[11], work[15] ); \
   G16W( perm[ 8], perm[ 9], work[ 0], work[ 5], work[10], work[15] ); \
   G16W( perm[10], perm[11], work[ 1], work[ 6], work[11], work[12] ); \
   G16W( perm[12], perm[13], work[ 2], work[ 7], work[ 8], work[13] ); \
   G16W( perm[14], perm[15], work[ 3], work[ 4], work[ 9], work[14] ); \
} while(0)

   ROUND16W( 0 );
   ROUND16W( 1 );
   ROUND16W( 2 );
   ROUND16W( 3 );
   ROUND16W( 4 );
   ROUND16W( 5 );
   ROUND16W( 6 );
   ROUND16W( 7 );
   ROUND16W( 8 );
   ROUND16W( 9 );

   // Feed forward: h[i] ^= work[i] ^ work[i+8].
   for ( size_t i = 0; i < 8; ++i )
      S->h[i] = _mm512_xor_si512( S->h[i],
                        _mm512_xor_si512( work[i], work[i + 8] ) );

#undef G16W
#undef ROUND16W
   return 0;
}
|
||||
|
||||
// Initialize a 16 way Blake2s state for an unkeyed hash.
//
// Builds a parameter block with digest_length = outlen, fanout = 1,
// depth = 1 and every other field zero, clears the state, loads the
// Blake2s IV into S->h and XORs the parameter block into it. The same
// parameters are broadcast to all 16 lanes.
//
// S      : state to initialize; fully overwritten.
// outlen : digest length in bytes recorded in the parameter block.
// Always returns 0.
int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
{
   blake2s_nway_param P[1];

   // Clear the whole parameter block in one go instead of storing a 64 bit
   // zero through the byte-typed node_offset field: no misaligned or
   // type-punned store, and no byte of P is left indeterminate before it
   // is read back below. key_length, leaf_length, node_offset, node_depth,
   // inner_length, salt and personal all stay zero.
   memset( P, 0, sizeof( P ) );
   P->digest_length = outlen;
   P->fanout        = 1;
   P->depth         = 1;

   memset( S, 0, sizeof( blake2s_16way_state ) );
   S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
   S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
   S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
   S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
   S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
   S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
   S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
   S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );

   /* IV XOR ParamBlock */
   // The first 32 bytes of P are consumed as eight native-endian 32 bit
   // words. memcpy is used for the loads so the struct is never accessed
   // through a uint32_t pointer.
   // NOTE(review): assumes sizeof( blake2s_nway_param ) >= 32 - confirm.
   for ( size_t i = 0; i < 8; ++i )
   {
      uint32_t word;
      memcpy( &word, (const uint8_t*)P + 4 * i, sizeof( word ) );
      S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( word ) );
   }
   return 0;
}
|
||||
|
||||
// Absorb input into a 16 way Blake2s state.
//
// S     : hashing state.
// in    : 16 way interleaved input; each __m512i carries one 32 bit word
//         for every lane.
// inlen : number of input bytes per lane. Lengths are converted to vector
//         counts with >> 2, so a length that is not a multiple of 4 is
//         truncated.
//
// A block is compressed as soon as the buffer is full, including when the
// input ends exactly on a block boundary.
// Always returns 0.
int blake2s_16way_update( blake2s_16way_state *S, const void *in,
                          uint64_t inlen )
{
   const __m512i *input = (const __m512i*)in;
   __m512i *buf = (__m512i*)S->buf;
   const size_t bsize = BLAKE2S_BLOCKBYTES;

   while ( inlen > 0 )
   {
      const size_t left = S->buflen;      // bytes already buffered per lane
      const size_t fill = bsize - left;   // bytes needed to complete the block

      if ( inlen >= fill )
      {
         // Complete the buffered block and compress it.
         memcpy_512( buf + ( left >> 2 ), input, fill >> 2 );
         S->t[0] += BLAKE2S_BLOCKBYTES;
         S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );   // carry into high word
         blake2s_16way_compress( S, buf );
         S->buflen = 0;

         // Step over exactly the bytes that were consumed. Advancing by a
         // whole block would skip input and underflow inlen whenever the
         // buffer was partly filled by an earlier call.
         input += ( fill >> 2 );
         inlen -= fill;
      }
      else
      {
         // Not enough for a full block: buffer the remainder and stop.
         memcpy_512( buf + ( left >> 2 ), input, inlen >> 2 );
         S->buflen += (size_t) inlen;
         inlen = 0;
      }
   }
   return 0;
}
|
||||
|
||||
// Finish a 16 way Blake2s hash and write the result.
//
// Adds the buffered byte count to the counter, sets the finalization flags,
// zero pads the buffered block, compresses it and stores the eight chaining
// vectors to out.
//
// S      : hashing state; modified.
// out    : receives 8 __m512i vectors (interleaved digests for 16 lanes).
// outlen : not used here; all eight vectors are always written.
// Always returns 0.
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen )
{
   __m512i *block = (__m512i*)S->buf;

   // Account for the bytes still sitting in the buffer, with carry.
   S->t[0] += S->buflen;
   S->t[1] += ( S->t[0] < S->buflen );

   // Mark this as the last block (and the last node, if requested).
   S->f[0] = ~0U;
   if ( S->last_node )
      S->f[1] = ~0U;

   // Zero the unused tail of the block before the final compression.
   memset_zero_512( block + ( S->buflen >> 2 ),
                    ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
   blake2s_16way_compress( S, block );

   for ( int word = 0; word < 8; ++word )
      casti_m512i( out, word ) = S->h[ word ];

   return 0;
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
#if 0
|
||||
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
|
||||
{
|
||||
|
||||
@@ -64,7 +64,7 @@ typedef struct __blake2s_nway_param
|
||||
ALIGN( 64 ) typedef struct __blake2s_4way_state
|
||||
{
|
||||
__m128i h[8];
|
||||
uint8_t buf[ 2 * BLAKE2S_BLOCKBYTES * 4 ];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
|
||||
uint32_t t[2];
|
||||
uint32_t f[2];
|
||||
size_t buflen;
|
||||
@@ -75,13 +75,16 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
|
||||
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
|
||||
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
const void *input, uint64_t inlen );
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
ALIGN( 64 ) typedef struct __blake2s_8way_state
|
||||
{
|
||||
__m256i h[8];
|
||||
uint8_t buf[ 2 * BLAKE2S_BLOCKBYTES * 8 ];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
|
||||
uint32_t t[2];
|
||||
uint32_t f[2];
|
||||
size_t buflen;
|
||||
@@ -92,9 +95,27 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
|
||||
int blake2s_8way_update( blake2s_8way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
|
||||
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
const void *input, uint64_t inlen );
|
||||
//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
||||
// const void *input, uint64_t inlen );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Hashing state for 16 Blake2s instances processed in parallel with
// 512 bit vectors.
ALIGN( 128 ) typedef struct __blake2s_16way_state
{
   // Chaining value: eight vectors, each holding one 32 bit word per lane.
   __m512i  h[8];
   // One buffered message block for all 16 lanes.
   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 16 ];
   // Byte counter shared by all lanes: t[0] low word, t[1] high word.
   uint32_t t[2];
   // Finalization flags: f[0] is set for the last block, f[1] additionally
   // when last_node is non-zero.
   uint32_t f[2];
   // Number of bytes currently buffered per lane.
   size_t   buflen;
   // Non-zero requests that f[1] be set at finalization.
   uint8_t  last_node;
} blake2s_16way_state ;
|
||||
|
||||
int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen );
|
||||
int blake2s_16way_update( blake2s_16way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
Reference in New Issue
Block a user