mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.23.4
This commit is contained in:
@@ -34,7 +34,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
|
||||
if (opt_benchmark)
|
||||
HTarget = 0x7f;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
v128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake256r14_4way_init( &blake_4w_ctx );
|
||||
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
|
||||
|
||||
|
@@ -277,56 +277,56 @@ static const unsigned sigma[16][16] = {
|
||||
|
||||
#define BLAKE256_ROUND( r ) \
|
||||
{ \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, 7 ) ^ Mx( r, 6 ), \
|
||||
CSx( r, 5 ) ^ Mx( r, 4 ), \
|
||||
CSx( r, 3 ) ^ Mx( r, 2 ), \
|
||||
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
|
||||
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
|
||||
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, 6 ) ^ Mx( r, 7 ), \
|
||||
CSx( r, 4 ) ^ Mx( r, 5 ), \
|
||||
CSx( r, 2 ) ^ Mx( r, 3 ), \
|
||||
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
|
||||
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||
V0 = mm128_shufll_32( V0 ); \
|
||||
V3 = mm128_swap_64( V3 ); \
|
||||
V2 = mm128_shuflr_32( V2 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, D ) ^ Mx( r, C ), \
|
||||
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
|
||||
V0 = v128_shufll32( V0 ); \
|
||||
V3 = v128_swap64( V3 ); \
|
||||
V2 = v128_shuflr32( V2 ); \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, D ) ^ Mx( r, C ), \
|
||||
CSx( r, B ) ^ Mx( r, A ), \
|
||||
CSx( r, 9 ) ^ Mx( r, 8 ), \
|
||||
CSx( r, F ) ^ Mx( r, E ) ) ) ); \
|
||||
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, C ) ^ Mx( r, D ), \
|
||||
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, C ) ^ Mx( r, D ), \
|
||||
CSx( r, A ) ^ Mx( r, B ), \
|
||||
CSx( r, 8 ) ^ Mx( r, 9 ), \
|
||||
CSx( r, E ) ^ Mx( r, F ) ) ) ); \
|
||||
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||
V0 = mm128_shuflr_32( V0 ); \
|
||||
V3 = mm128_swap_64( V3 ); \
|
||||
V2 = mm128_shufll_32( V2 ); \
|
||||
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
|
||||
V0 = v128_shuflr32( V0 ); \
|
||||
V3 = v128_swap64( V3 ); \
|
||||
V2 = v128_shufll32( V2 ); \
|
||||
}
|
||||
|
||||
// Default is 14 rounds, blakecoin & vanilla are 8.
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1, int rounds )
|
||||
{
|
||||
__m128i V0, V1, V2, V3;
|
||||
v128_t V0, V1, V2, V3;
|
||||
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
V0 = casti_m128i( H, 0 );
|
||||
V1 = casti_m128i( H, 1 );
|
||||
V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
|
||||
V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
|
||||
V0 = casti_v128( H, 0 );
|
||||
V1 = casti_v128( H, 1 );
|
||||
V2 = v128_set_32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
|
||||
V3 = v128_set_32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
|
||||
T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );
|
||||
M0 = buf[ 0];
|
||||
M1 = buf[ 1];
|
||||
@@ -361,8 +361,8 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
BLAKE256_ROUND( 2 );
|
||||
BLAKE256_ROUND( 3 );
|
||||
}
|
||||
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
|
||||
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
|
||||
casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V0, V2 ) );
|
||||
casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V1, V3 ) );
|
||||
}
|
||||
|
||||
////////////////////////////////////////////
|
||||
@@ -371,16 +371,16 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
{ \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
|
||||
_mm_xor_si128( v128_32( c1 ), m0 ) ); \
|
||||
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
|
||||
_mm_xor_si128( v128_32( c0 ), m1 ) ); \
|
||||
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
|
||||
a = v128_add32( v128_add32( a, b ), \
|
||||
v128_xor( v128_32( c1 ), m0 ) ); \
|
||||
d = v128_swap32_16( v128_xor( d, a ) ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 12 ); \
|
||||
a = v128_add32( v128_add32( a, b ), \
|
||||
v128_xor( v128_32( c0 ), m1 ) ); \
|
||||
d = v128_shuflr32_8( v128_xor( d, a ) ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 7 ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4WAY(r) \
|
||||
@@ -396,31 +396,31 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
}
|
||||
|
||||
#define DECL_STATE32_4WAY \
|
||||
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
v128_t H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32_4WAY(state) do { \
|
||||
H0 = casti_m128i( state->H, 0 ); \
|
||||
H1 = casti_m128i( state->H, 1 ); \
|
||||
H2 = casti_m128i( state->H, 2 ); \
|
||||
H3 = casti_m128i( state->H, 3 ); \
|
||||
H4 = casti_m128i( state->H, 4 ); \
|
||||
H5 = casti_m128i( state->H, 5 ); \
|
||||
H6 = casti_m128i( state->H, 6 ); \
|
||||
H7 = casti_m128i( state->H, 7 ); \
|
||||
H0 = casti_v128( state->H, 0 ); \
|
||||
H1 = casti_v128( state->H, 1 ); \
|
||||
H2 = casti_v128( state->H, 2 ); \
|
||||
H3 = casti_v128( state->H, 3 ); \
|
||||
H4 = casti_v128( state->H, 4 ); \
|
||||
H5 = casti_v128( state->H, 5 ); \
|
||||
H6 = casti_v128( state->H, 6 ); \
|
||||
H7 = casti_v128( state->H, 7 ); \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE32_4WAY(state) do { \
|
||||
casti_m128i( state->H, 0 ) = H0; \
|
||||
casti_m128i( state->H, 1 ) = H1; \
|
||||
casti_m128i( state->H, 2 ) = H2; \
|
||||
casti_m128i( state->H, 3 ) = H3; \
|
||||
casti_m128i( state->H, 4 ) = H4; \
|
||||
casti_m128i( state->H, 5 ) = H5; \
|
||||
casti_m128i( state->H, 6 ) = H6; \
|
||||
casti_m128i( state->H, 7 ) = H7; \
|
||||
casti_v128( state->H, 0 ) = H0; \
|
||||
casti_v128( state->H, 1 ) = H1; \
|
||||
casti_v128( state->H, 2 ) = H2; \
|
||||
casti_v128( state->H, 3 ) = H3; \
|
||||
casti_v128( state->H, 4 ) = H4; \
|
||||
casti_v128( state->H, 5 ) = H5; \
|
||||
casti_v128( state->H, 6 ) = H6; \
|
||||
casti_v128( state->H, 7 ) = H7; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
@@ -430,7 +430,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
|
||||
{ \
|
||||
__m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
|
||||
v128_t shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
|
||||
0x0405060700010203 ); \
|
||||
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
|
||||
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
|
||||
@@ -454,32 +454,32 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
|
||||
{ \
|
||||
M0 = mm128_bswap_32( buf[0] ); \
|
||||
M1 = mm128_bswap_32( buf[1] ); \
|
||||
M2 = mm128_bswap_32( buf[2] ); \
|
||||
M3 = mm128_bswap_32( buf[3] ); \
|
||||
M4 = mm128_bswap_32( buf[4] ); \
|
||||
M5 = mm128_bswap_32( buf[5] ); \
|
||||
M6 = mm128_bswap_32( buf[6] ); \
|
||||
M7 = mm128_bswap_32( buf[7] ); \
|
||||
M8 = mm128_bswap_32( buf[8] ); \
|
||||
M9 = mm128_bswap_32( buf[9] ); \
|
||||
MA = mm128_bswap_32( buf[10] ); \
|
||||
MB = mm128_bswap_32( buf[11] ); \
|
||||
MC = mm128_bswap_32( buf[12] ); \
|
||||
MD = mm128_bswap_32( buf[13] ); \
|
||||
ME = mm128_bswap_32( buf[14] ); \
|
||||
MF = mm128_bswap_32( buf[15] ); \
|
||||
M0 = v128_bswap32( buf[0] ); \
|
||||
M1 = v128_bswap32( buf[1] ); \
|
||||
M2 = v128_bswap32( buf[2] ); \
|
||||
M3 = v128_bswap32( buf[3] ); \
|
||||
M4 = v128_bswap32( buf[4] ); \
|
||||
M5 = v128_bswap32( buf[5] ); \
|
||||
M6 = v128_bswap32( buf[6] ); \
|
||||
M7 = v128_bswap32( buf[7] ); \
|
||||
M8 = v128_bswap32( buf[8] ); \
|
||||
M9 = v128_bswap32( buf[9] ); \
|
||||
MA = v128_bswap32( buf[10] ); \
|
||||
MB = v128_bswap32( buf[11] ); \
|
||||
MC = v128_bswap32( buf[12] ); \
|
||||
MD = v128_bswap32( buf[13] ); \
|
||||
ME = v128_bswap32( buf[14] ); \
|
||||
MF = v128_bswap32( buf[15] ); \
|
||||
}
|
||||
|
||||
#endif // SSSE3 else SSE2
|
||||
|
||||
#define COMPRESS32_4WAY( rounds ) \
|
||||
{ \
|
||||
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
v128_t M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
v128_t V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
v128_t V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
@@ -514,14 +514,14 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
} \
|
||||
H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \
|
||||
H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \
|
||||
H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \
|
||||
H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \
|
||||
H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \
|
||||
H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
|
||||
H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
|
||||
H0 = v128_xor( v128_xor( V8, V0 ), H0 ); \
|
||||
H1 = v128_xor( v128_xor( V9, V1 ), H1 ); \
|
||||
H2 = v128_xor( v128_xor( VA, V2 ), H2 ); \
|
||||
H3 = v128_xor( v128_xor( VB, V3 ), H3 ); \
|
||||
H4 = v128_xor( v128_xor( VC, V4 ), H4 ); \
|
||||
H5 = v128_xor( v128_xor( VD, V5 ), H5 ); \
|
||||
H6 = v128_xor( v128_xor( VE, V6 ), H6 ); \
|
||||
H7 = v128_xor( v128_xor( VF, V7 ), H7 ); \
|
||||
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
@@ -1867,14 +1867,14 @@ static void
|
||||
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
casti_m128i( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
|
||||
casti_m128i( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
|
||||
casti_m128i( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 );
|
||||
casti_m128i( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A );
|
||||
casti_m128i( ctx->H, 4 ) = v128_64( 0x510E527F510E527F );
|
||||
casti_m128i( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C );
|
||||
casti_m128i( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB );
|
||||
casti_m128i( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 );
|
||||
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
|
||||
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
|
||||
casti_v128( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 );
|
||||
casti_v128( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A );
|
||||
casti_v128( ctx->H, 4 ) = v128_64( 0x510E527F510E527F );
|
||||
casti_v128( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C );
|
||||
casti_v128( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB );
|
||||
casti_v128( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 );
|
||||
ctx->T0 = ctx->T1 = 0;
|
||||
ctx->ptr = 0;
|
||||
ctx->rounds = rounds;
|
||||
@@ -1884,7 +1884,7 @@ static void
|
||||
blake32_4way( blake_4way_small_context *ctx, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m128i *buf = (__m128i*)ctx->buf;
|
||||
v128_t *buf = (v128_t*)ctx->buf;
|
||||
size_t bptr = ctx->ptr<<2;
|
||||
size_t vptr = ctx->ptr >> 2;
|
||||
size_t blen = len << 2;
|
||||
@@ -1925,7 +1925,7 @@ static void
|
||||
blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
__m128i buf[16] __attribute__ ((aligned (64)));
|
||||
v128_t buf[16] __attribute__ ((aligned (64)));
|
||||
size_t ptr = ctx->ptr;
|
||||
size_t vptr = ctx->ptr>>2;
|
||||
unsigned bit_len = ( (unsigned)ptr << 3 );
|
||||
@@ -1949,26 +1949,26 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
||||
|
||||
if ( vptr < 12 )
|
||||
{
|
||||
memset_zero_128( buf + vptr + 1, 13 - vptr );
|
||||
buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
|
||||
v128_memset_zero( buf + vptr + 1, 13 - vptr );
|
||||
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
|
||||
buf[ 14 ] = v128_32( bswap_32( th ) );
|
||||
buf[ 15 ] = v128_32( bswap_32( tl ) );
|
||||
blake32_4way( ctx, buf + vptr, 64 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_128( buf + vptr + 1, (60-ptr) >> 2 );
|
||||
v128_memset_zero( buf + vptr + 1, (60-ptr) >> 2 );
|
||||
blake32_4way( ctx, buf + vptr, 64 - ptr );
|
||||
ctx->T0 = 0xFFFFFE00UL;
|
||||
ctx->T1 = 0xFFFFFFFFUL;
|
||||
memset_zero_128( buf, 56>>2 );
|
||||
buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
|
||||
v128_memset_zero( buf, 56>>2 );
|
||||
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
|
||||
buf[ 14 ] = v128_32( bswap_32( th ) );
|
||||
buf[ 15 ] = v128_32( bswap_32( tl ) );
|
||||
blake32_4way( ctx, buf, 64 );
|
||||
}
|
||||
|
||||
mm128_block_bswap_32( (__m128i*)dst, (__m128i*)ctx->H );
|
||||
v128_block_bswap32( (v128_t*)dst, (v128_t*)ctx->H );
|
||||
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
@@ -138,7 +138,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm128_bswap32_80( endiandata, pdata );
|
||||
v128_bswap32_80( endiandata, pdata );
|
||||
|
||||
do {
|
||||
endiandata[19] = n;
|
||||
|
@@ -12,13 +12,13 @@
|
||||
*/
|
||||
|
||||
#include "blake2s-hash.h"
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
//#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
|
||||
/*
|
||||
static const uint32_t blake2s_IV[8] =
|
||||
@@ -78,43 +78,43 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm_xor_si128( S->h[i], v128_32( p[i] ) );
|
||||
S->h[i] = v128_xor( S->h[i], v128_32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
|
||||
int blake2s_4way_compress( blake2s_4way_state *S, const v128_t* block )
|
||||
{
|
||||
__m128i m[16];
|
||||
__m128i v[16];
|
||||
v128_t m[16];
|
||||
v128_t v[16];
|
||||
|
||||
memcpy_128( m, block, 16 );
|
||||
memcpy_128( v, S->h, 8 );
|
||||
v128_memcpy( m, block, 16 );
|
||||
v128_memcpy( v, S->h, 8 );
|
||||
|
||||
v[ 8] = v128_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = v128_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = v128_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = v128_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm_xor_si128( v128_32( S->t[0] ),
|
||||
v[12] = v128_xor( v128_32( S->t[0] ),
|
||||
v128_64( 0x510E527F510E527FULL ) );
|
||||
v[13] = _mm_xor_si128( v128_32( S->t[1] ),
|
||||
v[13] = v128_xor( v128_32( S->t[1] ),
|
||||
v128_64( 0x9B05688C9B05688CULL ) );
|
||||
v[14] = _mm_xor_si128( v128_32( S->f[0] ),
|
||||
v[14] = v128_xor( v128_32( S->f[0] ),
|
||||
v128_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[15] = _mm_xor_si128( v128_32( S->f[1] ),
|
||||
v[15] = v128_xor( v128_32( S->f[1] ),
|
||||
v128_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
#define G4W( sigma0, sigma1, a, b, c, d ) \
|
||||
do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
|
||||
a = v128_add32( v128_add32( a, b ), m[ s0 ] ); \
|
||||
d = v128_swap32_16( v128_xor( d, a ) ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 12 ); \
|
||||
a = v128_add32( v128_add32( a, b ), m[ s1 ] ); \
|
||||
d = v128_shuflr32_8( v128_xor( d, a ) ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 7 ); \
|
||||
} while(0)
|
||||
|
||||
|
||||
@@ -143,7 +143,7 @@ do { \
|
||||
ROUND4W( 9 );
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] );
|
||||
S->h[i] = v128_xor( v128_xor( S->h[i], v[i] ), v[i + 8] );
|
||||
|
||||
#undef G4W
|
||||
#undef ROUND4W
|
||||
@@ -175,26 +175,26 @@ do { \
|
||||
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
|
||||
uint64_t inlen )
|
||||
{
|
||||
__m128i *input = (__m128i*)in;
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
v128_t *input = (v128_t*)in;
|
||||
v128_t *buf = (v128_t*)S->buf;
|
||||
|
||||
while( inlen > 0 )
|
||||
{
|
||||
size_t left = S->buflen;
|
||||
if( inlen >= BLAKE2S_BLOCKBYTES - left )
|
||||
if( inlen >= 64 - left )
|
||||
{
|
||||
memcpy_128( buf + (left>>2), input, (BLAKE2S_BLOCKBYTES - left) >> 2 );
|
||||
S->buflen += BLAKE2S_BLOCKBYTES - left;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
v128_memcpy( buf + (left>>2), input, (64 - left) >> 2 );
|
||||
S->buflen += 64 - left;
|
||||
S->t[0] += 64;
|
||||
S->t[1] += ( S->t[0] < 64 );
|
||||
blake2s_4way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
input += ( BLAKE2S_BLOCKBYTES >> 2 );
|
||||
inlen -= BLAKE2S_BLOCKBYTES;
|
||||
input += ( 64 >> 2 );
|
||||
inlen -= 64;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy_128( buf + ( left>>2 ), input, inlen>>2 );
|
||||
v128_memcpy( buf + ( left>>2 ), input, inlen>>2 );
|
||||
S->buflen += (size_t) inlen;
|
||||
input += ( inlen>>2 );
|
||||
inlen -= inlen;
|
||||
@@ -205,7 +205,7 @@ int blake2s_4way_update( blake2s_4way_state *S, const void *in,
|
||||
|
||||
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
|
||||
{
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
v128_t *buf = (v128_t*)S->buf;
|
||||
|
||||
S->t[0] += S->buflen;
|
||||
S->t[1] += ( S->t[0] < S->buflen );
|
||||
@@ -213,12 +213,12 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
|
||||
S->f[1] = ~0U;
|
||||
S->f[0] = ~0U;
|
||||
|
||||
memset_zero_128( buf + ( S->buflen>>2 ),
|
||||
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
|
||||
v128_memset_zero( buf + ( S->buflen>>2 ),
|
||||
( 64 - S->buflen ) >> 2 );
|
||||
blake2s_4way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
casti_m128i( out, i ) = S->h[ i ];
|
||||
casti_v128( out, i ) = S->h[ i ];
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -226,24 +226,24 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
|
||||
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
const void *input, uint64_t inlen )
|
||||
{
|
||||
__m128i *in = (__m128i*)input;
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
v128_t *in = (v128_t*)input;
|
||||
v128_t *buf = (v128_t*)S->buf;
|
||||
|
||||
while( inlen > BLAKE2S_BLOCKBYTES )
|
||||
while( inlen > 64 )
|
||||
{
|
||||
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
inlen -= BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
v128_memcpy( buf, in, 64 >> 2 );
|
||||
S->buflen = 64;
|
||||
inlen -= 64;
|
||||
S->t[0] += 64;
|
||||
S->t[1] += ( S->t[0] < 64 );
|
||||
blake2s_4way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
in += ( BLAKE2S_BLOCKBYTES >> 2 );
|
||||
in += ( 64 >> 2 );
|
||||
}
|
||||
|
||||
// last block
|
||||
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
v128_memcpy( buf, in, 64 >> 2 );
|
||||
S->buflen = 64;
|
||||
S->t[0] += S->buflen;
|
||||
S->t[1] += ( S->t[0] < S->buflen );
|
||||
if ( S->last_node ) S->f[1] = ~0U;
|
||||
@@ -251,7 +251,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
blake2s_4way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
casti_m128i( out, i ) = S->h[ i ];
|
||||
casti_v128( out, i ) = S->h[ i ];
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -417,7 +417,7 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in,
|
||||
{
|
||||
__m256i *input = (__m256i*)in;
|
||||
__m256i *buf = (__m256i*)S->buf;
|
||||
const int bsize = BLAKE2S_BLOCKBYTES;
|
||||
const int bsize = 64;
|
||||
|
||||
while( inlen > 0 )
|
||||
{
|
||||
@@ -426,8 +426,8 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in,
|
||||
{
|
||||
memcpy_256( buf + (left>>2), input, (bsize - left) >> 2 );
|
||||
S->buflen += bsize - left;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
S->t[0] += 64;
|
||||
S->t[1] += ( S->t[0] < 64 );
|
||||
blake2s_8way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
input += ( bsize >> 2 );
|
||||
@@ -454,8 +454,7 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
|
||||
S->f[1] = ~0U;
|
||||
S->f[0] = ~0U;
|
||||
|
||||
memset_zero_256( buf + ( S->buflen>>2 ),
|
||||
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
|
||||
memset_zero_256( buf + ( S->buflen>>2 ),( 64 - S->buflen ) >> 2 );
|
||||
blake2s_8way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
@@ -470,21 +469,21 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
||||
__m256i *in = (__m256i*)input;
|
||||
__m256i *buf = (__m256i*)S->buf;
|
||||
|
||||
while( inlen > BLAKE2S_BLOCKBYTES )
|
||||
while( inlen > 64 )
|
||||
{
|
||||
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
inlen -= BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
memcpy_256( buf, in, 64 >> 2 );
|
||||
S->buflen = 64;
|
||||
inlen -= 64;
|
||||
S->t[0] += 64;
|
||||
S->t[1] += ( S->t[0] < 64 );
|
||||
blake2s_8way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
in += ( BLAKE2S_BLOCKBYTES >> 2 );
|
||||
in += ( 64 >> 2 );
|
||||
}
|
||||
|
||||
// last block
|
||||
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
memcpy_256( buf, in, 64 >> 2 );
|
||||
S->buflen = 64;
|
||||
S->t[0] += S->buflen;
|
||||
S->t[1] += ( S->t[0] < S->buflen );
|
||||
if ( S->last_node ) S->f[1] = ~0U;
|
||||
@@ -611,7 +610,7 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
|
||||
{
|
||||
__m512i *input = (__m512i*)in;
|
||||
__m512i *buf = (__m512i*)S->buf;
|
||||
const int bsize = BLAKE2S_BLOCKBYTES;
|
||||
const int bsize = 64;
|
||||
|
||||
while( inlen > 0 )
|
||||
{
|
||||
@@ -620,8 +619,8 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
|
||||
{
|
||||
memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 );
|
||||
S->buflen += bsize - left;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
S->t[0] += 64;
|
||||
S->t[1] += ( S->t[0] < 64 );
|
||||
blake2s_16way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
input += ( bsize >> 2 );
|
||||
@@ -649,7 +648,7 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen )
|
||||
S->f[0] = ~0U;
|
||||
|
||||
memset_zero_512( buf + ( S->buflen>>2 ),
|
||||
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
|
||||
( 64 - S->buflen ) >> 2 );
|
||||
blake2s_16way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
|
@@ -14,7 +14,7 @@
|
||||
#ifndef __BLAKE2S_HASH_4WAY_H__
|
||||
#define __BLAKE2S_HASH_4WAY_H__ 1
|
||||
|
||||
#if defined(__SSE2__)
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
@@ -29,41 +29,25 @@
|
||||
#define ALIGN(x) __attribute__((aligned(x)))
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
enum blake2s_constant
|
||||
{
|
||||
BLAKE2S_BLOCKBYTES = 64,
|
||||
BLAKE2S_OUTBYTES = 32,
|
||||
BLAKE2S_KEYBYTES = 32,
|
||||
BLAKE2S_SALTBYTES = 8,
|
||||
BLAKE2S_PERSONALBYTES = 8
|
||||
};
|
||||
|
||||
#pragma pack(push, 1)
|
||||
typedef struct __blake2s_nway_param
|
||||
{
|
||||
uint8_t digest_length; // 1
|
||||
uint8_t key_length; // 2
|
||||
uint8_t fanout; // 3
|
||||
uint8_t depth; // 4
|
||||
uint32_t leaf_length; // 8
|
||||
uint8_t node_offset[6];// 14
|
||||
uint8_t node_depth; // 15
|
||||
uint8_t inner_length; // 16
|
||||
// uint8_t reserved[0];
|
||||
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
|
||||
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
|
||||
} blake2s_nway_param;
|
||||
#pragma pack(pop)
|
||||
typedef struct __blake2s_nway_param
|
||||
{
|
||||
uint8_t digest_length; // 1
|
||||
uint8_t key_length; // 2
|
||||
uint8_t fanout; // 3
|
||||
uint8_t depth; // 4
|
||||
uint32_t leaf_length; // 8
|
||||
uint8_t node_offset[6];// 14
|
||||
uint8_t node_depth; // 15
|
||||
uint8_t inner_length; // 16
|
||||
// uint8_t reserved[0];
|
||||
uint8_t salt[8]; // 24
|
||||
uint8_t personal[8]; // 32
|
||||
} blake2s_nway_param;
|
||||
|
||||
typedef struct ALIGN( 64 ) __blake2s_4way_state
|
||||
{
|
||||
__m128i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
|
||||
v128_t h[8];
|
||||
uint8_t buf[ 64 * 4 ];
|
||||
uint32_t t[2];
|
||||
uint32_t f[2];
|
||||
size_t buflen;
|
||||
@@ -83,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
typedef struct ALIGN( 64 ) __blake2s_8way_state
|
||||
{
|
||||
__m256i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
|
||||
// Interleaved input buffer: one 64-byte BLAKE2s block for each of the 8 lanes
// (the value formerly spelled BLAKE2S_BLOCKBYTES * 8). blake2s_8way_update
// copies a full block of (64 >> 2) = 16 256-bit vectors into it, so anything
// smaller than 64 * 8 bytes is overrun into the t/f/buflen fields that follow.
uint8_t buf[ 64 * 8 ];
|
||||
uint32_t t[2];
|
||||
uint32_t f[2];
|
||||
size_t buflen;
|
||||
@@ -104,7 +88,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
||||
typedef struct ALIGN( 64 ) __blake2s_16way_state
|
||||
{
|
||||
__m512i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
|
||||
// Interleaved input buffer: one 64-byte BLAKE2s block for each of the 16 lanes
// (the value formerly spelled BLAKE2S_BLOCKBYTES * 16). blake2s_16way_update
// copies a full block of (64 >> 2) = 16 512-bit vectors into it, so anything
// smaller than 64 * 16 bytes is overrun into the t/f/buflen fields that follow.
uint8_t buf[ 64 * 16 ];
|
||||
uint32_t t[2];
|
||||
uint32_t f[2];
|
||||
size_t buflen;
|
||||
@@ -127,10 +111,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
|
||||
#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // __SSE2__
|
||||
|
||||
#endif
|
||||
|
@@ -20,7 +20,7 @@ void blake2s_16way_hash( void *output, const void *input )
|
||||
blake2s_16way_state ctx;
|
||||
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
|
||||
blake2s_16way_update( &ctx, input + (64<<4), 16 );
|
||||
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_final( &ctx, output, 32 );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
@@ -39,7 +39,7 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, 32 );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
@@ -76,7 +76,7 @@ void blake2s_8way_hash( void *output, const void *input )
|
||||
blake2s_8way_state ctx;
|
||||
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
|
||||
blake2s_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
blake2s_8way_final( &ctx, output, 32 );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
@@ -95,7 +95,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, 32 );
|
||||
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
@@ -131,7 +131,7 @@ void blake2s_4way_hash( void *output, const void *input )
|
||||
blake2s_4way_state ctx;
|
||||
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
|
||||
blake2s_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
blake2s_4way_final( &ctx, output, 32 );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
@@ -149,8 +149,8 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
v128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, 32 );
|
||||
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
@@ -183,12 +183,12 @@ static __thread blake2s_state blake2s_ctx;
|
||||
|
||||
void blake2s_hash( void *output, const void *input )
|
||||
{
|
||||
unsigned char _ALIGN(32) hash[BLAKE2S_OUTBYTES];
|
||||
unsigned char _ALIGN(32) hash[32];
|
||||
blake2s_state ctx __attribute__ ((aligned (32)));
|
||||
|
||||
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
|
||||
blake2s_update( &ctx, input+64, 16 );
|
||||
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
|
||||
blake2s_final( &ctx, hash, 32 );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
@@ -201,14 +201,13 @@ int scanhash_blake2s( struct work *work,uint32_t max_nonce,
|
||||
uint32_t _ALIGN(32) hash32[8];
|
||||
uint32_t _ALIGN(32) endiandata[20];
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm128_bswap32_80( endiandata, pdata );
|
||||
v128_bswap32_80( endiandata, pdata );
|
||||
|
||||
// midstate
|
||||
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_init( &blake2s_ctx, 32 );
|
||||
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
|
||||
|
||||
do
|
||||
|
@@ -343,52 +343,52 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
|
||||
|
||||
#define BLAKE512_G( r, Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
|
||||
{ \
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( CBx( r, Sd ) ^ Mx( r, Sc ), \
|
||||
Va = v128_add64( Va, v128_add64( Vb, \
|
||||
v128_set_64( CBx( r, Sd ) ^ Mx( r, Sc ), \
|
||||
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
|
||||
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 25 ); \
|
||||
Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \
|
||||
Vc = v128_add64( Vc, Vd ); \
|
||||
Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
|
||||
\
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( CBx( r, Sc ) ^ Mx( r, Sd ), \
|
||||
Va = v128_add64( Va, v128_add64( Vb, \
|
||||
v128_set_64( CBx( r, Sc ) ^ Mx( r, Sd ), \
|
||||
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
|
||||
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 11 ); \
|
||||
Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \
|
||||
Vc = v128_add64( Vc, Vd ); \
|
||||
Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
|
||||
}
|
||||
|
||||
#define BLAKE512_ROUND( R ) \
|
||||
{ \
|
||||
__m128i V32, V23, V67, V76; \
|
||||
v128_t V32, V23, V67, V76; \
|
||||
BLAKE512_G( R, V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
|
||||
BLAKE512_G( R, V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
|
||||
V32 = mm128_alignr_64( V[3], V[2], 1 ); \
|
||||
V23 = mm128_alignr_64( V[2], V[3], 1 ); \
|
||||
V67 = mm128_alignr_64( V[6], V[7], 1 ); \
|
||||
V76 = mm128_alignr_64( V[7], V[6], 1 ); \
|
||||
V32 = v128_alignr64( V[3], V[2], 1 ); \
|
||||
V23 = v128_alignr64( V[2], V[3], 1 ); \
|
||||
V67 = v128_alignr64( V[6], V[7], 1 ); \
|
||||
V76 = v128_alignr64( V[7], V[6], 1 ); \
|
||||
BLAKE512_G( R, V[0], V32, V[5], V67, 8, 9, A, B ); \
|
||||
BLAKE512_G( R, V[1], V23, V[4], V76, C, D, E, F ); \
|
||||
V[2] = mm128_alignr_64( V32, V23, 1 ); \
|
||||
V[3] = mm128_alignr_64( V23, V32, 1 ); \
|
||||
V[6] = mm128_alignr_64( V76, V67, 1 ); \
|
||||
V[7] = mm128_alignr_64( V67, V76, 1 ); \
|
||||
V[2] = v128_alignr64( V32, V23, 1 ); \
|
||||
V[3] = v128_alignr64( V23, V32, 1 ); \
|
||||
V[6] = v128_alignr64( V76, V67, 1 ); \
|
||||
V[7] = v128_alignr64( V67, V76, 1 ); \
|
||||
}
|
||||
|
||||
void blake512_transform( uint64_t *H, const uint64_t *buf,
|
||||
const uint64_t T0, const uint64_t T1 )
|
||||
{
|
||||
__m128i V[8];
|
||||
v128_t V[8];
|
||||
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
|
||||
V[0] = casti_m128i( H, 0 );
|
||||
V[1] = casti_m128i( H, 1 );
|
||||
V[2] = casti_m128i( H, 2 );
|
||||
V[3] = casti_m128i( H, 3 );
|
||||
V[4] = _mm_set_epi64x( CB1, CB0 );
|
||||
V[5] = _mm_set_epi64x( CB3, CB2 );
|
||||
V[6] = _mm_set_epi64x( T0 ^ CB5, T0 ^ CB4 );
|
||||
V[7] = _mm_set_epi64x( T1 ^ CB7, T1 ^ CB6 );
|
||||
V[0] = casti_v128( H, 0 );
|
||||
V[1] = casti_v128( H, 1 );
|
||||
V[2] = casti_v128( H, 2 );
|
||||
V[3] = casti_v128( H, 3 );
|
||||
V[4] = v128_set_64( CB1, CB0 );
|
||||
V[5] = v128_set_64( CB3, CB2 );
|
||||
V[6] = v128_set_64( T0 ^ CB5, T0 ^ CB4 );
|
||||
V[7] = v128_set_64( T1 ^ CB7, T1 ^ CB6 );
|
||||
|
||||
M0 = bswap_64( buf[ 0] );
|
||||
M1 = bswap_64( buf[ 1] );
|
||||
@@ -424,10 +424,10 @@ void blake512_transform( uint64_t *H, const uint64_t *buf,
|
||||
BLAKE512_ROUND( 4 );
|
||||
BLAKE512_ROUND( 5 );
|
||||
|
||||
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V[0], V[4] );
|
||||
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V[1], V[5] );
|
||||
casti_m128i( H, 2 ) = mm128_xor3( casti_m128i( H, 2 ), V[2], V[6] );
|
||||
casti_m128i( H, 3 ) = mm128_xor3( casti_m128i( H, 3 ), V[3], V[7] );
|
||||
casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V[0], V[4] ) );
|
||||
casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V[1], V[5] ) );
|
||||
casti_v128( H, 2 ) = v128_xor( casti_v128( H, 2 ), v128_xor( V[2], V[6] ) );
|
||||
casti_v128( H, 3 ) = v128_xor( casti_v128( H, 3 ), v128_xor( V[3], V[7] ) );
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -611,7 +611,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
|
||||
VD = v512_64( T0 ^ CB5 ); \
|
||||
VE = v512_64( T1 ^ CB6 ); \
|
||||
VF = v512_64( T1 ^ CB7 ); \
|
||||
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \
|
||||
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64( \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
|
||||
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
|
||||
@@ -679,7 +679,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
|
||||
VE = v512_64( sc->T1 ^ CB6 );
|
||||
VF = v512_64( sc->T1 ^ CB7 );
|
||||
|
||||
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
|
||||
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64(
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
|
||||
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
|
||||
@@ -1347,7 +1347,7 @@ blake512_8way_close(void *cc, void *dst)
|
||||
VD = v256_64( T0 ^ CB5 ); \
|
||||
VE = v256_64( T1 ^ CB6 ); \
|
||||
VF = v256_64( T1 ^ CB7 ); \
|
||||
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \
|
||||
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64( \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
|
||||
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
|
||||
@@ -1419,7 +1419,7 @@ void blake512_4way_compress( blake_4way_big_context *sc )
|
||||
v256_64( CB6 ) );
|
||||
VF = _mm256_xor_si256( v256_64( sc->T1 ),
|
||||
v256_64( CB7 ) );
|
||||
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
|
||||
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64(
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
|
||||
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
|
||||
|
@@ -177,7 +177,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
if ( opt_benchmark )
|
||||
HTarget = 0x7f;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
v128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake256r8_4way_init( &blakecoin_4w_ctx );
|
||||
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
|
||||
|
||||
|
@@ -118,15 +118,15 @@ static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] )
|
||||
static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[8] )
|
||||
{
|
||||
memcpy( P->salt, salt, BLAKE2S_SALTBYTES );
|
||||
memcpy( P->salt, salt, 8 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] )
|
||||
static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[8] )
|
||||
{
|
||||
memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES );
|
||||
memcpy( P->personal, personal, 8 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -159,7 +159,7 @@ int blake2s_init( blake2s_state *S, const uint8_t outlen )
|
||||
blake2s_param P[1];
|
||||
|
||||
/* Move interval verification here? */
|
||||
if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
|
||||
if ( ( !outlen ) || ( outlen > 32 ) ) return -1;
|
||||
|
||||
P->digest_length = outlen;
|
||||
P->key_length = 0;
|
||||
@@ -179,9 +179,9 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
|
||||
{
|
||||
blake2s_param P[1];
|
||||
|
||||
if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
|
||||
if ( ( !outlen ) || ( outlen > 32 ) ) return -1;
|
||||
|
||||
if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1;
|
||||
if ( !key || !keylen || keylen > 32 ) return -1;
|
||||
|
||||
P->digest_length = outlen;
|
||||
P->key_length = keylen;
|
||||
@@ -198,16 +198,16 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
|
||||
if( blake2s_init_param( S, P ) < 0 ) return -1;
|
||||
|
||||
{
|
||||
uint8_t block[BLAKE2S_BLOCKBYTES];
|
||||
memset( block, 0, BLAKE2S_BLOCKBYTES );
|
||||
uint8_t block[64];
|
||||
memset( block, 0, 64 );
|
||||
memcpy( block, key, keylen );
|
||||
blake2s_update( S, block, BLAKE2S_BLOCKBYTES );
|
||||
secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
|
||||
blake2s_update( S, block, 64 );
|
||||
secure_zero_memory( block, 64 ); /* Burn the key from stack */
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
|
||||
int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
|
||||
{
|
||||
uint32_t _ALIGN(32) m[16];
|
||||
uint32_t _ALIGN(32) v[16];
|
||||
@@ -329,16 +329,16 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen )
|
||||
while( inlen > 0 )
|
||||
{
|
||||
size_t left = S->buflen;
|
||||
size_t fill = 2 * BLAKE2S_BLOCKBYTES - left;
|
||||
size_t fill = 2 * 64 - left;
|
||||
|
||||
if( inlen > fill )
|
||||
{
|
||||
memcpy( S->buf + left, in, fill ); // Fill buffer
|
||||
S->buflen += fill;
|
||||
blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
|
||||
blake2s_increment_counter( S, 64 );
|
||||
blake2s_compress( S, S->buf ); // Compress
|
||||
memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left
|
||||
S->buflen -= BLAKE2S_BLOCKBYTES;
|
||||
memcpy( S->buf, S->buf + 64, 64 ); // Shift buffer left
|
||||
S->buflen -= 64;
|
||||
in += fill;
|
||||
inlen -= fill;
|
||||
}
|
||||
@@ -356,19 +356,19 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen )
|
||||
|
||||
int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
|
||||
{
|
||||
uint8_t buffer[BLAKE2S_OUTBYTES];
|
||||
uint8_t buffer[32];
|
||||
|
||||
if( S->buflen > BLAKE2S_BLOCKBYTES )
|
||||
if( S->buflen > 64 )
|
||||
{
|
||||
blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
|
||||
blake2s_increment_counter( S, 64 );
|
||||
blake2s_compress( S, S->buf );
|
||||
S->buflen -= BLAKE2S_BLOCKBYTES;
|
||||
memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen );
|
||||
S->buflen -= 64;
|
||||
memcpy( S->buf, S->buf + 64, S->buflen );
|
||||
}
|
||||
|
||||
blake2s_increment_counter( S, ( uint32_t )S->buflen );
|
||||
blake2s_set_lastblock( S );
|
||||
memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
|
||||
memset( S->buf + S->buflen, 0, 2 * 64 - S->buflen ); /* Padding */
|
||||
blake2s_compress( S, S->buf );
|
||||
|
||||
for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
|
||||
@@ -408,10 +408,10 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen
|
||||
#include "blake2-kat.h" /* test data not included */
|
||||
int main( int argc, char **argv )
|
||||
{
|
||||
uint8_t key[BLAKE2S_KEYBYTES];
|
||||
uint8_t key[32];
|
||||
uint8_t buf[KAT_LENGTH];
|
||||
|
||||
for( size_t i = 0; i < BLAKE2S_KEYBYTES; ++i )
|
||||
for( size_t i = 0; i < 32; ++i )
|
||||
key[i] = ( uint8_t )i;
|
||||
|
||||
for( size_t i = 0; i < KAT_LENGTH; ++i )
|
||||
@@ -419,10 +419,10 @@ int main( int argc, char **argv )
|
||||
|
||||
for( size_t i = 0; i < KAT_LENGTH; ++i )
|
||||
{
|
||||
uint8_t hash[BLAKE2S_OUTBYTES];
|
||||
blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
|
||||
uint8_t hash[32];
|
||||
blake2s( hash, buf, key, 32, i, sizeof key );
|
||||
|
||||
if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) )
|
||||
if( 0 != memcmp( hash, blake2s_keyed_kat[i], 32 ) )
|
||||
{
|
||||
puts( "error" );
|
||||
return -1;
|
||||
|
@@ -87,19 +87,6 @@ static inline void secure_zero_memory(void *v, size_t n)
|
||||
|
||||
/* blake2.h */
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
enum blake2s_constant
|
||||
{
|
||||
BLAKE2S_BLOCKBYTES = 64,
|
||||
BLAKE2S_OUTBYTES = 32,
|
||||
BLAKE2S_KEYBYTES = 32,
|
||||
BLAKE2S_SALTBYTES = 8,
|
||||
BLAKE2S_PERSONALBYTES = 8
|
||||
};
|
||||
|
||||
#pragma pack(push, 1)
|
||||
typedef struct __blake2s_param
|
||||
{
|
||||
@@ -112,8 +99,8 @@ extern "C" {
|
||||
uint8_t node_depth; // 15
|
||||
uint8_t inner_length; // 16
|
||||
// uint8_t reserved[0];
|
||||
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
|
||||
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
|
||||
uint8_t salt[8]; // 24
|
||||
uint8_t personal[8]; // 32
|
||||
} blake2s_param;
|
||||
|
||||
typedef struct ALIGN( 64 ) __blake2s_state
|
||||
@@ -121,13 +108,13 @@ extern "C" {
|
||||
uint32_t h[8];
|
||||
uint32_t t[2];
|
||||
uint32_t f[2];
|
||||
uint8_t buf[2 * BLAKE2S_BLOCKBYTES];
|
||||
uint8_t buf[2 * 64];
|
||||
size_t buflen;
|
||||
uint8_t last_node;
|
||||
} blake2s_state ;
|
||||
#pragma pack(pop)
|
||||
|
||||
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] );
|
||||
int blake2s_compress( blake2s_state *S, const uint8_t block[64] );
|
||||
|
||||
// Streaming API
|
||||
int blake2s_init( blake2s_state *S, const uint8_t outlen );
|
||||
|
@@ -95,6 +95,43 @@
|
||||
}
|
||||
*/
|
||||
|
||||
#elif defined(__SSE2__) || defined(__NEON__) // ready for NEON
|
||||
|
||||
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
|
||||
{ \
|
||||
Va = v128_add64( Va, v128_add64( Vb, \
|
||||
v128_set_64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
|
||||
Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \
|
||||
Vc = v128_add64( Vc, Vd ); \
|
||||
Vb = v128_shuflr64_24( v128_xor( Vb, Vc ) ); \
|
||||
\
|
||||
Va = v128_add64( Va, v128_add64( Vb, \
|
||||
v128_set_64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
|
||||
Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \
|
||||
Vc = v128_add64( Vc, Vd ); \
|
||||
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
|
||||
}
|
||||
|
||||
#define BLAKE2B_ROUND( R ) \
|
||||
{ \
|
||||
v128_t *V = (v128_t*)v; \
|
||||
v128_t V2, V3, V6, V7; \
|
||||
const uint8_t *sigmaR = sigma[R]; \
|
||||
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
|
||||
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
|
||||
V2 = v128_alignr64( V[3], V[2], 1 ); \
|
||||
V3 = v128_alignr64( V[2], V[3], 1 ); \
|
||||
V6 = v128_alignr64( V[6], V[7], 1 ); \
|
||||
V7 = v128_alignr64( V[7], V[6], 1 ); \
|
||||
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
|
||||
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
|
||||
V[2] = v128_alignr64( V2, V3, 1 ); \
|
||||
V[3] = v128_alignr64( V3, V2, 1 ); \
|
||||
V[6] = v128_alignr64( V7, V6, 1 ); \
|
||||
V[7] = v128_alignr64( V6, V7, 1 ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#elif defined(__SSE2__)
|
||||
// always true
|
||||
|
||||
@@ -131,6 +168,7 @@
|
||||
V[6] = mm128_alignr_64( V7, V6, 1 ); \
|
||||
V[7] = mm128_alignr_64( V6, V7, 1 ); \
|
||||
}
|
||||
*/
|
||||
|
||||
#else
|
||||
// never used, SSE2 is always available
|
||||
|
Reference in New Issue
Block a user