This commit is contained in:
Jay D Dee
2025-06-20 20:31:41 -04:00
parent dd99580a4c
commit 66191db93c
86 changed files with 2701 additions and 4322 deletions

View File

@@ -6,15 +6,15 @@
#if defined (BLAKE_4WAY)
blake256r14_4way_context blake_4w_ctx;
blake256r14_4x32_context blake_4w_ctx;
void blakehash_4way(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
blake256r14_4x32_context ctx;
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way_update( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
blake256r14_4x32_update( &ctx, input + (64<<2), 16 );
blake256r14_4x32_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -35,8 +35,8 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
HTarget = 0x7f;
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
blake256r14_4x32_init( &blake_4w_ctx );
blake256r14_4x32_update( &blake_4w_ctx, vdata, 64 );
do {
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -61,15 +61,15 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
#if defined(BLAKE_8WAY)
blake256r14_8way_context blake_8w_ctx;
blake256r14_8x32_context blake_8w_ctx;
void blakehash_8way( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256r14_8way_context ctx;
blake256r14_8x32_context ctx;
memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
blake256r14_8way( &ctx, input + (64<<3), 16 );
blake256r14_8way_close( &ctx, vhash );
blake256r14_8x32( &ctx, input + (64<<3), 16 );
blake256r14_8x32_close( &ctx, vhash );
_dintrlv_8x32( state, state+ 32, state+ 64, state+ 96,
state+128, state+160, state+192, state+224,
vhash, 256 );
@@ -93,8 +93,8 @@ int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256r14_8way_init( &blake_8w_ctx );
blake256r14_8way( &blake_8w_ctx, vdata, 64 );
blake256r14_8x32_init( &blake_8w_ctx );
blake256r14_8x32( &blake_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,

View File

@@ -423,33 +423,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
(state)->T1 = T1; \
} while (0)
#if defined(__SSSE3__)
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
}
#else // SSE2
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
M0 = v128_bswap32( buf[0] ); \
@@ -470,8 +443,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
MF = v128_bswap32( buf[15] ); \
}
#endif // SSSE3 else SSE2
#define COMPRESS32_4X32( rounds ) \
{ \
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -926,22 +897,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
ROUND_S_4X32_3;
}
#if defined(__SSSE3__)
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
#else
H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
@@ -950,8 +905,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );
#endif
}
#if defined (__AVX2__)
@@ -1291,24 +1244,22 @@ do { \
VD = v256_32( T0 ^ 0x299F31D0 ); \
VE = v256_32( T1 ^ 0x082EFA98 ); \
VF = v256_32( T1 ^ 0xEC4E6C89 ); \
const __m256i shuf_bswap32 = mm256_set2_64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
M0 = mm256_bswap_32( * buf ); \
M1 = mm256_bswap_32( *(buf+ 1) ); \
M2 = mm256_bswap_32( *(buf+ 2) ); \
M3 = mm256_bswap_32( *(buf+ 3) ); \
M4 = mm256_bswap_32( *(buf+ 4) ); \
M5 = mm256_bswap_32( *(buf+ 5) ); \
M6 = mm256_bswap_32( *(buf+ 6) ); \
M7 = mm256_bswap_32( *(buf+ 7) ); \
M8 = mm256_bswap_32( *(buf+ 8) ); \
M9 = mm256_bswap_32( *(buf+ 9) ); \
MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_bswap_32( *(buf+15) ); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
@@ -1401,7 +1352,7 @@ do { \
H7 = mm256_xor3( VF, V7, H7 ); \
}
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
void *data )
{
__m256i *M = (__m256i*)data;
@@ -1491,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
_mm256_xor_si256( v256_32( CSE ), M[15] ) );
}
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds )
{
__m256i *H = (__m256i*)final_hash;
@@ -1596,17 +1547,14 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
ROUND256_8WAY_3;
}
const __m256i shuf_bswap32 =
mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = mm256_bswap_32( mm256_xor3( V8, V0, h[0] ) );
H[1] = mm256_bswap_32( mm256_xor3( V9, V1, h[1] ) );
H[2] = mm256_bswap_32( mm256_xor3( VA, V2, h[2] ) );
H[3] = mm256_bswap_32( mm256_xor3( VB, V3, h[3] ) );
H[4] = mm256_bswap_32( mm256_xor3( VC, V4, h[4] ) );
H[5] = mm256_bswap_32( mm256_xor3( VD, V5, h[5] ) );
H[6] = mm256_bswap_32( mm256_xor3( VE, V6, h[6] ) );
H[7] = mm256_bswap_32( mm256_xor3( VF, V7, h[7] ) );
}
#endif
@@ -1933,8 +1881,6 @@ do { \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -1951,22 +1897,22 @@ do { \
VD = v512_32( T0 ^ 0x299F31D0 ); \
VE = v512_32( T1 ^ 0x082EFA98 ); \
VF = v512_32( T1 ^ 0xEC4E6C89 ); \
M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
M0 = mm512_bswap_32( * buf ); \
M1 = mm512_bswap_32( *(buf+ 1) ); \
M2 = mm512_bswap_32( *(buf+ 2) ); \
M3 = mm512_bswap_32( *(buf+ 3) ); \
M4 = mm512_bswap_32( *(buf+ 4) ); \
M5 = mm512_bswap_32( *(buf+ 5) ); \
M6 = mm512_bswap_32( *(buf+ 6) ); \
M7 = mm512_bswap_32( *(buf+ 7) ); \
M8 = mm512_bswap_32( *(buf+ 8) ); \
M9 = mm512_bswap_32( *(buf+ 9) ); \
MA = mm512_bswap_32( *(buf+10) ); \
MB = mm512_bswap_32( *(buf+11) ); \
MC = mm512_bswap_32( *(buf+12) ); \
MD = mm512_bswap_32( *(buf+13) ); \
ME = mm512_bswap_32( *(buf+14) ); \
MF = mm512_bswap_32( *(buf+15) ); \
ROUND_S_16WAY(0); \
ROUND_S_16WAY(1); \
ROUND_S_16WAY(2); \
@@ -2063,7 +2009,7 @@ do { \
// is constant for every nonce and only needs to be run once per job. The
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
void *data )
{
__m512i *M = (__m512i*)data;
@@ -2157,7 +2103,7 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
}
// Dfault is 14 rounds, blakecoin & vanilla are 8.
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds )
{
__m512i *H = (__m512i*)final_hash;
@@ -2274,27 +2220,23 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
}
// Byte swap final hash
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = mm512_bswap_32( mm512_xor3( V8, V0, h[0] ) );
H[1] = mm512_bswap_32( mm512_xor3( V9, V1, h[1] ) );
H[2] = mm512_bswap_32( mm512_xor3( VA, V2, h[2] ) );
H[3] = mm512_bswap_32( mm512_xor3( VB, V3, h[3] ) );
H[4] = mm512_bswap_32( mm512_xor3( VC, V4, h[4] ) );
H[5] = mm512_bswap_32( mm512_xor3( VD, V5, h[5] ) );
H[6] = mm512_bswap_32( mm512_xor3( VE, V6, h[6] ) );
H[7] = mm512_bswap_32( mm512_xor3( VF, V7, h[7] ) );
}
#endif
// Blake-256 4 way
static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 };
static void
blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
int rounds )
{
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
@@ -2404,11 +2346,10 @@ blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n,
// Blake-256 8 way
static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
static void
blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
blake32_8way_init( blake256_8x32_context *sc, const uint32_t *iv,
int rounds )
{
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE85BB67AE85 );
@@ -2424,7 +2365,7 @@ blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
}
static void
blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
blake32_8way( blake256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
@@ -2466,7 +2407,7 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
}
static void
blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
blake32_8way_close( blake256_8x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
@@ -2520,7 +2461,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
}
static void
blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
blake32_8way_le( blake256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
@@ -2562,7 +2503,7 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
}
static void
blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
blake32_8way_close_le( blake256_8x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
@@ -2622,8 +2563,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
//Blake-256 16 way AVX512
static void
blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
blake32_16way_init( blake256_16x32_context *sc, const uint32_t *iv,
int rounds )
{
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E6676A09E667 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE85BB67AE85 );
@@ -2639,7 +2580,7 @@ blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
}
static void
blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
blake32_16way( blake256_16x32_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
@@ -2679,7 +2620,7 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
sc->ptr = ptr;
}
static void
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
blake32_16way_close( blake256_16x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
@@ -2733,7 +2674,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
}
static void
blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
blake32_16way_le( blake256_16x32_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
@@ -2776,7 +2717,7 @@ blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
}
static void
blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
blake32_16way_close_le( blake256_16x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
@@ -2827,65 +2768,65 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
}
void
blake256_16way_init(void *cc)
blake256_16x32_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_16way_init( cc, IV256, 14 );
}
void
blake256_16way_update(void *cc, const void *data, size_t len)
blake256_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256_16way_close(void *cc, void *dst)
blake256_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
void
blake256_16way_update_le(void *cc, const void *data, size_t len)
blake256_16x32_update_le(void *cc, const void *data, size_t len)
{
blake32_16way_le(cc, data, len);
}
void
blake256_16way_close_le(void *cc, void *dst)
blake256_16x32_close_le(void *cc, void *dst)
{
blake32_16way_close_le(cc, 0, 0, dst, 8);
}
void blake256r14_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_16way_init( cc, IV256, 14 );
}
void
blake256r14_16way_update(void *cc, const void *data, size_t len)
blake256r14_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256r14_16way_close(void *cc, void *dst)
blake256r14_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
void blake256r8_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
blake32_16way_init( cc, IV256, 8 );
}
void
blake256r8_16way_update(void *cc, const void *data, size_t len)
blake256r8_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256r8_16way_close(void *cc, void *dst)
blake256r8_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
@@ -2898,7 +2839,7 @@ blake256r8_16way_close(void *cc, void *dst)
void
blake256_4x32_init(void *ctx)
{
blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( ctx, IV256, 14 );
}
void
@@ -2918,31 +2859,31 @@ blake256_4x32_close(void *ctx, void *dst)
// Blake-256 8 way
void
blake256_8way_init(void *cc)
blake256_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}
void
blake256_8way_update(void *cc, const void *data, size_t len)
blake256_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256_8way_close(void *cc, void *dst)
blake256_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
void
blake256_8way_update_le(void *cc, const void *data, size_t len)
blake256_8x32_update_le(void *cc, const void *data, size_t len)
{
blake32_8way_le(cc, data, len);
}
void
blake256_8way_close_le(void *cc, void *dst)
blake256_8x32_close_le(void *cc, void *dst)
{
blake32_8way_close_le(cc, 0, 0, dst, 8);
}
@@ -2952,7 +2893,7 @@ blake256_8way_close_le(void *cc, void *dst)
// 14 rounds Blake, Decred
void blake256r14_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( cc, IV256, 14 );
}
void
@@ -2969,19 +2910,19 @@ blake256r14_4x32_close(void *cc, void *dst)
#if defined(__AVX2__)
void blake256r14_8way_init(void *cc)
void blake256r14_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}
void
blake256r14_8way_update(void *cc, const void *data, size_t len)
blake256r14_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256r14_8way_close(void *cc, void *dst)
blake256r14_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
@@ -2991,7 +2932,7 @@ blake256r14_8way_close(void *cc, void *dst)
// 8 rounds Blakecoin, Vanilla
void blake256r8_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 );
blake32_4x32_init( cc, IV256, 8 );
}
void
@@ -3008,19 +2949,19 @@ blake256r8_4x32_close(void *cc, void *dst)
#if defined (__AVX2__)
void blake256r8_8way_init(void *cc)
void blake256r8_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 );
blake32_8way_init( cc, IV256, 8 );
}
void
blake256r8_8way_update(void *cc, const void *data, size_t len)
blake256r8_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256r8_8way_close(void *cc, void *dst)
blake256r8_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}

View File

@@ -29,13 +29,6 @@ typedef struct
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1, int rounds );
/*
void blake256_init( blake256_context *sc );
void blake256_update( blake256_context *sc, const void *data, size_t len );
void blake256_close( blake256_context *sc, void *dst );
void blake256_full( blake256_context *sc, void *dst, const void *data,
size_t len );
*/
//////////////////////////////////
//
@@ -55,6 +48,10 @@ typedef blake_4x32_small_context blake256_4x32_context;
void blake256_4x32_init(void *ctx);
void blake256_4x32_update(void *ctx, const void *data, size_t len);
void blake256_4x32_close(void *ctx, void *dst);
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds
typedef blake_4x32_small_context blake256r14_4x32_context;
@@ -68,29 +65,6 @@ void blake256r8_4x32_init(void *cc);
void blake256r8_4x32_update(void *cc, const void *data, size_t len);
void blake256r8_4x32_close(void *cc, void *dst);
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
#define blake_4way_small_context blake256_4x32_context
#define blake256_4way_context blake256_4x32_context
#define blake256_4way_init blake256_4x32_init
#define blake256_4way_update blake256_4x32_update
#define blake256_4way_close blake256_4x32_close
#define blake256_4way_update_le blake256_4x32_update_le
#define blake256_4way_close_le blake256_4x32_close_le
#define blake256_4way_round0_prehash_le blake256_4x32_round0_prehash_le
#define blake256_4way_final_rounds_le blake256_4x32_final_rounds_le
#define blake256r14_4way_context blake256r14_4x32_context
#define blake256r14_4way_init blake256r14_4x32_init
#define blake256r14_4way_update blake256r14_4x32_update
#define blake256r14_4way_close blake256r14_4x32_close
#define blake256r8_4way_context blake256r14_4x32_context
#define blake256r8_4way_init blake256r14_4x32_init
#define blake256r8_4way_update blake256r14_4x32_update
#define blake256r8_4way_close blake256r14_4x32_close
#ifdef __AVX2__
//////////////////////////////
@@ -107,45 +81,28 @@ typedef struct
} blake_8way_small_context;
// Default 14 rounds
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
typedef blake_8way_small_context blake256_8x32_context;
void blake256_8x32_init(void *cc);
void blake256_8x32_update(void *cc, const void *data, size_t len);
void blake256_8x32_close(void *cc, void *dst);
void blake256_8x32_update_le(void *cc, const void *data, size_t len);
void blake256_8x32_close_le(void *cc, void *dst);
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
void blake256r14_8way_init(void *cc);
void blake256r14_8way_update(void *cc, const void *data, size_t len);
void blake256r14_8way_close(void *cc, void *dst);
typedef blake_8way_small_context blake256r14_8x32_context;
void blake256r14_8x32_init(void *cc);
void blake256r14_8x32_update(void *cc, const void *data, size_t len);
void blake256r14_8x32_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_8way_small_context blake256r8_8way_context;
void blake256r8_8way_init(void *cc);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
void blake256r8_8way_close(void *cc, void *dst);
#define blake_8x32_small_context blake256_8way_context
#define blake_8x32_init blake256_8way_init
#define blake_8x32_update blake256_8way_update
#define blake_8x32_close blake256_8way_close
#define blake_8x32_update_le blake256_8way_update_le
#define blake_8x32_close_le blake256_8way_close_le
#define blake_8x32_round0_prehash_le blake256_8way_round0_prehash
#define blake_8x32_final_rounds_le blake256_8way_final_rounds_le
#define blake256r14_8x32_context blake256r14_8way_context
#define blake256r14_8x32_init blake256r14_8way_init
#define blake256r14_8x32_update blake256r14_8way_update
#define blake256r14_8x32_close blake256r14_8way_close
#define blake256r8_8x32_context blake256r14_8way_context
#define blake256r8_8x32_init blake256r14_8way_init
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close
typedef blake_8way_small_context blake256r8_8x32_context;
void blake256r8_8x32_init(void *cc);
void blake256r8_8x32_update(void *cc, const void *data, size_t len);
void blake256r8_8x32_close(void *cc, void *dst);
#if defined(SIMD512)
@@ -163,46 +120,29 @@ typedef struct
} blake_16way_small_context __attribute__ ((aligned (128)));
// Default 14 rounds
typedef blake_16way_small_context blake256_16way_context;
void blake256_16way_init(void *cc);
void blake256_16way_update(void *cc, const void *data, size_t len);
void blake256_16way_close(void *cc, void *dst);
typedef blake_16way_small_context blake256_16x32_context;
void blake256_16x32_init(void *cc);
void blake256_16x32_update(void *cc, const void *data, size_t len);
void blake256_16x32_close(void *cc, void *dst);
// Expects data in little endian order, no byte swap needed
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16x32_update_le(void *cc, const void *data, size_t len);
void blake256_16x32_close_le(void *cc, void *dst);
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
typedef blake_16way_small_context blake256r14_16way_context;
void blake256r14_16way_init(void *cc);
void blake256r14_16way_update(void *cc, const void *data, size_t len);
void blake256r14_16way_close(void *cc, void *dst);
typedef blake_16way_small_context blake256r14_16x32_context;
void blake256r14_16x32_init(void *cc);
void blake256r14_16x32_update(void *cc, const void *data, size_t len);
void blake256r14_16x32_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_16way_small_context blake256r8_16way_context;
void blake256r8_16way_init(void *cc);
void blake256r8_16way_update(void *cc, const void *data, size_t len);
void blake256r8_16way_close(void *cc, void *dst);
#define blake_16x32_small_context blake256_16way_context
#define blake_16x32_init blake256_16way_init
#define blake_16x32_update blake256_16way_update
#define blake_16x32_close blake256_16way_close
#define blake_16x32_update_le blake256_16way_update_le
#define blake_16x32_close_le blake256_16way_close_le
#define blake_16x32_round0_prehash_le blake256_16way_round0_prehash
#define blake_16x32_final_rounds_le blake256_16way_final_rounds_le
#define blake256r14_16x32_context blake256r14_16way_context
#define blake256r14_16x32_init blake256r14_16way_init
#define blake256r14_16x32_update blake256r14_16way_update
#define blake256r14_16x32_close blake256r14_16way_close
#define blake256r8_16x32_context blake256r8_16way_context
#define blake256r8_16x32_init blake256r8_16way_init
#define blake256r8_16x32_update blake256r8_16way_update
#define blake256r8_16x32_close blake256r8_16way_close
typedef blake_16way_small_context blake256r8_16x32_context;
void blake256r8_16x32_init(void *cc);
void blake256r8_16x32_update(void *cc, const void *data, size_t len);
void blake256r8_16x32_close(void *cc, void *dst);
#endif // AVX512
#endif // AVX2

View File

@@ -14,7 +14,6 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
#if defined(SIMD512)
typedef struct ALIGN( 64 ) {
@@ -30,11 +29,6 @@ void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
#define blake2b_8way_ctx blake2b_8x64_ctx
#define blake2b_8way_init blake2b_8x64_init
#define blake2b_8way_update blake2b_8x64_update
#define blake2b_8way_final blake2b_8x64_final
#endif
#if defined(__AVX2__)
@@ -53,11 +47,6 @@ void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
#define blake2b_4way_ctx blake2b_4x64_ctx
#define blake2b_4way_init blake2b_4x64_init
#define blake2b_4way_update blake2b_4x64_update
#define blake2b_4way_final blake2b_4x64_final
#endif
#endif

View File

@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
blake2b_8x64_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -35,9 +35,9 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
blake2b_8way_init( &ctx );
blake2b_8way_update( &ctx, vdata, 80 );
blake2b_8way_final( &ctx, hash );
blake2b_8x64_init( &ctx );
blake2b_8x64_update( &ctx, vdata, 80 );
blake2b_8x64_final( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
@@ -61,10 +61,10 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
// Function not used, code inlined.
void blake2b_4way_hash(void *output, const void *input)
{
blake2b_4way_ctx ctx;
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, input, 80 );
blake2b_4way_final( &ctx, output );
blake2b_4x64_ctx ctx;
blake2b_4x64_init( &ctx );
blake2b_4x64_update( &ctx, input, 80 );
blake2b_4x64_final( &ctx, output );
}
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
@@ -73,7 +73,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
blake2b_4x64_ctx ctx __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -90,9 +90,9 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, vdata, 80 );
blake2b_4way_final( &ctx, hash );
blake2b_4x64_init( &ctx );
blake2b_4x64_update( &ctx, vdata, 80 );
blake2b_4x64_final( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )

View File

@@ -61,6 +61,11 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
const void *input, uint64_t inlen );
#define blake2s_4x32_state blake2s_4way_state
#define blake2s_4x32_init blake2s_4way_init
#define blake2s_4x32_update blake2s_4way_update
#define blake2s_4x32_final blake2s_4way_final
#define blake2s_4x32_full_blocks blake2s_4way_full_blocks
#if defined(__AVX2__)
@@ -81,6 +86,12 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
const void *input, uint64_t inlen );
#define blake2s_8x32_state blake2s_8way_state
#define blake2s_8x32_init blake2s_8way_init
#define blake2s_8x32_update blake2s_8way_update
#define blake2s_8x32_final blake2s_8way_final
#define blake2s_8x32_full_blocks blake2s_8way_full_blocks
#endif
#if defined(SIMD512)
@@ -100,6 +111,11 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
uint64_t inlen );
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
#define blake2s_16x32_state blake2s_16way_state
#define blake2s_16x32_init blake2s_16way_init
#define blake2s_16x32_update blake2s_16way_update
#define blake2s_16x32_final blake2s_16way_final
#endif
#if 0

View File

@@ -617,24 +617,22 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
VD = v512_64( CB5 ^ T0 ); \
VE = v512_64( CB6 ^ T1 ); \
VF = v512_64( CB7 ^ T1 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
M0 = mm512_bswap_64( *(buf+ 0) ); \
M1 = mm512_bswap_64( *(buf+ 1) ); \
M2 = mm512_bswap_64( *(buf+ 2) ); \
M3 = mm512_bswap_64( *(buf+ 3) ); \
M4 = mm512_bswap_64( *(buf+ 4) ); \
M5 = mm512_bswap_64( *(buf+ 5) ); \
M6 = mm512_bswap_64( *(buf+ 6) ); \
M7 = mm512_bswap_64( *(buf+ 7) ); \
M8 = mm512_bswap_64( *(buf+ 8) ); \
M9 = mm512_bswap_64( *(buf+ 9) ); \
MA = mm512_bswap_64( *(buf+10) ); \
MB = mm512_bswap_64( *(buf+11) ); \
MC = mm512_bswap_64( *(buf+12) ); \
MD = mm512_bswap_64( *(buf+13) ); \
ME = mm512_bswap_64( *(buf+14) ); \
MF = mm512_bswap_64( *(buf+15) ); \
ROUND_B_8WAY(0); \
ROUND_B_8WAY(1); \
ROUND_B_8WAY(2); \
@@ -661,7 +659,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
H7 = mm512_xor3( VF, V7, H7 ); \
}
void blake512_8way_compress( blake_8way_big_context *sc )
void blake512_8x64_compress( blake_8x64_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -685,25 +683,22 @@ void blake512_8way_compress( blake_8way_big_context *sc )
VE = v512_64( CB6 ^ sc->T1 );
VF = v512_64( CB7 ^ sc->T1 );
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
M0 = mm512_bswap_64( sc->buf[ 0] );
M1 = mm512_bswap_64( sc->buf[ 1] );
M2 = mm512_bswap_64( sc->buf[ 2] );
M3 = mm512_bswap_64( sc->buf[ 3] );
M4 = mm512_bswap_64( sc->buf[ 4] );
M5 = mm512_bswap_64( sc->buf[ 5] );
M6 = mm512_bswap_64( sc->buf[ 6] );
M7 = mm512_bswap_64( sc->buf[ 7] );
M8 = mm512_bswap_64( sc->buf[ 8] );
M9 = mm512_bswap_64( sc->buf[ 9] );
MA = mm512_bswap_64( sc->buf[10] );
MB = mm512_bswap_64( sc->buf[11] );
MC = mm512_bswap_64( sc->buf[12] );
MD = mm512_bswap_64( sc->buf[13] );
ME = mm512_bswap_64( sc->buf[14] );
MF = mm512_bswap_64( sc->buf[15] );
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
@@ -733,7 +728,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
}
// won't be used after prehash implemented
void blake512_8way_compress_le( blake_8x64_big_context *sc )
void blake512_8x64_compress_le( blake_8x64_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1177,7 +1172,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
blake512_8x64_compress( sc );
sc->ptr = 0;
}
@@ -1213,7 +1208,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
blake512_8x64_compress( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
@@ -1244,7 +1239,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
blake512_8x64_compress_le( sc );
sc->ptr = 0;
}
@@ -1280,7 +1275,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
blake512_8x64_compress_le( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
@@ -1355,24 +1350,22 @@ blake512_8x64_close(void *cc, void *dst)
VD = v256_64( CB5 ^ T0 ); \
VE = v256_64( CB6 ^ T1 ); \
VF = v256_64( CB7 ^ T1 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
M0 = mm256_bswap_64( *(buf+ 0) ); \
M1 = mm256_bswap_64( *(buf+ 1) ); \
M2 = mm256_bswap_64( *(buf+ 2) ); \
M3 = mm256_bswap_64( *(buf+ 3) ); \
M4 = mm256_bswap_64( *(buf+ 4) ); \
M5 = mm256_bswap_64( *(buf+ 5) ); \
M6 = mm256_bswap_64( *(buf+ 6) ); \
M7 = mm256_bswap_64( *(buf+ 7) ); \
M8 = mm256_bswap_64( *(buf+ 8) ); \
M9 = mm256_bswap_64( *(buf+ 9) ); \
MA = mm256_bswap_64( *(buf+10) ); \
MB = mm256_bswap_64( *(buf+11) ); \
MC = mm256_bswap_64( *(buf+12) ); \
MD = mm256_bswap_64( *(buf+13) ); \
ME = mm256_bswap_64( *(buf+14) ); \
MF = mm256_bswap_64( *(buf+15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -1400,7 +1393,7 @@ blake512_8x64_close(void *cc, void *dst)
}
void blake512_4way_compress( blake_4x64_big_context *sc )
void blake512_4x64_compress( blake_4x64_big_context *sc )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1423,25 +1416,23 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
VD = v256_64( CB5 ^ sc->T0 );
VE = v256_64( CB6 ^ sc->T1 );
VF = v256_64( CB7 ^ sc->T1 );
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
M0 = mm256_bswap_64( sc->buf[ 0] );
M1 = mm256_bswap_64( sc->buf[ 1] );
M2 = mm256_bswap_64( sc->buf[ 2] );
M3 = mm256_bswap_64( sc->buf[ 3] );
M4 = mm256_bswap_64( sc->buf[ 4] );
M5 = mm256_bswap_64( sc->buf[ 5] );
M6 = mm256_bswap_64( sc->buf[ 6] );
M7 = mm256_bswap_64( sc->buf[ 7] );
M8 = mm256_bswap_64( sc->buf[ 8] );
M9 = mm256_bswap_64( sc->buf[ 9] );
MA = mm256_bswap_64( sc->buf[10] );
MB = mm256_bswap_64( sc->buf[11] );
MC = mm256_bswap_64( sc->buf[12] );
MD = mm256_bswap_64( sc->buf[13] );
ME = mm256_bswap_64( sc->buf[14] );
MF = mm256_bswap_64( sc->buf[15] );
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
@@ -1470,7 +1461,7 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
}
void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
void blake512_4x64_prehash_le( blake512_4x64_context *sc, __m256i *midstate,
const void *data )
{
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
@@ -1562,7 +1553,7 @@ void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
midstate[15] = VF;
}
void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
void blake512_4x64_final_le( blake512_4x64_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
@@ -1685,7 +1676,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
}
void blake512_4x64_init( blake_4x64_big_context *sc )
void blake512_4x64_init( blake512_4x64_context *sc )
{
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
@@ -1798,7 +1789,7 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
}
// init, update & close
void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
void blake512_4x64_full( blake512_4x64_context *sc, void * dst,
const void *data, size_t len )
{
@@ -1824,7 +1815,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
blake512_4x64_compress( sc );
sc->ptr = 0;
}
@@ -1859,7 +1850,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
blake512_4x64_compress( sc );
mm256_block_bswap_64( (__m256i*)dst, sc->H );
}
@@ -1934,29 +1925,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
VE = v128_64( CB6 ^ sc->T1 );
VF = v128_64( CB7 ^ sc->T1 );
#if defined(__SSSE3__)
const v128u64_t shuf_bswap64 = v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 );
M0 = v128_shuffle8( sc->buf[ 0], shuf_bswap64 );
M1 = v128_shuffle8( sc->buf[ 1], shuf_bswap64 );
M2 = v128_shuffle8( sc->buf[ 2], shuf_bswap64 );
M3 = v128_shuffle8( sc->buf[ 3], shuf_bswap64 );
M4 = v128_shuffle8( sc->buf[ 4], shuf_bswap64 );
M5 = v128_shuffle8( sc->buf[ 5], shuf_bswap64 );
M6 = v128_shuffle8( sc->buf[ 6], shuf_bswap64 );
M7 = v128_shuffle8( sc->buf[ 7], shuf_bswap64 );
M8 = v128_shuffle8( sc->buf[ 8], shuf_bswap64 );
M9 = v128_shuffle8( sc->buf[ 9], shuf_bswap64 );
MA = v128_shuffle8( sc->buf[10], shuf_bswap64 );
MB = v128_shuffle8( sc->buf[11], shuf_bswap64 );
MC = v128_shuffle8( sc->buf[12], shuf_bswap64 );
MD = v128_shuffle8( sc->buf[13], shuf_bswap64 );
ME = v128_shuffle8( sc->buf[14], shuf_bswap64 );
MF = v128_shuffle8( sc->buf[15], shuf_bswap64 );
#else // SSE2 & NEON
M0 = v128_bswap64( sc->buf[ 0] );
M1 = v128_bswap64( sc->buf[ 1] );
M2 = v128_bswap64( sc->buf[ 2] );
@@ -1974,8 +1942,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
ME = v128_bswap64( sc->buf[14] );
MF = v128_bswap64( sc->buf[15] );
#endif
ROUND_B_2X64(0);
ROUND_B_2X64(1);
ROUND_B_2X64(2);

View File

@@ -54,10 +54,10 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
blake256_16x32_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -123,10 +123,10 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -148,16 +148,16 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
#elif defined (BLAKECOIN_4WAY)
blake256r8_4way_context blakecoin_4w_ctx;
blake256r8_4x32_context blakecoin_4w_ctx;
void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context ctx;
blake256r8_4x32_context ctx;
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
blake256r8_4way_update( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
blake256r8_4x32_update( &ctx, input + (64<<2), 16 );
blake256r8_4x32_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -178,8 +178,8 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
HTarget = 0x7f;
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
blake256r8_4x32_init( &blakecoin_4w_ctx );
blake256r8_4x32_update( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );

View File

@@ -16,28 +16,27 @@ extern void pentablakehash_4way( void *output, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
blake512_4way_context ctx;
blake512_4x64_context ctx;
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, input, 80 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, input, 80 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );

View File

@@ -227,7 +227,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
v128_t *V = (v128_t*)v;
@@ -263,19 +263,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
V[3] = v128_swap64( V[3] ); \
V[2] = v128_shufll32( V[2] )
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
#undef BLAKE2S_ROUND
#else
#define G(r,i,a,b,c,d) \
@@ -290,7 +277,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
b = SPH_ROTR32(b ^ c, 7); \
} while(0)
#define ROUND(r) \
#define BLAKE2S_ROUND(r) \
do { \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -302,24 +289,25 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );
#endif
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
for( size_t i = 0; i < 8; ++i )
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
#undef G
#undef ROUND
#undef BLAKE2S_ROUND
return 0;
}