Mirror of https://github.com/JayDDee/cpuminer-opt.git, synced 2025-09-17 23:44:27 +00:00

v3.21.0
@@ -115,7 +115,7 @@ void blake256_8way_close(void *cc, void *dst);
 void blake256_8way_update_le(void *cc, const void *data, size_t len);
 void blake256_8way_close_le(void *cc, void *dst);
 void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
-                                      const void *data );
+                                      void *data );
 void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
                                     const void *midhash, const void *data );
 
@@ -178,7 +178,7 @@ void blake256_16way_close(void *cc, void *dst);
 void blake256_16way_update_le(void *cc, const void *data, size_t len);
 void blake256_16way_close_le(void *cc, void *dst);
 void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
-                                       const void *data );
+                                       void *data );
 void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
                                      const void *midhash, const void *data );
 
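For orientation: these paired *_round0_prehash_le / *_final_rounds_le entry points split
the second 80-byte block between nonce-independent work, done once, and nonce-dependent
work, done per vector of nonces. A minimal usage sketch (hypothetical driver; the
variable names only illustrate the scanhash_* call sites later in this commit):

   __m256i midstate_vars[16];  // V[0..15] left by the partial round 0
   __m256i block0_hash[8];     // 8-way hash of the first 64-byte block
   __m256i block_buf[16];      // vectored second block, nonces in block_buf[3]
   __m256i hash32[8];          // per-lane final hash (hypothetical name)

   // once per work unit:
   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
   // per scan iteration, after refreshing the nonces in block_buf[3]:
   blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash, block_buf );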
@@ -668,6 +668,258 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
    GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
 }
 
+// Short cut message expansion when the message data is known to be zero.
+// M[ 5:12, 14 ] are zero padded for the second block of 80 byte data.
+
+#define G256_8WAY_ALT( a, b, c, d, m0, m1 ) \
+{ \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m0 ); \
+   d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m1 ); \
+   d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
+}
+
+// Message expansion optimized for each round.
+#define ROUND256_8WAY_0 \
+{ \
+   G256_8WAY_ALT( V0, V4, V8, VC, \
+                  _mm256_xor_si256( M0, _mm256_set1_epi32( CS1 ) ), \
+                  _mm256_xor_si256( M1, _mm256_set1_epi32( CS0 ) ) ); \
+   G256_8WAY_ALT( V1, V5, V9, VD, \
+                  _mm256_xor_si256( M2, _mm256_set1_epi32( CS3 ) ), \
+                  _mm256_xor_si256( M3, _mm256_set1_epi32( CS2 ) ) ); \
+   G256_8WAY_ALT( V2, V6, VA, VE, \
+                  _mm256_xor_si256( M4, _mm256_set1_epi32( CS5 ) ), \
+                  _mm256_set1_epi32( CS4 ) ); \
+   G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CS7 ) , \
+                  _mm256_set1_epi32( CS6 ) ); \
+   G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS9 ) , \
+                  _mm256_set1_epi32( CS8 ) ); \
+   G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSB ) , \
+                  _mm256_set1_epi32( CSA ) ); \
+   G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CSD ) , \
+                  _mm256_xor_si256( MD, _mm256_set1_epi32( CSC ) ) ); \
+   G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CSF ) , \
+                  _mm256_xor_si256( MF, _mm256_set1_epi32( CSE ) ) ); \
+}
+
+#define ROUND256_8WAY_1 \
+{ \
+   G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CSA ) , \
+                  _mm256_set1_epi32( CSE ) ); \
+   G256_8WAY_ALT( V1, V5, V9, VD, \
+                  _mm256_xor_si256( M4, _mm256_set1_epi32( CS8 ) ), \
+                  _mm256_set1_epi32( CS4 ) ); \
+   G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CSF ) , \
+                  _mm256_xor_si256( MF, _mm256_set1_epi32( CS9 ) ) ); \
+   G256_8WAY_ALT( V3, V7, VB, VF, \
+                  _mm256_xor_si256( MD, _mm256_set1_epi32( CS6 ) ), \
+                  _mm256_set1_epi32( CSD ) ); \
+   G256_8WAY_ALT( V0, V5, VA, VF, \
+                  _mm256_xor_si256( M1, _mm256_set1_epi32( CSC ) ), \
+                  _mm256_set1_epi32( CS1 ) ); \
+   G256_8WAY_ALT( V1, V6, VB, VC, \
+                  _mm256_xor_si256( M0, _mm256_set1_epi32( CS2 ) ), \
+                  _mm256_xor_si256( M2, _mm256_set1_epi32( CS0 ) ) ); \
+   G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS7 ) , \
+                  _mm256_set1_epi32( CSB ) ); \
+   G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS3 ) , \
+                  _mm256_xor_si256( M3, _mm256_set1_epi32( CS5 ) ) ); \
+}
+
+#define ROUND256_8WAY_2 \
+{ \
+   G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS8 ) , \
+                  _mm256_set1_epi32( CSB ) ); \
+   G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS0 ) , \
+                  _mm256_xor_si256( M0, _mm256_set1_epi32( CSC ) ) ); \
+   G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS2 ) , \
+                  _mm256_xor_si256( M2, _mm256_set1_epi32( CS5 ) ) ); \
+   G256_8WAY_ALT( V3, V7, VB, VF, \
+                  _mm256_xor_si256( MF, _mm256_set1_epi32( CSD ) ), \
+                  _mm256_xor_si256( MD, _mm256_set1_epi32( CSF ) ) ); \
+   G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CSE ) , \
+                  _mm256_set1_epi32( CSA ) ); \
+   G256_8WAY_ALT( V1, V6, VB, VC, \
+                  _mm256_xor_si256( M3, _mm256_set1_epi32( CS6 ) ), \
+                  _mm256_set1_epi32( CS3 ) ); \
+   G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS1 ) , \
+                  _mm256_xor_si256( M1, _mm256_set1_epi32( CS7 ) ) ); \
+   G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS4 ) , \
+                  _mm256_xor_si256( M4, _mm256_set1_epi32( CS9 ) ) ); \
+}
+
+#define ROUND256_8WAY_3 \
+{ \
+   G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS9 ) , \
+                  _mm256_set1_epi32( CS7 ) ); \
+   G256_8WAY_ALT( V1, V5, V9, VD, \
+                  _mm256_xor_si256( M3, _mm256_set1_epi32( CS1 ) ), \
+                  _mm256_xor_si256( M1, _mm256_set1_epi32( CS3 ) ) ); \
+   G256_8WAY_ALT( V2, V6, VA, VE, \
+                  _mm256_xor_si256( MD, _mm256_set1_epi32( CSC ) ), \
+                  _mm256_set1_epi32( CSD ) ); \
+   G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CSE ) , \
+                  _mm256_set1_epi32( CSB ) ); \
+   G256_8WAY_ALT( V0, V5, VA, VF, \
+                  _mm256_xor_si256( M2, _mm256_set1_epi32( CS6 ) ), \
+                  _mm256_set1_epi32( CS2 ) ); \
+   G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSA ) , \
+                  _mm256_set1_epi32( CS5 ) ); \
+   G256_8WAY_ALT( V2, V7, V8, VD, \
+                  _mm256_xor_si256( M4, _mm256_set1_epi32( CS0 ) ), \
+                  _mm256_xor_si256( M0, _mm256_set1_epi32( CS4 ) ) ); \
+   G256_8WAY_ALT( V3, V4, V9, VE, \
+                  _mm256_xor_si256( MF, _mm256_set1_epi32( CS8 ) ), \
+                  _mm256_set1_epi32( CSF ) ); \
+}
+
+#define ROUND256_8WAY_4 \
+{ \
+   G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS0 ) , \
+                  _mm256_xor_si256( M0, _mm256_set1_epi32( CS9 ) ) ); \
+   G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS7 ) , \
+                  _mm256_set1_epi32( CS5 ) ); \
+   G256_8WAY_ALT( V2, V6, VA, VE, \
+                  _mm256_xor_si256( M2, _mm256_set1_epi32( CS4 ) ), \
+                  _mm256_xor_si256( M4, _mm256_set1_epi32( CS2 ) ) ); \
+   G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CSF ) , \
+                  _mm256_xor_si256( MF, _mm256_set1_epi32( CSA ) ) ); \
+   G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS1 ) , \
+                  _mm256_xor_si256( M1, _mm256_set1_epi32( CSE ) ) ); \
+   G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSC ) , \
+                  _mm256_set1_epi32( CSB ) ); \
+   G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS8 ) , \
+                  _mm256_set1_epi32( CS6 ) ); \
+   G256_8WAY_ALT( V3, V4, V9, VE, \
+                  _mm256_xor_si256( M3, _mm256_set1_epi32( CSD ) ), \
+                  _mm256_xor_si256( MD, _mm256_set1_epi32( CS3 ) ) ); \
+}
+
+#define ROUND256_8WAY_5 \
+{ \
+   G256_8WAY_ALT( V0, V4, V8, VC, \
+                  _mm256_xor_si256( M2, _mm256_set1_epi32( CSC ) ), \
+                  _mm256_set1_epi32( CS2 ) ); \
+   G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CSA ) , \
+                  _mm256_set1_epi32( CS6 ) ); \
+   G256_8WAY_ALT( V2, V6, VA, VE, \
+                  _mm256_xor_si256( M0, _mm256_set1_epi32( CSB ) ), \
+                  _mm256_set1_epi32( CS0 ) ); \
+   G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CS3 ) , \
+                  _mm256_xor_si256( M3, _mm256_set1_epi32( CS8 ) ) ); \
+   G256_8WAY_ALT( V0, V5, VA, VF, \
+                  _mm256_xor_si256( M4, _mm256_set1_epi32( CSD ) ), \
+                  _mm256_xor_si256( MD, _mm256_set1_epi32( CS4 ) ) ); \
+   G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CS5 ) , \
+                  _mm256_set1_epi32( CS7 ) ); \
+   G256_8WAY_ALT( V2, V7, V8, VD, \
+                  _mm256_xor_si256( MF, _mm256_set1_epi32( CSE ) ), \
+                  _mm256_set1_epi32( CSF ) ); \
+   G256_8WAY_ALT( V3, V4, V9, VE, \
+                  _mm256_xor_si256( M1, _mm256_set1_epi32( CS9 ) ), \
+                  _mm256_set1_epi32( CS1 ) ); \
+}
+
+#define ROUND256_8WAY_6 \
+{ \
+   G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS5 ) , \
+                  _mm256_set1_epi32( CSC ) ); \
+   G256_8WAY_ALT( V1, V5, V9, VD, \
+                  _mm256_xor_si256( M1, _mm256_set1_epi32( CSF ) ), \
+                  _mm256_xor_si256( MF, _mm256_set1_epi32( CS1 ) ) ); \
+   G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CSD ) , \
+                  _mm256_xor_si256( MD, _mm256_set1_epi32( CSE ) ) );\
+   G256_8WAY_ALT( V3, V7, VB, VF, \
+                  _mm256_xor_si256( M4, _mm256_set1_epi32( CSA ) ), \
+                  _mm256_set1_epi32( CS4 ) ); \
+   G256_8WAY_ALT( V0, V5, VA, VF, \
+                  _mm256_xor_si256( M0, _mm256_set1_epi32( CS7 ) ), \
+                  _mm256_set1_epi32( CS0 ) ); \
+   G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CS3 ) , \
+                  _mm256_xor_si256( M3, _mm256_set1_epi32( CS6 ) ) ); \
+   G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS2 ) , \
+                  _mm256_xor_si256( M2, _mm256_set1_epi32( CS9 ) ) ); \
+   G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CSB ) , \
+                  _mm256_set1_epi32( CS8 ) ); \
+}
+
+#define ROUND256_8WAY_7 \
+{ \
+   G256_8WAY_ALT( V0, V4, V8, VC, \
+                  _mm256_xor_si256( MD, _mm256_set1_epi32( CSB ) ), \
+                  _mm256_set1_epi32( CSD ) ); \
+   G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CSE ) , \
+                  _mm256_set1_epi32( CS7 ) ); \
+   G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS1 ) , \
+                  _mm256_xor_si256( M1, _mm256_set1_epi32( CSC ) ) ); \
+   G256_8WAY_ALT( V3, V7, VB, VF, \
+                  _mm256_xor_si256( M3, _mm256_set1_epi32( CS9 ) ), \
+                  _mm256_set1_epi32( CS3 ) ); \
+   G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS0 ) , \
+                  _mm256_xor_si256( M0, _mm256_set1_epi32( CS5 ) ) ); \
+   G256_8WAY_ALT( V1, V6, VB, VC, \
+                  _mm256_xor_si256( MF, _mm256_set1_epi32( CS4 ) ), \
+                  _mm256_xor_si256( M4, _mm256_set1_epi32( CSF ) ) ); \
+   G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS6 ) , \
+                  _mm256_set1_epi32( CS8 ) ); \
+   G256_8WAY_ALT( V3, V4, V9, VE, \
+                  _mm256_xor_si256( M2, _mm256_set1_epi32( CSA ) ), \
+                  _mm256_set1_epi32( CS2 ) ); \
+}
+
+#define ROUND256_8WAY_8 \
+{ \
+   G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CSF ), \
+                  _mm256_xor_si256( MF, _mm256_set1_epi32( CS6 ) ) ); \
+   G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS9 ) , \
+                  _mm256_set1_epi32( CSE ) ); \
+   G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS3 ) , \
+                  _mm256_xor_si256( M3, _mm256_set1_epi32( CSB ) ) ); \
+   G256_8WAY_ALT( V3, V7, VB, VF, \
+                  _mm256_xor_si256( M0, _mm256_set1_epi32( CS8 ) ), \
+                  _mm256_set1_epi32( CS0 ) ); \
+   G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS2 ) , \
+                  _mm256_xor_si256( M2, _mm256_set1_epi32( CSC ) ) ); \
+   G256_8WAY_ALT( V1, V6, VB, VC, \
+                  _mm256_xor_si256( MD, _mm256_set1_epi32( CS7 ) ), \
+                  _mm256_set1_epi32( CSD ) ); \
+   G256_8WAY_ALT( V2, V7, V8, VD, \
+                  _mm256_xor_si256( M1, _mm256_set1_epi32( CS4 ) ), \
+                  _mm256_xor_si256( M4, _mm256_set1_epi32( CS1 ) ) ); \
+   G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS5 ) , \
+                  _mm256_set1_epi32( CSA ) ); \
+}
+
+#define ROUND256_8WAY_9 \
+{ \
+   G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS2 ) , \
+                  _mm256_xor_si256( M2, _mm256_set1_epi32( CSA ) ) ); \
+   G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS4 ) , \
+                  _mm256_xor_si256( M4, _mm256_set1_epi32( CS8 ) ) ); \
+   G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS6 ) , \
+                  _mm256_set1_epi32( CS7 ) ); \
+   G256_8WAY_ALT( V3, V7, VB, VF, \
+                  _mm256_xor_si256( M1, _mm256_set1_epi32( CS5 ) ), \
+                  _mm256_set1_epi32( CS1 ) ); \
+   G256_8WAY_ALT( V0, V5, VA, VF, \
+                  _mm256_xor_si256( MF, _mm256_set1_epi32( CSB ) ), \
+                  _mm256_set1_epi32( CSF ) ); \
+   G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSE ) , \
+                  _mm256_set1_epi32( CS9 ) ); \
+   G256_8WAY_ALT( V2, V7, V8, VD, \
+                  _mm256_xor_si256( M3, _mm256_set1_epi32( CSC ) ), \
+                  _mm256_set1_epi32( CS3 ) ); \
+   G256_8WAY_ALT( V3, V4, V9, VE, \
+                  _mm256_xor_si256( MD, _mm256_set1_epi32( CS0 ) ), \
+                  _mm256_xor_si256( M0, _mm256_set1_epi32( CSD ) ) ); \
+}
+
+
 #define DECL_STATE32_8WAY \
    __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
    sph_u32 T0, T1;
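As a cross-check of the macros above: G256_8WAY_ALT is the standard BLAKE-256 G step
with the (message XOR constant) terms hoisted into its m0/m1 arguments, and with the
16-bit and 8-bit rotations expressed as shuffles. A scalar sketch of the same step,
for reference only:

   #include <stdint.h>

   static inline uint32_t rotr32( uint32_t x, unsigned n )
   {  return ( x >> n ) | ( x << ( 32 - n ) );  }

   // m0/m1 already hold (M[x] ^ CS[y]) and (M[y] ^ CS[x]), exactly as the
   // ROUND256_8WAY_* macros pass them; terms whose message word is zero
   // reduce to the bare constant, which is the point of the _ALT variant.
   static void blake256_g( uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                           uint32_t m0, uint32_t m1 )
   {
      *a += *b + m0;   *d = rotr32( *d ^ *a, 16 );
      *c += *d;        *b = rotr32( *b ^ *c, 12 );
      *a += *b + m1;   *d = rotr32( *d ^ *a,  8 );
      *c += *d;        *b = rotr32( *b ^ *c,  7 );
   }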
@@ -834,9 +1086,9 @@ do { \
 }
 
 void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
-                                      const void *data )
+                                      void *data )
 {
-   const __m256i *M = (const __m256i*)data;
+   __m256i *M = (__m256i*)data;
    __m256i *V = (__m256i*)midstate;
    const __m256i *H = (const __m256i*)midhash;
 
@@ -857,6 +1109,17 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
    V[14] = m256_const1_32( CS6 );
    V[15] = m256_const1_32( CS7 );
 
+   // M[ 0:3 ] contain new message data including unique nonces in M[ 3].
+   // M[ 5:12, 14 ] are always zero and not needed or used.
+   // M[ 4], M[ 13], M[15] are constant and are initialized here.
+   // M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
+
+   M[ 4] = m256_const1_32( 0x80000000 );
+   M[13] = m256_one_32;
+   M[15] = m256_const1_32( 80*8 );
+
+   M[ 5] =_mm256_xor_si256( M[13], _mm256_set1_epi32( CSC ) );
+
    // G0
    GS_8WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
 
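The const qualifier on data is dropped because the prehash now writes the constant
message words itself, as the hunk above shows. Pieced together from the scanhash_*
hunks later in this commit, the per-lane second-block layout it assumes is:

   // M[ 0..2]           last 12 bytes of header data (pdata[16..18])
   // M[ 3]              unique per-lane nonces, filled in by the caller
   // M[ 4]  0x80000000  first padding word, set by the prehash
   // M[ 5..12], M[14]   zero padding (M[ 5] recycled as the MD^CSC cache)
   // M[13]  1           final padding marker, set by the prehash
   // M[15]  80*8        message length in bits, set by the prehash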
@@ -868,21 +1131,45 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
    V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
    V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
 
-   // G2,G3
-   GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
-   GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
+   // G2
+// GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
+   V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
+                        _mm256_xor_si256( _mm256_set1_epi32( CS5 ), M[ 4] ) );
+   V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 2] ) );
+   V[10] = _mm256_add_epi32( V[10], V[14] );
+   V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 12 );
+   V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
+                        _mm256_set1_epi32( CS4 ) );
+   V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 2] ), 8 );
+   V[10] = _mm256_add_epi32( V[10], V[14] );
+   V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 7 );
+
+   // G3
+// GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
+   V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
+                        _mm256_set1_epi32( CS7 ) );
+   V[15] = mm256_swap32_16( _mm256_xor_si256( V[15], V[ 3] ) );
+   V[11] = _mm256_add_epi32( V[11], V[15] );
+   V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 12 );
+   V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
+                        _mm256_set1_epi32( CS6 ) );
+   V[15] = mm256_ror_32( _mm256_xor_si256( V[15], V[ 3] ), 8 );
+   V[11] = _mm256_add_epi32( V[11], V[15] );
+   V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 7 );
 
    // G4
-   V[ 0] = _mm256_add_epi32( V[ 0],
-                        _mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) );
+   V[ 0] = _mm256_add_epi32( V[ 0], _mm256_set1_epi32( CS9 ) );
 
    // G5
 // GS_8WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
 
    // G6
    V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 7] ),
-                        _mm256_xor_si256( _mm256_set1_epi32( CSD ), M[12] ) );
+                        _mm256_set1_epi32( CSD ) );
 
    // G7
    V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
-                        _mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
+                        _mm256_set1_epi32( CSF ) );
    V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) );
    V[ 3] = _mm256_add_epi32( V[ 3],
                         _mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
@@ -893,47 +1180,40 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
 {
    __m256i *H = (__m256i*)final_hash;
    const __m256i *h = (const __m256i*)midhash;
-   const __m256i *v= (const __m256i*)midstate;
    __m256i V0, V1, V2, V3, V4, V5, V6, V7;
    __m256i V8, V9, VA, VB, VC, VD, VE, VF;
-   __m256i M0, M1, M2, M3, M4, M5, M6, M7;
-   __m256i M8, M9, MA, MB, MC, MD, ME, MF;
+   __m256i M0, M1, M2, M3, M4, MD, MF;
+   __m256i MDxorCSC;
 
-   V0 = v[ 0];
-   V1 = v[ 1];
-   V2 = v[ 2];
-   V3 = v[ 3];
-   V4 = v[ 4];
-   V5 = v[ 5];
-   V6 = v[ 6];
-   V7 = v[ 7];
-   V8 = v[ 8];
-   V9 = v[ 9];
-   VA = v[10];
-   VB = v[11];
-   VC = v[12];
-   VD = v[13];
-   VE = v[14];
-   VF = v[15];
+   V0 = _mm256_load_si256( (__m256i*)midstate + 0 );
+   V1 = _mm256_load_si256( (__m256i*)midstate + 1 );
+   V2 = _mm256_load_si256( (__m256i*)midstate + 2 );
+   V3 = _mm256_load_si256( (__m256i*)midstate + 3 );
+   V4 = _mm256_load_si256( (__m256i*)midstate + 4 );
+   V5 = _mm256_load_si256( (__m256i*)midstate + 5 );
+   V6 = _mm256_load_si256( (__m256i*)midstate + 6 );
+   V7 = _mm256_load_si256( (__m256i*)midstate + 7 );
+   V8 = _mm256_load_si256( (__m256i*)midstate + 8 );
+   V9 = _mm256_load_si256( (__m256i*)midstate + 9 );
+   VA = _mm256_load_si256( (__m256i*)midstate + 10 );
+   VB = _mm256_load_si256( (__m256i*)midstate + 11 );
+   VC = _mm256_load_si256( (__m256i*)midstate + 12 );
+   VD = _mm256_load_si256( (__m256i*)midstate + 13 );
+   VE = _mm256_load_si256( (__m256i*)midstate + 14 );
+   VF = _mm256_load_si256( (__m256i*)midstate + 15 );
 
-   M0 = casti_m256i( data, 0 );
-   M1 = casti_m256i( data, 1 );
-   M2 = casti_m256i( data, 2 );
-   M3 = casti_m256i( data, 3 );
-   M4 = casti_m256i( data, 4 );
-   M5 = casti_m256i( data, 5 );
-   M6 = casti_m256i( data, 6 );
-   M7 = casti_m256i( data, 7 );
-   M8 = casti_m256i( data, 8 );
-   M9 = casti_m256i( data, 9 );
-   MA = casti_m256i( data, 10 );
-   MB = casti_m256i( data, 11 );
-   MC = casti_m256i( data, 12 );
-   MD = casti_m256i( data, 13 );
-   ME = casti_m256i( data, 14 );
-   MF = casti_m256i( data, 15 );
-
-   // Finish round 0
+   M0 = _mm256_load_si256( (__m256i*)data + 0 );
+   M1 = _mm256_load_si256( (__m256i*)data + 1 );
+   M2 = _mm256_load_si256( (__m256i*)data + 2 );
+   M3 = _mm256_load_si256( (__m256i*)data + 3 );
+   M4 = _mm256_load_si256( (__m256i*)data + 4 );
+   // M5 to MC & ME zero padding & optimised out.
+   MD = _mm256_load_si256( (__m256i*)data + 13 );
+   MF = _mm256_load_si256( (__m256i*)data + 15 );
+   // precalculated MD^CSC, used in round0 G6.
+   MDxorCSC = _mm256_load_si256( (__m256i*)data + 5 );
+
+   // Finish round 0 with nonce in M3
    // G1
    V1 = _mm256_add_epi32( V1,
                  _mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
@@ -947,20 +1227,29 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
    VA = _mm256_add_epi32( VA, VF );
    V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
    V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
-                          _mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
+                          _mm256_set1_epi32( CS8 ) ) );
    VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) );
    VA = _mm256_add_epi32( VA, VF );
    V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
 
    // G5
-   GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
+// GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
+   V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
+                          _mm256_set1_epi32( CSB ) );
+   VC = mm256_swap32_16( _mm256_xor_si256( VC, V1 ) );
+   VB = _mm256_add_epi32( VB, VC );
+   V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 12 );
+   V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
+                          _mm256_set1_epi32( CSA ) );
+   VC = mm256_ror_32( _mm256_xor_si256( VC, V1 ), 8 );
+   VB = _mm256_add_epi32( VB, VC );
+   V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 7 );
 
    // G6
    VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) );
    V8 = _mm256_add_epi32( V8, VD );
    V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
-   V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
-                          _mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
+   V2 = _mm256_add_epi32( V2, _mm256_add_epi32( V7, MDxorCSC ) );
    VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) );
    V8 = _mm256_add_epi32( V8, VD );
    V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
@@ -974,19 +1263,19 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
    V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
 
    // Remaining rounds
-   ROUND_S_8WAY( 1 );
-   ROUND_S_8WAY( 2 );
-   ROUND_S_8WAY( 3 );
-   ROUND_S_8WAY( 4 );
-   ROUND_S_8WAY( 5 );
-   ROUND_S_8WAY( 6 );
-   ROUND_S_8WAY( 7 );
-   ROUND_S_8WAY( 8 );
-   ROUND_S_8WAY( 9 );
-   ROUND_S_8WAY( 0 );
-   ROUND_S_8WAY( 1 );
-   ROUND_S_8WAY( 2 );
-   ROUND_S_8WAY( 3 );
+   ROUND256_8WAY_1;
+   ROUND256_8WAY_2;
+   ROUND256_8WAY_3;
+   ROUND256_8WAY_4;
+   ROUND256_8WAY_5;
+   ROUND256_8WAY_6;
+   ROUND256_8WAY_7;
+   ROUND256_8WAY_8;
+   ROUND256_8WAY_9;
+   ROUND256_8WAY_0;
+   ROUND256_8WAY_1;
+   ROUND256_8WAY_2;
+   ROUND256_8WAY_3;
 
    const __m256i shuf_bswap32 =
           m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
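A note on the counts above: BLAKE-256 runs 14 rounds, and its sigma permutation
schedule has period 10, so rounds 10 to 13 reuse the round 0 to 3 message orderings.
With round 0 split across the prehash and the inline code above, the remaining 13
rounds are exactly the sequence ROUND256_8WAY_1 .. _9 followed by _0 .. _3.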
@@ -1010,6 +1299,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
 //
 // Blake-256 16 way AVX512
 
+// Generic with full inline message expansion
 #define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
 { \
    a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
@@ -1036,6 +1326,257 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
    GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
 }
 
+// Short cut message expansion when the message data is known to be zero.
+// M[ 5:12, 14 ] are zero padded for the second block of 80 byte data.
+
+#define G256_16WAY_ALT( a, b, c, d, m0, m1 ) \
+{ \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m0 ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
+   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m1 ); \
+   d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
+   c = _mm512_add_epi32( c, d ); \
+   b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
+}
+
+// Message expansion optimized for each round.
+#define ROUND256_16WAY_0 \
+{ \
+   G256_16WAY_ALT( V0, V4, V8, VC, \
+                   _mm512_xor_si512( M0, _mm512_set1_epi32( CS1 ) ), \
+                   _mm512_xor_si512( M1, _mm512_set1_epi32( CS0 ) ) ); \
+   G256_16WAY_ALT( V1, V5, V9, VD, \
+                   _mm512_xor_si512( M2, _mm512_set1_epi32( CS3 ) ), \
+                   _mm512_xor_si512( M3, _mm512_set1_epi32( CS2 ) ) ); \
+   G256_16WAY_ALT( V2, V6, VA, VE, \
+                   _mm512_xor_si512( M4, _mm512_set1_epi32( CS5 ) ), \
+                   _mm512_set1_epi32( CS4 ) ); \
+   G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CS7 ) , \
+                   _mm512_set1_epi32( CS6 ) ); \
+   G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS9 ) , \
+                   _mm512_set1_epi32( CS8 ) ); \
+   G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSB ) , \
+                   _mm512_set1_epi32( CSA ) ); \
+   G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CSD ) , \
+                   _mm512_xor_si512( MD, _mm512_set1_epi32( CSC ) ) ); \
+   G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CSF ) , \
+                   _mm512_xor_si512( MF, _mm512_set1_epi32( CSE ) ) ); \
+}
+
+#define ROUND256_16WAY_1 \
+{ \
+   G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CSA ) , \
+                   _mm512_set1_epi32( CSE ) ); \
+   G256_16WAY_ALT( V1, V5, V9, VD, \
+                   _mm512_xor_si512( M4, _mm512_set1_epi32( CS8 ) ), \
+                   _mm512_set1_epi32( CS4 ) ); \
+   G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CSF ) , \
+                   _mm512_xor_si512( MF, _mm512_set1_epi32( CS9 ) ) ); \
+   G256_16WAY_ALT( V3, V7, VB, VF, \
+                   _mm512_xor_si512( MD, _mm512_set1_epi32( CS6 ) ), \
+                   _mm512_set1_epi32( CSD ) ); \
+   G256_16WAY_ALT( V0, V5, VA, VF, \
+                   _mm512_xor_si512( M1, _mm512_set1_epi32( CSC ) ), \
+                   _mm512_set1_epi32( CS1 ) ); \
+   G256_16WAY_ALT( V1, V6, VB, VC, \
+                   _mm512_xor_si512( M0, _mm512_set1_epi32( CS2 ) ), \
+                   _mm512_xor_si512( M2, _mm512_set1_epi32( CS0 ) ) ); \
+   G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS7 ) , \
+                   _mm512_set1_epi32( CSB ) ); \
+   G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS3 ) , \
+                   _mm512_xor_si512( M3, _mm512_set1_epi32( CS5 ) ) ); \
+}
+
+#define ROUND256_16WAY_2 \
+{ \
+   G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS8 ) , \
+                   _mm512_set1_epi32( CSB ) ); \
+   G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS0 ) , \
+                   _mm512_xor_si512( M0, _mm512_set1_epi32( CSC ) ) ); \
+   G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS2 ) , \
+                   _mm512_xor_si512( M2, _mm512_set1_epi32( CS5 ) ) ); \
+   G256_16WAY_ALT( V3, V7, VB, VF, \
+                   _mm512_xor_si512( MF, _mm512_set1_epi32( CSD ) ), \
+                   _mm512_xor_si512( MD, _mm512_set1_epi32( CSF ) ) ); \
+   G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CSE ) , \
+                   _mm512_set1_epi32( CSA ) ); \
+   G256_16WAY_ALT( V1, V6, VB, VC, \
+                   _mm512_xor_si512( M3, _mm512_set1_epi32( CS6 ) ), \
+                   _mm512_set1_epi32( CS3 ) ); \
+   G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS1 ) , \
+                   _mm512_xor_si512( M1, _mm512_set1_epi32( CS7 ) ) ); \
+   G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS4 ) , \
+                   _mm512_xor_si512( M4, _mm512_set1_epi32( CS9 ) ) ); \
+}
+
+#define ROUND256_16WAY_3 \
+{ \
+   G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS9 ) , \
+                   _mm512_set1_epi32( CS7 ) ); \
+   G256_16WAY_ALT( V1, V5, V9, VD, \
+                   _mm512_xor_si512( M3, _mm512_set1_epi32( CS1 ) ), \
+                   _mm512_xor_si512( M1, _mm512_set1_epi32( CS3 ) ) ); \
+   G256_16WAY_ALT( V2, V6, VA, VE, \
+                   _mm512_xor_si512( MD, _mm512_set1_epi32( CSC ) ), \
+                   _mm512_set1_epi32( CSD ) ); \
+   G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CSE ) , \
+                   _mm512_set1_epi32( CSB ) ); \
+   G256_16WAY_ALT( V0, V5, VA, VF, \
+                   _mm512_xor_si512( M2, _mm512_set1_epi32( CS6 ) ), \
+                   _mm512_set1_epi32( CS2 ) ); \
+   G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSA ) , \
+                   _mm512_set1_epi32( CS5 ) ); \
+   G256_16WAY_ALT( V2, V7, V8, VD, \
+                   _mm512_xor_si512( M4, _mm512_set1_epi32( CS0 ) ), \
+                   _mm512_xor_si512( M0, _mm512_set1_epi32( CS4 ) ) ); \
+   G256_16WAY_ALT( V3, V4, V9, VE, \
+                   _mm512_xor_si512( MF, _mm512_set1_epi32( CS8 ) ), \
+                   _mm512_set1_epi32( CSF ) ); \
+}
+
+#define ROUND256_16WAY_4 \
+{ \
+   G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS0 ) , \
+                   _mm512_xor_si512( M0, _mm512_set1_epi32( CS9 ) ) ); \
+   G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS7 ) , \
+                   _mm512_set1_epi32( CS5 ) ); \
+   G256_16WAY_ALT( V2, V6, VA, VE, \
+                   _mm512_xor_si512( M2, _mm512_set1_epi32( CS4 ) ), \
+                   _mm512_xor_si512( M4, _mm512_set1_epi32( CS2 ) ) ); \
+   G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CSF ) , \
+                   _mm512_xor_si512( MF, _mm512_set1_epi32( CSA ) ) ); \
+   G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS1 ) , \
+                   _mm512_xor_si512( M1, _mm512_set1_epi32( CSE ) ) ); \
+   G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSC ) , \
+                   _mm512_set1_epi32( CSB ) ); \
+   G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS8 ) , \
+                   _mm512_set1_epi32( CS6 ) ); \
+   G256_16WAY_ALT( V3, V4, V9, VE, \
+                   _mm512_xor_si512( M3, _mm512_set1_epi32( CSD ) ), \
+                   _mm512_xor_si512( MD, _mm512_set1_epi32( CS3 ) ) ); \
+}
+
+#define ROUND256_16WAY_5 \
+{ \
+   G256_16WAY_ALT( V0, V4, V8, VC, \
+                   _mm512_xor_si512( M2, _mm512_set1_epi32( CSC ) ), \
+                   _mm512_set1_epi32( CS2 ) ); \
+   G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CSA ) , \
+                   _mm512_set1_epi32( CS6 ) ); \
+   G256_16WAY_ALT( V2, V6, VA, VE, \
+                   _mm512_xor_si512( M0, _mm512_set1_epi32( CSB ) ), \
+                   _mm512_set1_epi32( CS0 ) ); \
+   G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CS3 ) , \
+                   _mm512_xor_si512( M3, _mm512_set1_epi32( CS8 ) ) ); \
+   G256_16WAY_ALT( V0, V5, VA, VF, \
+                   _mm512_xor_si512( M4, _mm512_set1_epi32( CSD ) ), \
+                   _mm512_xor_si512( MD, _mm512_set1_epi32( CS4 ) ) ); \
+   G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CS5 ) , \
+                   _mm512_set1_epi32( CS7 ) ); \
+   G256_16WAY_ALT( V2, V7, V8, VD, \
+                   _mm512_xor_si512( MF, _mm512_set1_epi32( CSE ) ), \
+                   _mm512_set1_epi32( CSF ) ); \
+   G256_16WAY_ALT( V3, V4, V9, VE, \
+                   _mm512_xor_si512( M1, _mm512_set1_epi32( CS9 ) ), \
+                   _mm512_set1_epi32( CS1 ) ); \
+}
+
+#define ROUND256_16WAY_6 \
+{ \
+   G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS5 ) , \
+                   _mm512_set1_epi32( CSC ) ); \
+   G256_16WAY_ALT( V1, V5, V9, VD, \
+                   _mm512_xor_si512( M1, _mm512_set1_epi32( CSF ) ), \
+                   _mm512_xor_si512( MF, _mm512_set1_epi32( CS1 ) ) ); \
+   G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CSD ) , \
+                   _mm512_xor_si512( MD, _mm512_set1_epi32( CSE ) ) );\
+   G256_16WAY_ALT( V3, V7, VB, VF, \
+                   _mm512_xor_si512( M4, _mm512_set1_epi32( CSA ) ), \
+                   _mm512_set1_epi32( CS4 ) ); \
+   G256_16WAY_ALT( V0, V5, VA, VF, \
+                   _mm512_xor_si512( M0, _mm512_set1_epi32( CS7 ) ), \
+                   _mm512_set1_epi32( CS0 ) ); \
+   G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CS3 ) , \
+                   _mm512_xor_si512( M3, _mm512_set1_epi32( CS6 ) ) ); \
+   G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS2 ) , \
+                   _mm512_xor_si512( M2, _mm512_set1_epi32( CS9 ) ) ); \
+   G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CSB ) , \
+                   _mm512_set1_epi32( CS8 ) ); \
+}
+
+#define ROUND256_16WAY_7 \
+{ \
+   G256_16WAY_ALT( V0, V4, V8, VC, \
+                   _mm512_xor_si512( MD, _mm512_set1_epi32( CSB ) ), \
+                   _mm512_set1_epi32( CSD ) ); \
+   G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CSE ) , \
+                   _mm512_set1_epi32( CS7 ) ); \
+   G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS1 ) , \
+                   _mm512_xor_si512( M1, _mm512_set1_epi32( CSC ) ) ); \
+   G256_16WAY_ALT( V3, V7, VB, VF, \
+                   _mm512_xor_si512( M3, _mm512_set1_epi32( CS9 ) ), \
+                   _mm512_set1_epi32( CS3 ) ); \
+   G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS0 ) , \
+                   _mm512_xor_si512( M0, _mm512_set1_epi32( CS5 ) ) ); \
+   G256_16WAY_ALT( V1, V6, VB, VC, \
+                   _mm512_xor_si512( MF, _mm512_set1_epi32( CS4 ) ), \
+                   _mm512_xor_si512( M4, _mm512_set1_epi32( CSF ) ) ); \
+   G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS6 ) , \
+                   _mm512_set1_epi32( CS8 ) ); \
+   G256_16WAY_ALT( V3, V4, V9, VE, \
+                   _mm512_xor_si512( M2, _mm512_set1_epi32( CSA ) ), \
+                   _mm512_set1_epi32( CS2 ) ); \
+}
+
+#define ROUND256_16WAY_8 \
+{ \
+   G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CSF ), \
+                   _mm512_xor_si512( MF, _mm512_set1_epi32( CS6 ) ) ); \
+   G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS9 ) , \
+                   _mm512_set1_epi32( CSE ) ); \
+   G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS3 ) , \
+                   _mm512_xor_si512( M3, _mm512_set1_epi32( CSB ) ) ); \
+   G256_16WAY_ALT( V3, V7, VB, VF, \
+                   _mm512_xor_si512( M0, _mm512_set1_epi32( CS8 ) ), \
+                   _mm512_set1_epi32( CS0 ) ); \
+   G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS2 ) , \
+                   _mm512_xor_si512( M2, _mm512_set1_epi32( CSC ) ) ); \
+   G256_16WAY_ALT( V1, V6, VB, VC, \
+                   _mm512_xor_si512( MD, _mm512_set1_epi32( CS7 ) ), \
+                   _mm512_set1_epi32( CSD ) ); \
+   G256_16WAY_ALT( V2, V7, V8, VD, \
+                   _mm512_xor_si512( M1, _mm512_set1_epi32( CS4 ) ), \
+                   _mm512_xor_si512( M4, _mm512_set1_epi32( CS1 ) ) ); \
+   G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS5 ) , \
+                   _mm512_set1_epi32( CSA ) ); \
+}
+
+#define ROUND256_16WAY_9 \
+{ \
+   G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS2 ) , \
+                   _mm512_xor_si512( M2, _mm512_set1_epi32( CSA ) ) ); \
+   G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS4 ) , \
+                   _mm512_xor_si512( M4, _mm512_set1_epi32( CS8 ) ) ); \
+   G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS6 ) , \
+                   _mm512_set1_epi32( CS7 ) ); \
+   G256_16WAY_ALT( V3, V7, VB, VF, \
+                   _mm512_xor_si512( M1, _mm512_set1_epi32( CS5 ) ), \
+                   _mm512_set1_epi32( CS1 ) ); \
+   G256_16WAY_ALT( V0, V5, VA, VF, \
+                   _mm512_xor_si512( MF, _mm512_set1_epi32( CSB ) ), \
+                   _mm512_set1_epi32( CSF ) ); \
+   G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSE ) , \
+                   _mm512_set1_epi32( CS9 ) ); \
+   G256_16WAY_ALT( V2, V7, V8, VD, \
+                   _mm512_xor_si512( M3, _mm512_set1_epi32( CSC ) ), \
+                   _mm512_set1_epi32( CS3 ) ); \
+   G256_16WAY_ALT( V3, V4, V9, VE, \
+                   _mm512_xor_si512( MD, _mm512_set1_epi32( CS0 ) ), \
+                   _mm512_xor_si512( M0, _mm512_set1_epi32( CSD ) ) ); \
+}
+
 #define DECL_STATE32_16WAY \
    __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
    sph_u32 T0, T1;
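One deliberate difference from the AVX2 macros: G256_16WAY_ALT expresses all four
rotations as mm512_ror_32, since AVX-512 provides a native 32-bit rotate instruction
(vprord), whereas the AVX2 G256_8WAY_ALT synthesises the 16-bit and 8-bit cases from
shuffles (mm256_swap32_16, mm256_shuflr32_8) and only uses shift-or rotates for the
12 and 7 amounts.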
@@ -1208,9 +1749,9 @@ do { \
 // second part is run for each nonce using the precalculated midstate and the
 // hash from the first block.
 void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
-                                       const void *data )
+                                       void *data )
 {
-   const __m512i *M = (const __m512i*)data;
+   __m512i *M = (__m512i*)data;
    __m512i *V = (__m512i*)midstate;
    const __m512i *H = (const __m512i*)midhash;
 
@@ -1231,10 +1772,21 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
    V[14] = m512_const1_32( CS6 );
    V[15] = m512_const1_32( CS7 );
 
+   // M[ 0:3 ] contain new message data including unique nonces in M[ 3].
+   // M[ 5:12, 14 ] are always zero and not needed or used, except M[5] as noted.
+   // M[ 4], M[ 13], M[15] are constant and are initialized here.
+   // M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
+
+   M[ 4] = m512_const1_32( 0x80000000 );
+   M[13] = m512_one_32;
+   M[15] = m512_const1_32( 80*8 );
+
+   M[ 5] =_mm512_xor_si512( M[13], _mm512_set1_epi32( CSC ) );
+
    // G0
    GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
 
-   // G1, nonce is in M[3]
+   // G1
 // GS_16WAY( M[ 2], M[ 3], CS2, CS3, V1, V5, V9, VD );
    V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ),
                      _mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) );
@@ -1243,14 +1795,35 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
    V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 );
    V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] );
 
-   // G2,G3
-   GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
-   GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
+   // G2
+// GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
+   V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 6] ),
+                        _mm512_xor_si512( _mm512_set1_epi32( CS5 ), M[ 4] ) );
+   V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 2] ), 16 );
+   V[10] = _mm512_add_epi32( V[10], V[14] );
+   V[ 6] = mm512_ror_32( _mm512_xor_si512( V[ 6], V[10] ), 12 );
+   V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 6] ),
+                        _mm512_set1_epi32( CS4 ) );
+   V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 2] ), 8 );
+   V[10] = _mm512_add_epi32( V[10], V[14] ); \
+   V[ 6] = mm512_ror_32( _mm512_xor_si512( V[ 6], V[10] ), 7 );
+
+   // G3
+// GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
+   V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 7] ),
+                        _mm512_set1_epi32( CS7 ) );
+   V[15] = mm512_ror_32( _mm512_xor_si512( V[15], V[ 3] ), 16 );
+   V[11] = _mm512_add_epi32( V[11], V[15] );
+   V[ 7] = mm512_ror_32( _mm512_xor_si512( V[ 7], V[11] ), 12 );
+   V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 7] ),
+                        _mm512_set1_epi32( CS6 ) );
+   V[15] = mm512_ror_32( _mm512_xor_si512( V[15], V[ 3] ), 8 );
+   V[11] = _mm512_add_epi32( V[11], V[15] ); \
+   V[ 7] = mm512_ror_32( _mm512_xor_si512( V[ 7], V[11] ), 7 );
 
    // G4
 // GS_16WAY( M[ 8], M[ 9], CS8, CS9, V0, V5, VA, VF );
-   V[ 0] = _mm512_add_epi32( V[ 0],
-                        _mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) );
+   V[ 0] = _mm512_add_epi32( V[ 0], _mm512_set1_epi32( CS9 ) );
 
    // G5
 // GS_16WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
@@ -1258,11 +1831,11 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
    // G6
 // GS_16WAY( M[12], M[13], CSC, CSD, V2, V7, V8, VD );
    V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 7] ),
-                        _mm512_xor_si512( _mm512_set1_epi32( CSD ), M[12] ) );
+                        _mm512_set1_epi32( CSD ) );
    // G7
 // GS_16WAY( M[14], M[15], CSE, CSF, V3, V4, V9, VE );
    V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 4] ),
-                        _mm512_xor_si512( _mm512_set1_epi32( CSF ), M[14] ) );
+                        _mm512_set1_epi32( CSF ) );
    V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 3] ), 16 );
    V[ 3] = _mm512_add_epi32( V[ 3],
                      _mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
@@ -1273,45 +1846,38 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
 {
    __m512i *H = (__m512i*)final_hash;
    const __m512i *h = (const __m512i*)midhash;
-   const __m512i *v= (const __m512i*)midstate;
    __m512i V0, V1, V2, V3, V4, V5, V6, V7;
    __m512i V8, V9, VA, VB, VC, VD, VE, VF;
-   __m512i M0, M1, M2, M3, M4, M5, M6, M7;
-   __m512i M8, M9, MA, MB, MC, MD, ME, MF;
+   __m512i M0, M1, M2, M3, M4, MD, MF;
+   __m512i MDxorCSC;
 
-   V0 = v[ 0];
-   V1 = v[ 1];
-   V2 = v[ 2];
-   V3 = v[ 3];
-   V4 = v[ 4];
-   V5 = v[ 5];
-   V6 = v[ 6];
-   V7 = v[ 7];
-   V8 = v[ 8];
-   V9 = v[ 9];
-   VA = v[10];
-   VB = v[11];
-   VC = v[12];
-   VD = v[13];
-   VE = v[14];
-   VF = v[15];
+   V0 = _mm512_load_si512( (__m512i*)midstate + 0 );
+   V1 = _mm512_load_si512( (__m512i*)midstate + 1 );
+   V2 = _mm512_load_si512( (__m512i*)midstate + 2 );
+   V3 = _mm512_load_si512( (__m512i*)midstate + 3 );
+   V4 = _mm512_load_si512( (__m512i*)midstate + 4 );
+   V5 = _mm512_load_si512( (__m512i*)midstate + 5 );
+   V6 = _mm512_load_si512( (__m512i*)midstate + 6 );
+   V7 = _mm512_load_si512( (__m512i*)midstate + 7 );
+   V8 = _mm512_load_si512( (__m512i*)midstate + 8 );
+   V9 = _mm512_load_si512( (__m512i*)midstate + 9 );
+   VA = _mm512_load_si512( (__m512i*)midstate + 10 );
+   VB = _mm512_load_si512( (__m512i*)midstate + 11 );
+   VC = _mm512_load_si512( (__m512i*)midstate + 12 );
+   VD = _mm512_load_si512( (__m512i*)midstate + 13 );
+   VE = _mm512_load_si512( (__m512i*)midstate + 14 );
+   VF = _mm512_load_si512( (__m512i*)midstate + 15 );
 
-   M0 = casti_m512i( data, 0 );
-   M1 = casti_m512i( data, 1 );
-   M2 = casti_m512i( data, 2 );
-   M3 = casti_m512i( data, 3 );
-   M4 = casti_m512i( data, 4 );
-   M5 = casti_m512i( data, 5 );
-   M6 = casti_m512i( data, 6 );
-   M7 = casti_m512i( data, 7 );
-   M8 = casti_m512i( data, 8 );
-   M9 = casti_m512i( data, 9 );
-   MA = casti_m512i( data, 10 );
-   MB = casti_m512i( data, 11 );
-   MC = casti_m512i( data, 12 );
-   MD = casti_m512i( data, 13 );
-   ME = casti_m512i( data, 14 );
-   MF = casti_m512i( data, 15 );
+   M0 = _mm512_load_si512( (__m512i*)data + 0 );
+   M1 = _mm512_load_si512( (__m512i*)data + 1 );
+   M2 = _mm512_load_si512( (__m512i*)data + 2 );
+   M3 = _mm512_load_si512( (__m512i*)data + 3 );
+   M4 = _mm512_load_si512( (__m512i*)data + 4 );
+   // M5 to MC & ME are zero padding and optimised out
+   MD = _mm512_load_si512( (__m512i*)data + 13 );
+   MF = _mm512_load_si512( (__m512i*)data + 15 );
+   // cache for precalculated MD^CSC, used in round0 G6.
+   MDxorCSC = _mm512_load_si512( (__m512i*)data + 5 );
 
+   // Finish round 0 with the nonce (M3) now available
    // G0
@@ -1336,21 +1902,30 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
    VA = _mm512_add_epi32( VA, VF );
    V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 12 );
    V0 = _mm512_add_epi32( V0, _mm512_add_epi32( V5,
-                          _mm512_xor_si512( _mm512_set1_epi32( CS8 ), M9 ) ) );
+                          _mm512_set1_epi32( CS8 ) ) );
    VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 8 );
    VA = _mm512_add_epi32( VA, VF );
    V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 );
 
    // G5
-   GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
+// GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
+   V1 = _mm512_add_epi32( _mm512_add_epi32( V1, V6 ),
+                          _mm512_set1_epi32( CSB ) );
+   VC = mm512_ror_32( _mm512_xor_si512( VC, V1 ), 16 );
+   VB = _mm512_add_epi32( VB, VC );
+   V6 = mm512_ror_32( _mm512_xor_si512( V6, VB ), 12 );
+   V1 = _mm512_add_epi32( _mm512_add_epi32( V1, V6 ),
+                          _mm512_set1_epi32( CSA ) );
+   VC = mm512_ror_32( _mm512_xor_si512( VC, V1 ), 8 );
+   VB = _mm512_add_epi32( VB, VC );
+   V6 = mm512_ror_32( _mm512_xor_si512( V6, VB ), 7 );
 
    // G6
 // GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
    VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 16 );
    V8 = _mm512_add_epi32( V8, VD );
    V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 12 );
-   V2 = _mm512_add_epi32( _mm512_add_epi32( V2, V7 ),
-                          _mm512_xor_si512( _mm512_set1_epi32( CSC ), MD ) );
+   V2 = _mm512_add_epi32( V2, _mm512_add_epi32( V7, MDxorCSC ) );
    VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 8 );
    V8 = _mm512_add_epi32( V8, VD );
    V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 7 );
@@ -1364,20 +1939,20 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
    V9 = _mm512_add_epi32( V9, VE );
    V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 7 );
 
-   // Remaining rounds
-   ROUND_S_16WAY( 1 );
-   ROUND_S_16WAY( 2 );
-   ROUND_S_16WAY( 3 );
-   ROUND_S_16WAY( 4 );
-   ROUND_S_16WAY( 5 );
-   ROUND_S_16WAY( 6 );
-   ROUND_S_16WAY( 7 );
-   ROUND_S_16WAY( 8 );
-   ROUND_S_16WAY( 9 );
-   ROUND_S_16WAY( 0 );
-   ROUND_S_16WAY( 1 );
-   ROUND_S_16WAY( 2 );
-   ROUND_S_16WAY( 3 );
+   // Remaining rounds, optimised
+   ROUND256_16WAY_1;
+   ROUND256_16WAY_2;
+   ROUND256_16WAY_3;
+   ROUND256_16WAY_4;
+   ROUND256_16WAY_5;
+   ROUND256_16WAY_6;
+   ROUND256_16WAY_7;
+   ROUND256_16WAY_8;
+   ROUND256_16WAY_9;
+   ROUND256_16WAY_0;
+   ROUND256_16WAY_1;
+   ROUND256_16WAY_2;
+   ROUND256_16WAY_3;
 
    // Byte swap final hash
    const __m512i shuf_bswap32 =
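Both the old and new forms in the next hunk implement the BLAKE2b diagonalisation
step, rotating the 64-bit lanes of the (V[2],V[3]) and (V[6],V[7]) register pairs by
one lane in opposite directions. mm128_alignr_64( hi, lo ) presumably lowers to a
single palignr (_mm_alignr_epi8 with an 8-byte offset), expressing the same lane
rotation as the two-input shuffle helpers with one instruction per output register.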
@@ -103,16 +103,16 @@
    const uint8_t *sigmaR = sigma[R]; \
    BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
    BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
-   V2 = mm128_shufl2r_64( V[2], V[3] ); \
-   V3 = mm128_shufl2r_64( V[3], V[2] ); \
-   V6 = mm128_shufl2l_64( V[6], V[7] ); \
-   V7 = mm128_shufl2l_64( V[7], V[6] ); \
+   V2 = mm128_alignr_64( V[3], V[2] ); \
+   V3 = mm128_alignr_64( V[2], V[3] ); \
+   V6 = mm128_alignr_64( V[6], V[7] ); \
+   V7 = mm128_alignr_64( V[7], V[6] ); \
    BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
    BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
-   V[2] = mm128_shufl2l_64( V2, V3 ); \
-   V[3] = mm128_shufl2l_64( V3, V2 ); \
-   V[6] = mm128_shufl2r_64( V6, V7 ); \
-   V[7] = mm128_shufl2r_64( V7, V6 ); \
+   V[2] = mm128_alignr_64( V2, V3 ); \
+   V[3] = mm128_alignr_64( V3, V2 ); \
+   V[6] = mm128_alignr_64( V7, V6 ); \
+   V[7] = mm128_alignr_64( V6, V7 ); \
 }
 
 #else
 
@@ -49,12 +49,11 @@ extern "C"{
 
 #define Sb_8W(x0, x1, x2, x3, c) \
 do { \
-   __m512i cc = _mm512_set1_epi64( c ); \
-   x3 = mm512_not( x3 ); \
+   const __m512i cc = _mm512_set1_epi64( c ); \
    x0 = mm512_xorandnot( x0, x2, cc ); \
    tmp = mm512_xorand( cc, x0, x1 ); \
-   x0 = mm512_xorand( x0, x2, x3 ); \
-   x3 = mm512_xorandnot( x3, x1, x2 ); \
+   x0 = mm512_xorandnot( x0, x3, x2 ); \
+   x3 = _mm512_ternarylogic_epi64( x3, x1, x2, 0x2d ); /* ~x3 ^ (~x1 & x2) */\
    x1 = mm512_xorand( x1, x0, x2 ); \
    x2 = mm512_xorandnot( x2, x3, x0 ); \
    x0 = mm512_xoror( x0, x1, x3 ); \
 
@@ -79,7 +78,7 @@ do { \
 
 #define Sb(x0, x1, x2, x3, c) \
 do { \
-   __m256i cc = _mm256_set1_epi64x( c ); \
+   const __m256i cc = _mm256_set1_epi64x( c ); \
    x3 = mm256_not( x3 ); \
    x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
    tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
 
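The 0x2d immediate can be verified from the truth table: _mm512_ternarylogic_epi64(
a, b, c, imm ) computes, bit for bit, the function whose value at input (a,b,c) is
bit a*4 + b*2 + c of imm. Evaluating ~a ^ (~b & c) with (a,b,c) = (x3,x1,x2):

   // a b c   ~a ^ (~b & c)      bit index
   // 0 0 0   1                  0
   // 0 0 1   0                  1
   // 0 1 0   1                  2
   // 0 1 1   1                  3
   // 1 0 0   0                  4
   // 1 0 1   1                  5
   // 1 1 0   0                  6
   // 1 1 1   0                  7
   //         reading bits 7..0 gives 0b00101101 = 0x2d

This folds the old separate x3 = mm512_not( x3 ) into the same instruction, which is
why the explicit NOT disappears from the AVX-512 path but stays in the AVX2 one.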
@@ -23,13 +23,26 @@
 #include "simd-utils.h"
 #include "luffa_for_sse2.h"
 
+#if defined(__SSE4_1__)
+
 #define MULT2( a0, a1 ) do \
 { \
-  __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
-  a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \
-  a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \
+  __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
+  a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
+  a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
 } while(0)
 
+#else
+
+#define MULT2( a0, a1 ) do \
+{ \
+  __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
+  a0 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
+  a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
+} while(0)
+
+#endif
 
 #define STEP_PART(x,c,t)\
    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
   SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
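The SSE4.1 variant trades the runtime MASK global for mm128_mask_32( a1, 0xe ),
which, going by the mask value, zeroes 32-bit lanes 1 to 3 and keeps lane 0: the
same selection _mm_and_si128( a1, MASK ) performs, so MASK no longer needs to exist
when SSE4.1 is available. The shuffle immediate is also rewritten as 0x10 instead of
decimal 16; the permutation itself is unchanged.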
@@ -60,13 +73,13 @@
    t = _mm_load_si128(&a0);\
    a0 = _mm_or_si128(a0,a1);\
    a2 = _mm_xor_si128(a2,a3);\
-   a1 = _mm_andnot_si128(a1,ALLONE);\
+   a1 = mm128_not( a1 );\
    a0 = _mm_xor_si128(a0,a3);\
    a3 = _mm_and_si128(a3,t);\
    a1 = _mm_xor_si128(a1,a3);\
    a3 = _mm_xor_si128(a3,a2);\
    a2 = _mm_and_si128(a2,a0);\
-   a0 = _mm_andnot_si128(a0,ALLONE);\
+   a0 = mm128_not( a0 );\
    a2 = _mm_xor_si128(a2,a1);\
    a1 = _mm_or_si128(a1,a3);\
    t = _mm_xor_si128(t,a1);\
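_mm_andnot_si128( x, ALLONE ) computes (~x) & 0xff..ff, i.e. a plain bitwise NOT;
rewriting it as mm128_not( x ) removes SUBCRUMB's dependency on the ALLONE global,
which is why the next hunks can delete ALLONE's definition and initialisation
outright.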
@@ -242,17 +255,18 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
 
 
 __m128i CNS128[32];
-__m128i ALLONE;
+#if !defined(__SSE4_1__)
 __m128i MASK;
+#endif
 
 HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
 {
    int i;
    state->hashbitlen = hashbitlen;
+#if !defined(__SSE4_1__)
    /* set the lower 32 bits to '1' */
    MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
-   /* set all bits to '1' */
-   ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
+#endif
    /* set the 32-bit round constant values to the 128-bit data field */
    for ( i=0; i<32; i++ )
       CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
@@ -352,10 +366,10 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
 // Optimized for integrals of 16 bytes, good for 64 and 80 byte len
    int i;
    state->hashbitlen = hashbitlen;
+#if !defined(__SSE4_1__)
    /* set the lower 32 bits to '1' */
    MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
-   /* set all bits to '1' */
-   ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
+#endif
    /* set the 32-bit round constant values to the 128-bit data field */
    for ( i=0; i<32; i++ )
       CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
 
@@ -230,25 +230,13 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
    block0_hash[7] = _mm512_set1_epi32( phash[7] );
 
    // Build vectored second block, interleave last 16 bytes of data using
-   // unique nonces, add padding.
+   // unique nonces.
    block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
    block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
    block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
    block_buf[ 3] =
          _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                            n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
-   block_buf[ 4] = m512_const1_32( 0x80000000 );
-   block_buf[ 5] =
-   block_buf[ 6] =
-   block_buf[ 7] =
-   block_buf[ 8] =
-   block_buf[ 9] =
-   block_buf[10] =
-   block_buf[11] =
-   block_buf[12] = m512_zero;
-   block_buf[13] = m512_one_32;
-   block_buf[14] = m512_zero;
-   block_buf[15] = m512_const1_32( 80*8 );
 
    // Partialy prehash second block without touching nonces in block_buf[3].
    blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
@@ -425,24 +413,12 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
    block0_hash[7] = _mm256_set1_epi32( phash[7] );
 
    // Build vectored second block, interleave last 16 bytes of data using
-   // unique nonces and add padding.
+   // unique nonces.
    block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
    block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
    block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
-   block_buf[ 3] =
-         _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
-   block_buf[ 4] = m256_const1_32( 0x80000000 );
-   block_buf[ 5] =
-   block_buf[ 6] =
-   block_buf[ 7] =
-   block_buf[ 8] =
-   block_buf[ 9] =
-   block_buf[10] =
-   block_buf[11] =
-   block_buf[12] = m256_zero;
-   block_buf[13] = m256_one_32;
-   block_buf[14] = m256_zero;
-   block_buf[15] = m256_const1_32( 80*8 );
+   block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
+                                     n+ 3, n+ 2, n+ 1, n );
 
    // Partialy prehash second block without touching nonces
    blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
 
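The deleted padding stores are not lost: because data is no longer const, the
blake256_8way_round0_prehash_le and blake256_16way_round0_prehash_le functions now
write the constant words M[4], M[13], M[15] and the M[5] = M[13]^CSC cache
themselves, so each scanhash caller only fills the header words and the nonce
vector.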
@@ -120,25 +120,13 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
    block0_hash[7] = _mm512_set1_epi32( phash[7] );
 
    // Build vectored second block, interleave last 16 bytes of data using
-   // unique nonces and add padding.
+   // unique nonces.
    block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
    block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
    block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
    block_buf[ 3] =
         _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                           n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
-   block_buf[ 4] = m512_const1_32( 0x80000000 );
-   block_buf[ 5] =
-   block_buf[ 6] =
-   block_buf[ 7] =
-   block_buf[ 8] =
-   block_buf[ 9] =
-   block_buf[10] =
-   block_buf[11] =
-   block_buf[12] = m512_zero;
-   block_buf[13] = m512_one_32;
-   block_buf[14] = m512_zero;
-   block_buf[15] = m512_const1_32( 80*8 );
 
    // Partialy prehash second block without touching nonces in block_buf[3].
    blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
@@ -240,24 +228,12 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
    block0_hash[7] = _mm256_set1_epi32( phash[7] );
 
    // Build vectored second block, interleave last 16 bytes of data using
-   // unique nonces and add padding.
+   // unique nonces.
    block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
    block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
    block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
    block_buf[ 3] =
         _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
-   block_buf[ 4] = m256_const1_32( 0x80000000 );
-   block_buf[ 5] =
-   block_buf[ 6] =
-   block_buf[ 7] =
-   block_buf[ 8] =
-   block_buf[ 9] =
-   block_buf[10] =
-   block_buf[11] =
-   block_buf[12] = m256_zero;
-   block_buf[13] = m256_one_32;
-   block_buf[14] = m256_zero;
-   block_buf[15] = m256_const1_32( 80*8 );
 
    // Partialy prehash second block without touching nonces
    blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
 
@@ -711,8 +711,11 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
 {
    __m256i A, B, C, D, E, F, G, H;
 
-   X[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
-   X[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
+   // W[9:14] are zero, therefore X[9:13] are also zero and not needed.
+   // Except X[ 9] which is part of W[ 0] from the third group.
+   X[ 0] = _mm256_add_epi32( SSG2_0x( W[ 1] ), W[ 0] );
+   X[ 1] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( W[15] ),
+                                               SSG2_0x( W[ 2] ) ), W[ 1] );
    X[ 2] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 0] ), W[11] ),
                              W[ 2] );
    X[ 3] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 1] ), W[12] ),
@@ -725,16 +728,12 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
                              W[ 6] );
    X[ 7] = _mm256_add_epi32( _mm256_add_epi32( X[ 0], SSG2_0x( W[ 8] ) ),
                              W[ 7] );
-   X[ 8] = _mm256_add_epi32( _mm256_add_epi32( X[ 1], SSG2_0x( W[ 9] ) ),
-                             W[ 8] );
-   X[ 9] = _mm256_add_epi32( SSG2_0x( W[10] ), W[ 9] );
-   X[10] = _mm256_add_epi32( SSG2_0x( W[11] ), W[10] );
-   X[11] = _mm256_add_epi32( SSG2_0x( W[12] ), W[11] );
-   X[12] = _mm256_add_epi32( SSG2_0x( W[13] ), W[12] );
-   X[13] = _mm256_add_epi32( SSG2_0x( W[14] ), W[13] );
-   X[14] = _mm256_add_epi32( SSG2_0x( W[15] ), W[14] );
+   X[ 8] = _mm256_add_epi32( X[ 1], W[ 8] );
+   X[14] = SSG2_0x( W[15] );
    X[15] = _mm256_add_epi32( SSG2_0x( X[ 0] ), W[15] );
 
+   X[ 9] = _mm256_add_epi32( SSG2_0x( X[ 1] ), X[ 0] );
+
    A = _mm256_load_si256( state_in );
    B = _mm256_load_si256( state_in + 1 );
    C = _mm256_load_si256( state_in + 2 );
@@ -779,10 +778,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
G = _mm256_load_si256( state_mid + 6 );
|
||||
H = _mm256_load_si256( state_mid + 7 );
|
||||
|
||||
// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
|
||||
#if !defined(__AVX512VL__)
|
||||
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H );
|
||||
#endif

@@ -810,23 +805,36 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
W[ 6] = _mm256_add_epi32( X[ 6], SSG2_1x( W[ 4] ) );
W[ 7] = _mm256_add_epi32( X[ 7], SSG2_1x( W[ 5] ) );
W[ 8] = _mm256_add_epi32( X[ 8], SSG2_1x( W[ 6] ) );
W[ 9] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[ 7] ),
W[ 2] ) );
W[10] = _mm256_add_epi32( X[10], _mm256_add_epi32( SSG2_1x( W[ 8] ),
W[ 3] ) );
W[11] = _mm256_add_epi32( X[11], _mm256_add_epi32( SSG2_1x( W[ 9] ),
W[ 4] ) );
W[12] = _mm256_add_epi32( X[12], _mm256_add_epi32( SSG2_1x( W[10] ),
W[ 5] ) );
W[13] = _mm256_add_epi32( X[13], _mm256_add_epi32( SSG2_1x( W[11] ),
W[ 6] ) );
W[ 9] = _mm256_add_epi32( SSG2_1x( W[ 7] ), W[ 2] );
W[10] = _mm256_add_epi32( SSG2_1x( W[ 8] ), W[ 3] );
W[11] = _mm256_add_epi32( SSG2_1x( W[ 9] ), W[ 4] );
W[12] = _mm256_add_epi32( SSG2_1x( W[10] ), W[ 5] );
W[13] = _mm256_add_epi32( SSG2_1x( W[11] ), W[ 6] );
W[14] = _mm256_add_epi32( X[14], _mm256_add_epi32( SSG2_1x( W[12] ),
W[ 7] ) );
W[15] = _mm256_add_epi32( X[15], _mm256_add_epi32( SSG2_1x( W[13] ),
W[ 8] ) );

SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x8_MSG_EXPANSION( W );

W[ 0] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[14] ),
W[ 9] ) );
W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] );
W[13] = SHA2x_MEXP( W[11], W[ 6], W[14], W[13] );
W[14] = SHA2x_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA2x_MEXP( W[13], W[ 8], W[ 0], W[15] );

SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x8_MSG_EXPANSION( W );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 48 );

@@ -1201,9 +1209,13 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
{
__m512i A, B, C, D, E, F, G, H;

// precalculate constant part msg expansion for second iteration.
X[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
// X is the pre-expanded constant part of the msg for the second group,
// rounds 16 to 31.
// W[9:14] are zero, therefore X[9:13] are also zero and not needed.
// Except X[ 9], which is used to pre-expand part of W[ 0] from the third
// group, rounds 32 to 47.
X[ 0] = _mm512_add_epi32( SSG2_0x16( W[ 1] ), W[ 0] );
X[ 1] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( W[15] ),
SSG2_0x16( W[ 2] ) ), W[ 1] );
X[ 2] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 0] ), W[11] ),
W[ 2] );
X[ 3] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 1] ), W[12] ),

@@ -1216,16 +1228,12 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
W[ 6] );
X[ 7] = _mm512_add_epi32( _mm512_add_epi32( X[ 0], SSG2_0x16( W[ 8] ) ),
W[ 7] );
X[ 8] = _mm512_add_epi32( _mm512_add_epi32( X[ 1], SSG2_0x16( W[ 9] ) ),
W[ 8] );
X[ 9] = _mm512_add_epi32( SSG2_0x16( W[10] ), W[ 9] );
X[10] = _mm512_add_epi32( SSG2_0x16( W[11] ), W[10] );
X[11] = _mm512_add_epi32( SSG2_0x16( W[12] ), W[11] );
X[12] = _mm512_add_epi32( SSG2_0x16( W[13] ), W[12] );
X[13] = _mm512_add_epi32( SSG2_0x16( W[14] ), W[13] );
X[14] = _mm512_add_epi32( SSG2_0x16( W[15] ), W[14] );
X[ 8] = _mm512_add_epi32( X[ 1], W[ 8] );
X[14] = SSG2_0x16( W[15] );
X[15] = _mm512_add_epi32( SSG2_0x16( X[ 0] ), W[15] );

X[ 9] = _mm512_add_epi32( SSG2_0x16( X[ 1] ), X[ 0] );

A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in + 1 );
C = _mm512_load_si512( state_in + 2 );

@@ -1280,7 +1288,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );

// update precalculated msg expansion with new nonce: W[3].
// inject nonce, W[3], to complete msg expansion.
W[ 0] = X[ 0];
W[ 1] = X[ 1];
W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
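To unpack the comment above (a scalar sketch, not the vector code): with the nonce in W[3], every nonce-free term of the expanded words was folded into X[] by the prehash, so completing the schedule per nonce costs only the sigma0(W[3]) and W[3] contributions. For example:

// W18 = ssg1( W16 ) + W11 + ssg0( W3 ) + W2, where W11 == 0 and
// ssg1( W16 ) + W11 + W2 is already cached in X[2], reduces to
// W18 = X2 + ssg0( nonce ), which is the W[ 2] line above.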

@@ -1290,23 +1298,36 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
W[ 6] = _mm512_add_epi32( X[ 6], SSG2_1x16( W[ 4] ) );
W[ 7] = _mm512_add_epi32( X[ 7], SSG2_1x16( W[ 5] ) );
W[ 8] = _mm512_add_epi32( X[ 8], SSG2_1x16( W[ 6] ) );
W[ 9] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[ 7] ),
W[ 2] ) );
W[10] = _mm512_add_epi32( X[10], _mm512_add_epi32( SSG2_1x16( W[ 8] ),
W[ 3] ) );
W[11] = _mm512_add_epi32( X[11], _mm512_add_epi32( SSG2_1x16( W[ 9] ),
W[ 4] ) );
W[12] = _mm512_add_epi32( X[12], _mm512_add_epi32( SSG2_1x16( W[10] ),
W[ 5] ) );
W[13] = _mm512_add_epi32( X[13], _mm512_add_epi32( SSG2_1x16( W[11] ),
W[ 6] ) );
W[ 9] = _mm512_add_epi32( SSG2_1x16( W[ 7] ), W[ 2] );
W[10] = _mm512_add_epi32( SSG2_1x16( W[ 8] ), W[ 3] );
W[11] = _mm512_add_epi32( SSG2_1x16( W[ 9] ), W[ 4] );
W[12] = _mm512_add_epi32( SSG2_1x16( W[10] ), W[ 5] );
W[13] = _mm512_add_epi32( SSG2_1x16( W[11] ), W[ 6] );
W[14] = _mm512_add_epi32( X[14], _mm512_add_epi32( SSG2_1x16( W[12] ),
W[ 7] ) );
W[15] = _mm512_add_epi32( X[15], _mm512_add_epi32( SSG2_1x16( W[13] ),
W[ 8] ) );

SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x16_MSG_EXPANSION( W );

W[ 0] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[14] ),
W[ 9] ) );
W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] );
W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] );
W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] );

SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x16_MSG_EXPANSION( W );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 );

@@ -1336,8 +1357,8 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16]; memcpy_512( W, data, 16 );
// Value for H at round 60, before adding K, to produce valid final hash
//where H == 0.
// Value for H at round 60, before adding K, needed to produce a valid
// final hash where H == 0.
// H_ = -( H256[7] + K256[60] );
const __m512i H_ = m512_const1_32( 0x136032ED );
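The magic number can be verified from the standard SHA-256 tables, where H256[7] = 0x5be0cd19 and K256[60] = 0x90befffa; a throwaway sanity check (a sketch, not part of the build):

#include <assert.h>
#include <stdint.h>

static void check_H_( void )
{
   // -( H256[7] + K256[60] ) mod 2^32
   uint32_t H_ = 0u - ( 0x5be0cd19u + 0x90befffau );
   assert( H_ == 0x136032ED );
}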

@@ -383,11 +383,17 @@ static const m512_v16 FFT256_Twiddle4w[] =

#define shufxor4w(x,s) _mm512_shuffle_epi32( x, XCAT( SHUFXOR_, s ))

#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
_mm512_srai_epi16( x, 8 ) )

/*
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \
0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) )
*/
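Both variants compute the same partial reduction: the byte mask 0x5555... keeps the low byte of each 16-bit lane, just like the and with 0x00ff00ff00ff00ff. The arithmetic in scalar form (a sketch): SIMD's FFT works mod 257, and 256 is congruent to -1 (mod 257), so a lane x = 256*hi + lo is congruent to lo - hi.

#include <stdint.h>

// Partial reduction mod 257 of one signed 16-bit lane (scalar sketch).
static inline int16_t reduce257( int16_t x )
{
   // x = 256*(x >> 8) + (x & 0xff) with an arithmetic shift, and
   // 256 == -1 (mod 257), so x == (x & 0xff) - (x >> 8) (mod 257).
   return (int16_t)( ( x & 0xff ) - ( x >> 8 ) );
}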

#define EXTRA_REDUCE_S4w(x)\
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
m512_const1_64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \

@@ -400,8 +406,8 @@ static const m512_v16 FFT256_Twiddle4w[] =

#define DO_REDUCE_FULL_S4w(i) \
do { \
X(i) = REDUCE4w( X(i) ); \
X(i) = EXTRA_REDUCE_S4w( X(i) ); \
X(i) = REDUCE4w( X(i) ); \
X(i) = EXTRA_REDUCE_S4w( X(i) ); \
} while(0)

@@ -431,10 +437,6 @@ void fft64_4way( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.

static const int w[] = {0, 2, 4, 6};
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
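A scalar sketch of the butterfly these macros vectorize: one radix-2 DIF step maps (a, b) to (a + b, (a - b) * 2^w). The twiddle factors are powers of two because 2 is a 16th root of unity mod 257 (2^8 is congruent to -1), so each twiddle multiply is just a left shift that is reduced later.

// One DIF butterfly over F_257 (scalar sketch, only partially reduced).
static inline void dif_butterfly( int16_t *a, int16_t *b, int w )
{
   int16_t u = *a, v = *b;
   *a = (int16_t)( u + v );
   *b = (int16_t)( ( u - v ) << w );   // multiply by the twiddle 2^w
}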

// targeted
#define BUTTERFLY_0( i,j ) \
do { \

@@ -443,25 +445,25 @@ do { \
X(i) = _mm512_sub_epi16( X(i), v ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i v = X(j); \
X(j) = _mm512_add_epi16( X(i), X(j) ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w[n] ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
} while(0)

BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );

DO_REDUCE( 2 );
DO_REDUCE( 3 );

BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );

DO_REDUCE( 1 );

@@ -501,12 +503,11 @@ do { \
// Transpose the FFT state with a revbin order permutation
// on the rows and the columns.
// This will put the full FFT_64 in order.
#define INTERLEAVE(i,j) \
#define INTERLEAVE( i, j ) \
do { \
__m512i t1= X(i); \
__m512i t2= X(j); \
X(i) = _mm512_unpacklo_epi16( t1, t2 ); \
X(j) = _mm512_unpackhi_epi16( t1, t2 ); \
__m512i u = X(j); \
X(j) = _mm512_unpackhi_epi16( X(i), X(j) ); \
X(i) = _mm512_unpacklo_epi16( X(i), u ); \
} while(0)

INTERLEAVE( 1, 0 );

@@ -534,10 +535,10 @@ do { \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i u = X(j); \
X(i) = _mm512_slli_epi16( X(i), w[n] ); \
X(i) = _mm512_slli_epi16( X(i), w ); \
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
X(i) = _mm512_add_epi16( u, X(i) ); \
} while(0)

@@ -558,15 +559,15 @@ do { \

BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );

DO_REDUCE( 3 );

BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );

DO_REDUCE_FULL_S4w( 0 );
DO_REDUCE_FULL_S4w( 1 );

@@ -599,7 +600,6 @@ void fft128_4way( void *a )
// Temp space to help with interleaving at the end
__m512i B[8];
__m512i *A = (__m512i*) a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;

/* Size-2 butterflies */
for ( i = 0; i<8; i++ )

@@ -633,7 +633,6 @@ void fft128_4way_msg( uint16_t *a, const uint8_t *x, int final )

__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;

#define UNPACK( i ) \
do { \

@@ -686,7 +685,6 @@ void fft256_4way_msg( uint16_t *a, const uint8_t *x, int final )

__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT256_Twiddle;

#define UNPACK( i ) \
do { \

@@ -776,109 +774,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
// We split the round function in two halves
// so as to insert some independent computations in between

// generic
#if 0
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6

#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0

#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1

#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2

#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3

#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4

#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5

#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)

#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = shufxor( a##l, 1 ); \
d##h = shufxor( a##h, 1 ); \
} while(0)

#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = shufxor( a##h, 2 ); \
d##h = shufxor( a##l, 2 ); \
} while(0)

#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = shufxor( a##l, 2 ); \
d##h = shufxor( a##h, 2 ); \
} while(0)

#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = shufxor( a##l, 3 ); \
d##h = shufxor( a##h, 3 ); \
} while(0)

#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = shufxor( a##h, 1 ); \
d##h = shufxor( a##l, 1 ); \
} while(0)

#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = shufxor( a##h, 3 ); \
d##h = shufxor( a##l, 3 ); \
} while(0)

#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#endif
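The disabled generic tables above are a preprocessor unrolling of modular addition: each SUM7_ij is (i + j) mod 7, selecting which of the seven PERM_ variants runs at a given step. A runtime equivalent (sketch):

// SUM7_ij == ( i + j ) % 7, e.g. SUM7_16 == ( 1 + 6 ) % 7 == 0.
static inline int sum7( int i, int j ) { return ( i + j ) % 7; }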

// targeted

#define STEP_1_(a,b,c,d,w,fun,r,s,z) \

@@ -18,6 +18,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/yespower/yespower.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"

@@ -31,6 +32,9 @@
// Config
#define MINOTAUR_ALGO_COUNT 16

static const yespower_params_t minotaurx_yespower_params =
{ YESPOWER_1_0, 2048, 8, "et in arcadia ego", 17 };

typedef struct TortureNode TortureNode;
typedef struct TortureGarden TortureGarden;
@@ -59,20 +63,22 @@ struct TortureGarden
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;

struct TortureNode {
struct TortureNode
{
unsigned int algo;
TortureNode *child[2];
} nodes[22];
} __attribute__ ((aligned (64)));

// Get a 64-byte hash for a given 64-byte input, using the given TortureGarden contexts and algo index.
static void get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo )
static int get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;

switch (algo) {
switch ( algo )
{
case 0:
sph_blake512_init(&garden->blake);
sph_blake512(&garden->blake, input, 64);

@@ -97,14 +103,14 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
sph_echo512(&garden->echo, input, 64);
sph_echo512_close(&garden->echo, hash);
#endif
break;
break;
case 4:
#if defined(__AES__)
fugue512_full( &garden->fugue, hash, input, 64 );
#else
sph_fugue512_full( &garden->fugue, hash, input, 64 );
#endif
break;
break;
case 5:
#if defined(__AES__)
groestl512_full( &garden->groestl, (char*)hash, (char*)input, 512 );

@@ -113,7 +119,7 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
sph_groestl512(&garden->groestl, input, 64);
sph_groestl512_close(&garden->groestl, hash);
#endif
break;
break;
case 6:
sph_hamsi512_init(&garden->hamsi);
sph_hamsi512(&garden->hamsi, input, 64);

@@ -164,16 +170,20 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
sph_whirlpool(&garden->whirlpool, input, 64);
sph_whirlpool_close(&garden->whirlpool, hash);
break;
case 16: // minotaurx only, yespower hardcoded for last node
rc = yespower_tls( input, 64, &minotaurx_yespower_params,
(yespower_binary_t*)hash, thr_id );
}

memcpy(output, hash, 64);
return rc;
}

static __thread TortureGarden garden;

bool initialize_torture_garden()
{
// Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete).
// Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete).

garden.nodes[ 0].child[0] = &garden.nodes[ 1];
garden.nodes[ 0].child[1] = &garden.nodes[ 2];

@@ -219,7 +229,6 @@ bool initialize_torture_garden()
garden.nodes[20].child[1] = &garden.nodes[21];
garden.nodes[21].child[0] = NULL;
garden.nodes[21].child[1] = NULL;

return true;
}
@@ -227,38 +236,45 @@ bool initialize_torture_garden()
int minotaur_hash( void *output, const void *input, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;

// Find the initial sha512 hash
sph_sha512_init( &garden.sha512 );
sph_sha512( &garden.sha512, input, 80 );
sph_sha512_close( &garden.sha512, hash );

// algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce
// if Hamsi is needed but only the first and last functions are
// currently known. Abort if either is Hamsi.
if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 )
|| ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) )
return 0;

if ( opt_algo != ALGO_MINOTAURX )
{
// algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce
// if Hamsi is needed but only the first and last functions are
// currently known. Abort if either is Hamsi.
if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 )
|| ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) )
return 0;
}

// Assign algos to torture garden nodes based on the initial hash
for ( int i = 0; i < 22; i++ )
garden.nodes[i].algo = hash[i] % MINOTAUR_ALGO_COUNT;

// MinotaurX: override the algo for the last node with yespower
if ( opt_algo == ALGO_MINOTAURX )
garden.nodes[21].algo = MINOTAUR_ALGO_COUNT;

// Send the initial hash through the torture garden
TortureNode *node = &garden.nodes[0];

while ( node )
while ( rc && node )
{
get_hash( hash, hash, &garden, node->algo );
rc = get_hash( hash, hash, &garden, node->algo, thr_id );
node = node->child[ hash[63] & 1 ];
}

memcpy( output, hash, 32 );
return 1;
return rc;
}
int scanhash_minotaur( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));

@@ -277,7 +293,7 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce,
edata[19] = n;
if ( likely( algo_gate.hash( hash, edata, thr_id ) ) )
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash, mythr );

@@ -291,12 +307,14 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce,
return 0;
}

// The hash function has hooks for minotaurx
bool register_minotaur_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_minotaur;
gate->hash = (void*)&minotaur_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_minotaur;
gate->hash = (void*)&minotaur_hash;
gate->miner_thread_init = (void*)&initialize_torture_garden;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT;
return true;
};

@@ -1136,10 +1136,14 @@ int yespower(yespower_local_t *local,
ctx.S0 = S;
ctx.S1 = S + Swidth_to_Sbytes1( Swidth );

// copy prehash, do tail
memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx );
sha256_update( &sha256_ctx, src+64, srclen-64 );
sha256_final( &sha256_ctx, sha256 );
if ( srclen == 80 ) // assume a 64-byte prehash was done
{
memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx );
sha256_update( &sha256_ctx, src+64, srclen-64 );
sha256_final( &sha256_ctx, sha256 );
}
else
sha256_full( sha256, src, srclen );
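The 80-byte branch relies on the caller having already absorbed the first 64 bytes: a block header's first 64 bytes are exactly one SHA-256 block and contain no nonce, so their midstate can be computed once into sha256_prehash_ctx and cloned per nonce, leaving only the 16-byte tail to hash. A sketch of that caller-side pattern, with hypothetical names (sha256_ctx_t and hash_header are invented for illustration):

// Hypothetical per-nonce caller (names invented for illustration).
static void hash_header( uint8_t out[32], const uint8_t header[80],
                         const sha256_ctx_t *prehash )
{
   sha256_ctx_t c;
   memcpy( &c, prehash, sizeof c );        // midstate of header[0..63]
   sha256_update( &c, header + 64, 16 );   // tail, includes the nonce
   sha256_final( &c, out );
}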

if ( version == YESPOWER_0_5 )
{