This commit is contained in:
Jay D Dee
2023-06-14 11:07:40 -04:00
parent de564ccbde
commit 57a6b7b58b
31 changed files with 3724 additions and 3345 deletions

View File

@@ -598,10 +598,10 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m128_const1_64( 0x243F6A88243F6A88 ); \
V9 = m128_const1_64( 0x85A308D385A308D3 ); \
VA = m128_const1_64( 0x13198A2E13198A2E ); \
VB = m128_const1_64( 0x0370734403707344 ); \
V8 = _mm_set1_epi64x( 0x243F6A88243F6A88 ); \
V9 = _mm_set1_epi64x( 0x85A308D385A308D3 ); \
VA = _mm_set1_epi64x( 0x13198A2E13198A2E ); \
VB = _mm_set1_epi64x( 0x0370734403707344 ); \
VC = _mm_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm_set1_epi32( T1 ^ 0x082EFA98 ); \
@@ -958,7 +958,6 @@ do { \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
__m256i shuf_bswap32; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -967,16 +966,16 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
VA = m256_const1_64( 0x13198A2E13198A2E ); \
VB = m256_const1_64( 0x0370734403707344 ); \
V8 = _mm256_set1_epi64x( 0x243F6A88243F6A88 ); \
V9 = _mm256_set1_epi64x( 0x85A308D385A308D3 ); \
VA = _mm256_set1_epi64x( 0x13198A2E13198A2E ); \
VB = _mm256_set1_epi64x( 0x0370734403707344 ); \
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
const __m256i shuf_bswap32 = mm256_set2_64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
@@ -1034,10 +1033,10 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
VA = m256_const1_64( 0x13198A2E13198A2E ); \
VB = m256_const1_64( 0x0370734403707344 ); \
V8 = _mm256_set1_epi64x( 0x243F6A88243F6A88 ); \
V9 = _mm256_set1_epi64x( 0x85A308D385A308D3 ); \
VA = _mm256_set1_epi64x( 0x13198A2E13198A2E ); \
VB = _mm256_set1_epi64x( 0x0370734403707344 ); \
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
@@ -1100,23 +1099,23 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
V[ 5] = H[5];
V[ 6] = H[6];
V[ 7] = H[7];
V[ 8] = m256_const1_32( CS0 );
V[ 9] = m256_const1_32( CS1 );
V[10] = m256_const1_32( CS2 );
V[11] = m256_const1_32( CS3 );
V[12] = m256_const1_32( CS4 ^ 0x280 );
V[13] = m256_const1_32( CS5 ^ 0x280 );
V[14] = m256_const1_32( CS6 );
V[15] = m256_const1_32( CS7 );
V[ 8] = _mm256_set1_epi32( CS0 );
V[ 9] = _mm256_set1_epi32( CS1 );
V[10] = _mm256_set1_epi32( CS2 );
V[11] = _mm256_set1_epi32( CS3 );
V[12] = _mm256_set1_epi32( CS4 ^ 0x280 );
V[13] = _mm256_set1_epi32( CS5 ^ 0x280 );
V[14] = _mm256_set1_epi32( CS6 );
V[15] = _mm256_set1_epi32( CS7 );
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
// M[ 5:12, 14 ] are always zero and not needed or used.
// M[ 4], M[ 13], M[15] are constant and are initialized here.
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
M[ 4] = m256_const1_32( 0x80000000 );
M[ 4] = _mm256_set1_epi32( 0x80000000 );
M[13] = m256_one_32;
M[15] = m256_const1_32( 80*8 );
M[15] = _mm256_set1_epi32( 80*8 );
M[ 5] =_mm256_xor_si256( M[13], _mm256_set1_epi32( CSC ) );
@@ -1278,8 +1277,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
ROUND256_8WAY_3;
const __m256i shuf_bswap32 =
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
@@ -1615,7 +1613,8 @@ do { \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
__m512i shuf_bswap32; \
const __m512i shuf_bswap32 = mm512_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -1624,18 +1623,14 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
VA = m512_const1_64( 0x13198A2E13198A2E ); \
VB = m512_const1_64( 0x0370734403707344 ); \
V8 = _mm512_set1_epi64( 0x243F6A88243F6A88 ); \
V9 = _mm512_set1_epi64( 0x85A308D385A308D3 ); \
VA = _mm512_set1_epi64( 0x13198A2E13198A2E ); \
VB = _mm512_set1_epi64( 0x0370734403707344 ); \
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
@@ -1693,10 +1688,10 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
VA = m512_const1_64( 0x13198A2E13198A2E ); \
VB = m512_const1_64( 0x0370734403707344 ); \
V8 = _mm512_set1_epi64( 0x243F6A88243F6A88 ); \
V9 = _mm512_set1_epi64( 0x85A308D385A308D3 ); \
VA = _mm512_set1_epi64( 0x13198A2E13198A2E ); \
VB = _mm512_set1_epi64( 0x0370734403707344 ); \
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
@@ -1763,23 +1758,23 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
V[ 5] = H[5];
V[ 6] = H[6];
V[ 7] = H[7];
V[ 8] = m512_const1_32( CS0 );
V[ 9] = m512_const1_32( CS1 );
V[10] = m512_const1_32( CS2 );
V[11] = m512_const1_32( CS3 );
V[12] = m512_const1_32( CS4 ^ 0x280 );
V[13] = m512_const1_32( CS5 ^ 0x280 );
V[14] = m512_const1_32( CS6 );
V[15] = m512_const1_32( CS7 );
V[ 8] = _mm512_set1_epi32( CS0 );
V[ 9] = _mm512_set1_epi32( CS1 );
V[10] = _mm512_set1_epi32( CS2 );
V[11] = _mm512_set1_epi32( CS3 );
V[12] = _mm512_set1_epi32( CS4 ^ 0x280 );
V[13] = _mm512_set1_epi32( CS5 ^ 0x280 );
V[14] = _mm512_set1_epi32( CS6 );
V[15] = _mm512_set1_epi32( CS7 );
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
// M[ 5:12, 14 ] are always zero and not needed or used, except M[5] as noted.
// M[ 4], M[ 13], M[15] are constant and are initialized here.
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
M[ 4] = m512_const1_32( 0x80000000 );
M[ 4] = _mm512_set1_epi32( 0x80000000 );
M[13] = m512_one_32;
M[15] = m512_const1_32( 80*8 );
M[15] = _mm512_set1_epi32( 80*8 );
M[ 5] =_mm512_xor_si512( M[13], _mm512_set1_epi32( CSC ) );
@@ -1956,10 +1951,8 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
// Byte swap final hash
const __m512i shuf_bswap32 =
m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233,
0x2c2d2e2f28292a2b, 0x2425262720212223,
0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
@@ -1981,14 +1974,14 @@ static void
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
casti_m128i( ctx->H, 3 ) = m128_const1_64( 0xA54FF53AA54FF53A );
casti_m128i( ctx->H, 4 ) = m128_const1_64( 0x510E527F510E527F );
casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );
casti_m128i( ctx->H, 0 ) = _mm_set1_epi64x( 0x6A09E6676A09E667 );
casti_m128i( ctx->H, 1 ) = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
casti_m128i( ctx->H, 2 ) = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
casti_m128i( ctx->H, 3 ) = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
casti_m128i( ctx->H, 4 ) = _mm_set1_epi64x( 0x510E527F510E527F );
casti_m128i( ctx->H, 5 ) = _mm_set1_epi64x( 0x9B05688C9B05688C );
casti_m128i( ctx->H, 6 ) = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
casti_m128i( ctx->H, 7 ) = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
@@ -2059,13 +2052,13 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
else
ctx->T0 -= 512 - bit_len;
buf[vptr] = m128_const1_64( 0x0000008000000080 );
buf[vptr] = _mm_set1_epi64x( 0x0000008000000080 );
if ( vptr < 12 )
{
memset_zero_128( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
_mm_set1_epi64x( 0x0100000001000000ULL ) );
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
blake32_4way( ctx, buf + vptr, 64 - ptr );
@@ -2078,7 +2071,7 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
ctx->T1 = 0xFFFFFFFFUL;
memset_zero_128( buf, 56>>2 );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
_mm_set1_epi64x( 0x0100000001000000ULL ) );
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
blake32_4way( ctx, buf, 64 );
@@ -2097,14 +2090,14 @@ static void
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
{
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53AA54FF53A );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527F510E527F );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527F510E527F );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C9B05688C );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
@@ -2163,7 +2156,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m256_const1_64( 0x0000008000000080ULL );
buf[ptr>>2] = _mm256_set1_epi64x( 0x0000008000000080ULL );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -2185,7 +2178,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_or_si256( buf[52>>2],
m256_const1_64( 0x0100000001000000ULL ) );
_mm256_set1_epi64x( 0x0100000001000000ULL ) );
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
@@ -2198,7 +2191,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
buf[52>>2] = _mm256_set1_epi64x( 0x0100000001000000ULL );
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
blake32_8way( sc, buf, 64 );
@@ -2259,7 +2252,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m256_const1_32( 0x80000000 );
buf[ptr>>2] = _mm256_set1_epi32( 0x80000000 );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -2312,14 +2305,14 @@ static void
blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
{
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E6676A09E667 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE85BB67AE85 );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF3723C6EF372 );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53AA54FF53A );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527F510E527F );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C9B05688C );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9AB1F83D9AB );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD195BE0CD19 );
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E6676A09E667 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527F510E527F );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C9B05688C );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
@@ -2376,7 +2369,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m512_const1_64( 0x0000008000000080ULL );
buf[ptr>>2] = _mm512_set1_epi64( 0x0000008000000080ULL );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -2398,7 +2391,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm512_or_si512( buf[52>>2],
m512_const1_64( 0x0100000001000000ULL ) );
_mm512_set1_epi64( 0x0100000001000000ULL ) );
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
@@ -2411,7 +2404,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
sc->T1 = 0xFFFFFFFFUL;
memset_zero_512( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
buf[52>>2] = _mm512_set1_epi64( 0x0100000001000000ULL );
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
blake32_16way( sc, buf, 64 );
@@ -2473,7 +2466,7 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m512_const1_32( 0x80000000 );
buf[ptr>>2] = _mm512_set1_epi32( 0x80000000 );
tl = sc->T0 + bit_len;
th = sc->T1;

View File

@@ -350,7 +350,6 @@ static const sph_u64 CB[16] = {
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
__m512i shuf_bswap64; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -359,18 +358,16 @@ static const sph_u64 CB[16] = {
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m512_const1_64( CB0 ); \
V9 = m512_const1_64( CB1 ); \
VA = m512_const1_64( CB2 ); \
VB = m512_const1_64( CB3 ); \
V8 = _mm512_set1_epi64( CB0 ); \
V9 = _mm512_set1_epi64( CB1 ); \
VA = _mm512_set1_epi64( CB2 ); \
VB = _mm512_set1_epi64( CB3 ); \
VC = _mm512_set1_epi64( T0 ^ CB4 ); \
VD = _mm512_set1_epi64( T0 ^ CB5 ); \
VE = _mm512_set1_epi64( T1 ^ CB6 ); \
VF = _mm512_set1_epi64( T1 ^ CB7 ); \
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -419,7 +416,6 @@ void blake512_8way_compress( blake_8way_big_context *sc )
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i shuf_bswap64;
V0 = sc->H[0];
V1 = sc->H[1];
@@ -429,19 +425,17 @@ void blake512_8way_compress( blake_8way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
@@ -503,10 +497,10 @@ void blake512_8way_compress_le( blake_8way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
@@ -565,23 +559,23 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
sc->buf[10] = m512_const1_64( 0x8000000000000000ULL );
sc->buf[10] = _mm512_set1_epi64( 0x8000000000000000ULL );
sc->buf[11] =
sc->buf[12] = m512_zero;
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_zero;
sc->buf[15] = m512_const1_64( 80*8 );
sc->buf[15] = _mm512_set1_epi64( 80*8 );
// build working variables
V0 = sc->H[0];
@@ -592,10 +586,10 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
VE = _mm512_set1_epi64( CB6 );
@@ -790,14 +784,14 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
void blake512_8way_init( blake_8way_big_context *sc )
{
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -861,7 +855,7 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>3] = m512_const1_64( 0x80 );
buf[ptr>>3] = _mm512_set1_epi64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
@@ -882,9 +876,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
{
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
buf[104>>3] = _mm512_or_si512( buf[104>>3],
m512_const1_64( 0x0100000000000000ULL ) );
buf[112>>3] = m512_const1_64( bswap_64( th ) );
buf[120>>3] = m512_const1_64( bswap_64( tl ) );
_mm512_set1_epi64( 0x0100000000000000ULL ) );
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
}
@@ -896,9 +890,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
memset_zero_512( buf, 112>>3 );
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
buf[112>>3] = m512_const1_64( bswap_64( th ) );
buf[120>>3] = m512_const1_64( bswap_64( tl ) );
buf[104>>3] = _mm512_set1_epi64( 0x0100000000000000ULL );
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );
blake64_8way( sc, buf, 128 );
}
@@ -912,14 +906,14 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
// init
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -943,7 +937,7 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = m512_const1_64( 0x80 );
sc->buf[ptr64] = _mm512_set1_epi64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -961,9 +955,9 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
sc->T0 -= 1024 - bit_len;
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_const1_64( 0x0100000000000000ULL );
sc->buf[14] = m512_const1_64( bswap_64( th ) );
sc->buf[15] = m512_const1_64( bswap_64( tl ) );
sc->buf[13] = _mm512_set1_epi64( 0x0100000000000000ULL );
sc->buf[14] = _mm512_set1_epi64( bswap_64( th ) );
sc->buf[15] = _mm512_set1_epi64( bswap_64( tl ) );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
@@ -979,14 +973,14 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
// init
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -1010,7 +1004,7 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = m512_const1_64( 0x8000000000000000ULL );
sc->buf[ptr64] = _mm512_set1_epi64( 0x8000000000000000ULL );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -1029,8 +1023,8 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_const1_64( th );
sc->buf[15] = m512_const1_64( tl );
sc->buf[14] = _mm512_set1_epi64( th );
sc->buf[15] = _mm512_set1_epi64( tl );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
@@ -1092,7 +1086,6 @@ blake512_8way_close(void *cc, void *dst)
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
__m256i shuf_bswap64; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -1101,16 +1094,16 @@ blake512_8way_close(void *cc, void *dst)
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m256_const1_64( CB0 ); \
V9 = m256_const1_64( CB1 ); \
VA = m256_const1_64( CB2 ); \
VB = m256_const1_64( CB3 ); \
V8 = _mm256_set1_epi64x( CB0 ); \
V9 = _mm256_set1_epi64x( CB1 ); \
VA = _mm256_set1_epi64x( CB2 ); \
VB = _mm256_set1_epi64x( CB3 ); \
VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -1160,7 +1153,6 @@ void blake512_4way_compress( blake_4way_big_context *sc )
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i shuf_bswap64;
V0 = sc->H[0];
V1 = sc->H[1];
@@ -1170,20 +1162,20 @@ void blake512_4way_compress( blake_4way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m256_const1_64( CB0 );
V9 = m256_const1_64( CB1 );
VA = m256_const1_64( CB2 );
VB = m256_const1_64( CB3 );
V8 = _mm256_set1_epi64x( CB0 );
V9 = _mm256_set1_epi64x( CB1 );
VA = _mm256_set1_epi64x( CB2 );
VB = _mm256_set1_epi64x( CB3 );
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
m256_const1_64( CB4 ) );
_mm256_set1_epi64x( CB4 ) );
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
m256_const1_64( CB5 ) );
_mm256_set1_epi64x( CB5 ) );
VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
m256_const1_64( CB6 ) );
_mm256_set1_epi64x( CB6 ) );
VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
m256_const1_64( CB7 ) );
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
_mm256_set1_epi64x( CB7 ) );
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
@@ -1236,23 +1228,23 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
sc->buf[10] = m256_const1_64( 0x8000000000000000ULL );
sc->buf[10] = _mm256_set1_epi64x( 0x8000000000000000ULL );
sc->buf[11] = m256_zero;
sc->buf[12] = m256_zero;
sc->buf[13] = m256_one_64;
sc->buf[14] = m256_zero;
sc->buf[15] = m256_const1_64( 80*8 );
sc->buf[15] = _mm256_set1_epi64x( 80*8 );
// build working variables
V0 = sc->H[0];
@@ -1263,10 +1255,10 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m256_const1_64( CB0 );
V9 = m256_const1_64( CB1 );
VA = m256_const1_64( CB2 );
VB = m256_const1_64( CB3 );
V8 = _mm256_set1_epi64x( CB0 );
V9 = _mm256_set1_epi64x( CB1 );
VA = _mm256_set1_epi64x( CB2 );
VB = _mm256_set1_epi64x( CB3 );
VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
VE = _mm256_set1_epi64x( CB6 );
@@ -1446,14 +1438,14 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
void blake512_4way_init( blake_4way_big_context *sc )
{
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -1513,7 +1505,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>3] = m256_const1_64( 0x80 );
buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
@@ -1535,9 +1527,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
{
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
buf[104>>3] = _mm256_or_si256( buf[104>>3],
m256_const1_64( 0x0100000000000000ULL ) );
buf[112>>3] = m256_const1_64( bswap_64( th ) );
buf[120>>3] = m256_const1_64( bswap_64( tl ) );
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
@@ -1549,9 +1541,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( buf, 112>>3 );
buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
buf[112>>3] = m256_const1_64( bswap_64( th ) );
buf[120>>3] = m256_const1_64( bswap_64( tl ) );
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );
blake64_4way( sc, buf, 128 );
}
@@ -1565,14 +1557,14 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
// init
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -1596,7 +1588,7 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = m256_const1_64( 0x80 );
sc->buf[ptr64] = _mm256_set1_epi64x( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( sc->ptr == 0 )
@@ -1613,9 +1605,9 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
sc->T0 -= 1024 - bit_len;
memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m256_const1_64( 0x0100000000000000ULL );
sc->buf[14] = m256_const1_64( bswap_64( th ) );
sc->buf[15] = m256_const1_64( bswap_64( tl ) );
sc->buf[13] = _mm256_set1_epi64x( 0x0100000000000000ULL );
sc->buf[14] = _mm256_set1_epi64x( bswap_64( th ) );
sc->buf[15] = _mm256_set1_epi64x( bswap_64( tl ) );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;