This commit is contained in:
Jay D Dee
2023-10-06 22:18:09 -04:00
parent bc5a5c6df8
commit 31c4dedf59
144 changed files with 5931 additions and 3746 deletions

View File

@@ -34,7 +34,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
if (opt_benchmark)
HTarget = 0x7f;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );

View File

@@ -277,56 +277,56 @@ static const unsigned sigma[16][16] = {
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, 7 ) ^ Mx( r, 6 ), \
CSx( r, 5 ) ^ Mx( r, 4 ), \
CSx( r, 3 ) ^ Mx( r, 2 ), \
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, 6 ) ^ Mx( r, 7 ), \
CSx( r, 4 ) ^ Mx( r, 5 ), \
CSx( r, 2 ) ^ Mx( r, 3 ), \
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V0 = mm128_shufll_32( V0 ); \
V3 = mm128_swap_64( V3 ); \
V2 = mm128_shuflr_32( V2 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, D ) ^ Mx( r, C ), \
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
V0 = v128_shufll32( V0 ); \
V3 = v128_swap64( V3 ); \
V2 = v128_shuflr32( V2 ); \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, D ) ^ Mx( r, C ), \
CSx( r, B ) ^ Mx( r, A ), \
CSx( r, 9 ) ^ Mx( r, 8 ), \
CSx( r, F ) ^ Mx( r, E ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, C ) ^ Mx( r, D ), \
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, C ) ^ Mx( r, D ), \
CSx( r, A ) ^ Mx( r, B ), \
CSx( r, 8 ) ^ Mx( r, 9 ), \
CSx( r, E ) ^ Mx( r, F ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V0 = mm128_shuflr_32( V0 ); \
V3 = mm128_swap_64( V3 ); \
V2 = mm128_shufll_32( V2 ); \
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
V0 = v128_shuflr32( V0 ); \
V3 = v128_swap64( V3 ); \
V2 = v128_shufll32( V2 ); \
}
// Default is 14 rounds, blakecoin & vanilla are 8.
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1, int rounds )
{
__m128i V0, V1, V2, V3;
v128_t V0, V1, V2, V3;
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V0 = casti_m128i( H, 0 );
V1 = casti_m128i( H, 1 );
V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
V0 = casti_v128( H, 0 );
V1 = casti_v128( H, 1 );
V2 = v128_set_32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
V3 = v128_set_32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );
M0 = buf[ 0];
M1 = buf[ 1];
@@ -361,8 +361,8 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
BLAKE256_ROUND( 2 );
BLAKE256_ROUND( 3 );
}
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V0, V2 ) );
casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V1, V3 ) );
}
////////////////////////////////////////////
@@ -371,16 +371,16 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
_mm_xor_si128( v128_32( c1 ), m0 ) ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
_mm_xor_si128( v128_32( c0 ), m1 ) ); \
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
a = v128_add32( v128_add32( a, b ), \
v128_xor( v128_32( c1 ), m0 ) ); \
d = v128_swap32_16( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 12 ); \
a = v128_add32( v128_add32( a, b ), \
v128_xor( v128_32( c0 ), m1 ) ); \
d = v128_shuflr32_8( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 7 ); \
}
#define ROUND_S_4WAY(r) \
@@ -396,31 +396,31 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
}
#define DECL_STATE32_4WAY \
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
v128_t H0, H1, H2, H3, H4, H5, H6, H7; \
uint32_t T0, T1;
#define READ_STATE32_4WAY(state) do { \
H0 = casti_m128i( state->H, 0 ); \
H1 = casti_m128i( state->H, 1 ); \
H2 = casti_m128i( state->H, 2 ); \
H3 = casti_m128i( state->H, 3 ); \
H4 = casti_m128i( state->H, 4 ); \
H5 = casti_m128i( state->H, 5 ); \
H6 = casti_m128i( state->H, 6 ); \
H7 = casti_m128i( state->H, 7 ); \
H0 = casti_v128( state->H, 0 ); \
H1 = casti_v128( state->H, 1 ); \
H2 = casti_v128( state->H, 2 ); \
H3 = casti_v128( state->H, 3 ); \
H4 = casti_v128( state->H, 4 ); \
H5 = casti_v128( state->H, 5 ); \
H6 = casti_v128( state->H, 6 ); \
H7 = casti_v128( state->H, 7 ); \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE32_4WAY(state) do { \
casti_m128i( state->H, 0 ) = H0; \
casti_m128i( state->H, 1 ) = H1; \
casti_m128i( state->H, 2 ) = H2; \
casti_m128i( state->H, 3 ) = H3; \
casti_m128i( state->H, 4 ) = H4; \
casti_m128i( state->H, 5 ) = H5; \
casti_m128i( state->H, 6 ) = H6; \
casti_m128i( state->H, 7 ) = H7; \
casti_v128( state->H, 0 ) = H0; \
casti_v128( state->H, 1 ) = H1; \
casti_v128( state->H, 2 ) = H2; \
casti_v128( state->H, 3 ) = H3; \
casti_v128( state->H, 4 ) = H4; \
casti_v128( state->H, 5 ) = H5; \
casti_v128( state->H, 6 ) = H6; \
casti_v128( state->H, 7 ) = H7; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
@@ -430,7 +430,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
{ \
__m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
v128_t shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
@@ -454,32 +454,32 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
{ \
M0 = mm128_bswap_32( buf[0] ); \
M1 = mm128_bswap_32( buf[1] ); \
M2 = mm128_bswap_32( buf[2] ); \
M3 = mm128_bswap_32( buf[3] ); \
M4 = mm128_bswap_32( buf[4] ); \
M5 = mm128_bswap_32( buf[5] ); \
M6 = mm128_bswap_32( buf[6] ); \
M7 = mm128_bswap_32( buf[7] ); \
M8 = mm128_bswap_32( buf[8] ); \
M9 = mm128_bswap_32( buf[9] ); \
MA = mm128_bswap_32( buf[10] ); \
MB = mm128_bswap_32( buf[11] ); \
MC = mm128_bswap_32( buf[12] ); \
MD = mm128_bswap_32( buf[13] ); \
ME = mm128_bswap_32( buf[14] ); \
MF = mm128_bswap_32( buf[15] ); \
M0 = v128_bswap32( buf[0] ); \
M1 = v128_bswap32( buf[1] ); \
M2 = v128_bswap32( buf[2] ); \
M3 = v128_bswap32( buf[3] ); \
M4 = v128_bswap32( buf[4] ); \
M5 = v128_bswap32( buf[5] ); \
M6 = v128_bswap32( buf[6] ); \
M7 = v128_bswap32( buf[7] ); \
M8 = v128_bswap32( buf[8] ); \
M9 = v128_bswap32( buf[9] ); \
MA = v128_bswap32( buf[10] ); \
MB = v128_bswap32( buf[11] ); \
MC = v128_bswap32( buf[12] ); \
MD = v128_bswap32( buf[13] ); \
ME = v128_bswap32( buf[14] ); \
MF = v128_bswap32( buf[15] ); \
}
#endif // SSSE3 else SSE2
#define COMPRESS32_4WAY( rounds ) \
{ \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
v128_t M8, M9, MA, MB, MC, MD, ME, MF; \
v128_t V0, V1, V2, V3, V4, V5, V6, V7; \
v128_t V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -514,14 +514,14 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
H0 = v128_xor( v128_xor( V8, V0 ), H0 ); \
H1 = v128_xor( v128_xor( V9, V1 ), H1 ); \
H2 = v128_xor( v128_xor( VA, V2 ), H2 ); \
H3 = v128_xor( v128_xor( VB, V3 ), H3 ); \
H4 = v128_xor( v128_xor( VC, V4 ), H4 ); \
H5 = v128_xor( v128_xor( VD, V5 ), H5 ); \
H6 = v128_xor( v128_xor( VE, V6 ), H6 ); \
H7 = v128_xor( v128_xor( VF, V7 ), H7 ); \
}
#if defined (__AVX2__)
@@ -1867,14 +1867,14 @@ static void
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
casti_m128i( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
casti_m128i( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
casti_m128i( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 );
casti_m128i( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A );
casti_m128i( ctx->H, 4 ) = v128_64( 0x510E527F510E527F );
casti_m128i( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C );
casti_m128i( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB );
casti_m128i( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 );
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
casti_v128( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 );
casti_v128( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A );
casti_v128( ctx->H, 4 ) = v128_64( 0x510E527F510E527F );
casti_v128( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C );
casti_v128( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB );
casti_v128( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 );
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
@@ -1884,7 +1884,7 @@ static void
blake32_4way( blake_4way_small_context *ctx, const void *data,
size_t len )
{
__m128i *buf = (__m128i*)ctx->buf;
v128_t *buf = (v128_t*)ctx->buf;
size_t bptr = ctx->ptr<<2;
size_t vptr = ctx->ptr >> 2;
size_t blen = len << 2;
@@ -1925,7 +1925,7 @@ static void
blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m128i buf[16] __attribute__ ((aligned (64)));
v128_t buf[16] __attribute__ ((aligned (64)));
size_t ptr = ctx->ptr;
size_t vptr = ctx->ptr>>2;
unsigned bit_len = ( (unsigned)ptr << 3 );
@@ -1949,26 +1949,26 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
if ( vptr < 12 )
{
memset_zero_128( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
v128_memset_zero( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
buf[ 14 ] = v128_32( bswap_32( th ) );
buf[ 15 ] = v128_32( bswap_32( tl ) );
blake32_4way( ctx, buf + vptr, 64 - ptr );
}
else
{
memset_zero_128( buf + vptr + 1, (60-ptr) >> 2 );
v128_memset_zero( buf + vptr + 1, (60-ptr) >> 2 );
blake32_4way( ctx, buf + vptr, 64 - ptr );
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
memset_zero_128( buf, 56>>2 );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
v128_memset_zero( buf, 56>>2 );
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
buf[ 14 ] = v128_32( bswap_32( th ) );
buf[ 15 ] = v128_32( bswap_32( tl ) );
blake32_4way( ctx, buf, 64 );
}
mm128_block_bswap_32( (__m128i*)dst, (__m128i*)ctx->H );
v128_block_bswap32( (v128_t*)dst, (v128_t*)ctx->H );
}
#if defined (__AVX2__)

View File

@@ -138,7 +138,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm128_bswap32_80( endiandata, pdata );
v128_bswap32_80( endiandata, pdata );
do {
endiandata[19] = n;

View File

@@ -12,13 +12,13 @@
*/
#include "blake2s-hash.h"
#include "simd-utils.h"
#include <stdint.h>
#include <string.h>
#include <stdio.h>
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
/*
static const uint32_t blake2s_IV[8] =
@@ -78,43 +78,43 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( S->h[i], v128_32( p[i] ) );
S->h[i] = v128_xor( S->h[i], v128_32( p[i] ) );
return 0;
}
int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
int blake2s_4way_compress( blake2s_4way_state *S, const v128_t* block )
{
__m128i m[16];
__m128i v[16];
v128_t m[16];
v128_t v[16];
memcpy_128( m, block, 16 );
memcpy_128( v, S->h, 8 );
v128_memcpy( m, block, 16 );
v128_memcpy( v, S->h, 8 );
v[ 8] = v128_64( 0x6A09E6676A09E667ULL );
v[ 9] = v128_64( 0xBB67AE85BB67AE85ULL );
v[10] = v128_64( 0x3C6EF3723C6EF372ULL );
v[11] = v128_64( 0xA54FF53AA54FF53AULL );
v[12] = _mm_xor_si128( v128_32( S->t[0] ),
v[12] = v128_xor( v128_32( S->t[0] ),
v128_64( 0x510E527F510E527FULL ) );
v[13] = _mm_xor_si128( v128_32( S->t[1] ),
v[13] = v128_xor( v128_32( S->t[1] ),
v128_64( 0x9B05688C9B05688CULL ) );
v[14] = _mm_xor_si128( v128_32( S->f[0] ),
v[14] = v128_xor( v128_32( S->f[0] ),
v128_64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm_xor_si128( v128_32( S->f[1] ),
v[15] = v128_xor( v128_32( S->f[1] ),
v128_64( 0x5BE0CD195BE0CD19ULL ) );
#define G4W( sigma0, sigma1, a, b, c, d ) \
do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
a = v128_add32( v128_add32( a, b ), m[ s0 ] ); \
d = v128_swap32_16( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 12 ); \
a = v128_add32( v128_add32( a, b ), m[ s1 ] ); \
d = v128_shuflr32_8( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 7 ); \
} while(0)
@@ -143,7 +143,7 @@ do { \
ROUND4W( 9 );
for( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] );
S->h[i] = v128_xor( v128_xor( S->h[i], v[i] ), v[i + 8] );
#undef G4W
#undef ROUND4W
@@ -175,26 +175,26 @@ do { \
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
uint64_t inlen )
{
__m128i *input = (__m128i*)in;
__m128i *buf = (__m128i*)S->buf;
v128_t *input = (v128_t*)in;
v128_t *buf = (v128_t*)S->buf;
while( inlen > 0 )
{
size_t left = S->buflen;
if( inlen >= BLAKE2S_BLOCKBYTES - left )
if( inlen >= 64 - left )
{
memcpy_128( buf + (left>>2), input, (BLAKE2S_BLOCKBYTES - left) >> 2 );
S->buflen += BLAKE2S_BLOCKBYTES - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
v128_memcpy( buf + (left>>2), input, (64 - left) >> 2 );
S->buflen += 64 - left;
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_4way_compress( S, buf );
S->buflen = 0;
input += ( BLAKE2S_BLOCKBYTES >> 2 );
inlen -= BLAKE2S_BLOCKBYTES;
input += ( 64 >> 2 );
inlen -= 64;
}
else
{
memcpy_128( buf + ( left>>2 ), input, inlen>>2 );
v128_memcpy( buf + ( left>>2 ), input, inlen>>2 );
S->buflen += (size_t) inlen;
input += ( inlen>>2 );
inlen -= inlen;
@@ -205,7 +205,7 @@ int blake2s_4way_update( blake2s_4way_state *S, const void *in,
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
{
__m128i *buf = (__m128i*)S->buf;
v128_t *buf = (v128_t*)S->buf;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
@@ -213,12 +213,12 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
S->f[1] = ~0U;
S->f[0] = ~0U;
memset_zero_128( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
v128_memset_zero( buf + ( S->buflen>>2 ),
( 64 - S->buflen ) >> 2 );
blake2s_4way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m128i( out, i ) = S->h[ i ];
casti_v128( out, i ) = S->h[ i ];
return 0;
}
@@ -226,24 +226,24 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
const void *input, uint64_t inlen )
{
__m128i *in = (__m128i*)input;
__m128i *buf = (__m128i*)S->buf;
v128_t *in = (v128_t*)input;
v128_t *buf = (v128_t*)S->buf;
while( inlen > BLAKE2S_BLOCKBYTES )
while( inlen > 64 )
{
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
inlen -= BLAKE2S_BLOCKBYTES;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
v128_memcpy( buf, in, 64 >> 2 );
S->buflen = 64;
inlen -= 64;
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_4way_compress( S, buf );
S->buflen = 0;
in += ( BLAKE2S_BLOCKBYTES >> 2 );
in += ( 64 >> 2 );
}
// last block
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
v128_memcpy( buf, in, 64 >> 2 );
S->buflen = 64;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node ) S->f[1] = ~0U;
@@ -251,7 +251,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
blake2s_4way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m128i( out, i ) = S->h[ i ];
casti_v128( out, i ) = S->h[ i ];
return 0;
}
@@ -417,7 +417,7 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in,
{
__m256i *input = (__m256i*)in;
__m256i *buf = (__m256i*)S->buf;
const int bsize = BLAKE2S_BLOCKBYTES;
const int bsize = 64;
while( inlen > 0 )
{
@@ -426,8 +426,8 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in,
{
memcpy_256( buf + (left>>2), input, (bsize - left) >> 2 );
S->buflen += bsize - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_8way_compress( S, buf );
S->buflen = 0;
input += ( bsize >> 2 );
@@ -454,8 +454,7 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
S->f[1] = ~0U;
S->f[0] = ~0U;
memset_zero_256( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
memset_zero_256( buf + ( S->buflen>>2 ),( 64 - S->buflen ) >> 2 );
blake2s_8way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
@@ -470,21 +469,21 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
__m256i *in = (__m256i*)input;
__m256i *buf = (__m256i*)S->buf;
while( inlen > BLAKE2S_BLOCKBYTES )
while( inlen > 64 )
{
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
inlen -= BLAKE2S_BLOCKBYTES;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
memcpy_256( buf, in, 64 >> 2 );
S->buflen = 64;
inlen -= 64;
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_8way_compress( S, buf );
S->buflen = 0;
in += ( BLAKE2S_BLOCKBYTES >> 2 );
in += ( 64 >> 2 );
}
// last block
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
memcpy_256( buf, in, 64 >> 2 );
S->buflen = 64;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node ) S->f[1] = ~0U;
@@ -611,7 +610,7 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
{
__m512i *input = (__m512i*)in;
__m512i *buf = (__m512i*)S->buf;
const int bsize = BLAKE2S_BLOCKBYTES;
const int bsize = 64;
while( inlen > 0 )
{
@@ -620,8 +619,8 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
{
memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 );
S->buflen += bsize - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_16way_compress( S, buf );
S->buflen = 0;
input += ( bsize >> 2 );
@@ -649,7 +648,7 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen )
S->f[0] = ~0U;
memset_zero_512( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
( 64 - S->buflen ) >> 2 );
blake2s_16way_compress( S, buf );
for ( int i = 0; i < 8; ++i )

View File

@@ -14,7 +14,7 @@
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
#include "simd-utils.h"
@@ -29,41 +29,25 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
BLAKE2S_OUTBYTES = 32,
BLAKE2S_KEYBYTES = 32,
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
#pragma pack(push, 1)
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
} blake2s_nway_param;
#pragma pack(pop)
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct ALIGN( 64 ) __blake2s_4way_state
{
__m128i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
v128_t h[8];
uint8_t buf[ 64 * 4 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -83,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
typedef struct ALIGN( 64 ) __blake2s_8way_state
{
__m256i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
uint8_t buf[ 32 * 8 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -104,7 +88,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
typedef struct ALIGN( 64 ) __blake2s_16way_state
{
__m512i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
uint8_t buf[ 32 * 16 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -127,10 +111,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
#endif
#if defined(__cplusplus)
}
#endif
#endif // __SSE2__
#endif

View File

@@ -20,7 +20,7 @@ void blake2s_16way_hash( void *output, const void *input )
blake2s_16way_state ctx;
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
blake2s_16way_update( &ctx, input + (64<<4), 16 );
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
blake2s_16way_final( &ctx, output, 32 );
}
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
@@ -39,7 +39,7 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_init( &blake2s_16w_ctx, 32 );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
do {
@@ -76,7 +76,7 @@ void blake2s_8way_hash( void *output, const void *input )
blake2s_8way_state ctx;
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
blake2s_8way_update( &ctx, input + (64<<3), 16 );
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
blake2s_8way_final( &ctx, output, 32 );
}
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
@@ -95,7 +95,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_init( &blake2s_8w_ctx, 32 );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
@@ -131,7 +131,7 @@ void blake2s_4way_hash( void *output, const void *input )
blake2s_4way_state ctx;
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
blake2s_4way_final( &ctx, output, 32 );
}
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
@@ -149,8 +149,8 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, 32 );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
do {
@@ -183,12 +183,12 @@ static __thread blake2s_state blake2s_ctx;
void blake2s_hash( void *output, const void *input )
{
unsigned char _ALIGN(32) hash[BLAKE2S_OUTBYTES];
unsigned char _ALIGN(32) hash[32];
blake2s_state ctx __attribute__ ((aligned (32)));
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
blake2s_update( &ctx, input+64, 16 );
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
blake2s_final( &ctx, hash, 32 );
memcpy(output, hash, 32);
}
@@ -201,14 +201,13 @@ int scanhash_blake2s( struct work *work,uint32_t max_nonce,
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) endiandata[20];
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm128_bswap32_80( endiandata, pdata );
v128_bswap32_80( endiandata, pdata );
// midstate
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
blake2s_init( &blake2s_ctx, 32 );
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
do

View File

@@ -343,52 +343,52 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
#define BLAKE512_G( r, Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( CBx( r, Sd ) ^ Mx( r, Sc ), \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( CBx( r, Sd ) ^ Mx( r, Sc ), \
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 25 ); \
Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( CBx( r, Sc ) ^ Mx( r, Sd ), \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( CBx( r, Sc ) ^ Mx( r, Sd ), \
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 11 ); \
Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
}
#define BLAKE512_ROUND( R ) \
{ \
__m128i V32, V23, V67, V76; \
v128_t V32, V23, V67, V76; \
BLAKE512_G( R, V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE512_G( R, V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V32 = mm128_alignr_64( V[3], V[2], 1 ); \
V23 = mm128_alignr_64( V[2], V[3], 1 ); \
V67 = mm128_alignr_64( V[6], V[7], 1 ); \
V76 = mm128_alignr_64( V[7], V[6], 1 ); \
V32 = v128_alignr64( V[3], V[2], 1 ); \
V23 = v128_alignr64( V[2], V[3], 1 ); \
V67 = v128_alignr64( V[6], V[7], 1 ); \
V76 = v128_alignr64( V[7], V[6], 1 ); \
BLAKE512_G( R, V[0], V32, V[5], V67, 8, 9, A, B ); \
BLAKE512_G( R, V[1], V23, V[4], V76, C, D, E, F ); \
V[2] = mm128_alignr_64( V32, V23, 1 ); \
V[3] = mm128_alignr_64( V23, V32, 1 ); \
V[6] = mm128_alignr_64( V76, V67, 1 ); \
V[7] = mm128_alignr_64( V67, V76, 1 ); \
V[2] = v128_alignr64( V32, V23, 1 ); \
V[3] = v128_alignr64( V23, V32, 1 ); \
V[6] = v128_alignr64( V76, V67, 1 ); \
V[7] = v128_alignr64( V67, V76, 1 ); \
}
void blake512_transform( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 )
{
__m128i V[8];
v128_t V[8];
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V[0] = casti_m128i( H, 0 );
V[1] = casti_m128i( H, 1 );
V[2] = casti_m128i( H, 2 );
V[3] = casti_m128i( H, 3 );
V[4] = _mm_set_epi64x( CB1, CB0 );
V[5] = _mm_set_epi64x( CB3, CB2 );
V[6] = _mm_set_epi64x( T0 ^ CB5, T0 ^ CB4 );
V[7] = _mm_set_epi64x( T1 ^ CB7, T1 ^ CB6 );
V[0] = casti_v128( H, 0 );
V[1] = casti_v128( H, 1 );
V[2] = casti_v128( H, 2 );
V[3] = casti_v128( H, 3 );
V[4] = v128_set_64( CB1, CB0 );
V[5] = v128_set_64( CB3, CB2 );
V[6] = v128_set_64( T0 ^ CB5, T0 ^ CB4 );
V[7] = v128_set_64( T1 ^ CB7, T1 ^ CB6 );
M0 = bswap_64( buf[ 0] );
M1 = bswap_64( buf[ 1] );
@@ -424,10 +424,10 @@ void blake512_transform( uint64_t *H, const uint64_t *buf,
BLAKE512_ROUND( 4 );
BLAKE512_ROUND( 5 );
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V[0], V[4] );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V[1], V[5] );
casti_m128i( H, 2 ) = mm128_xor3( casti_m128i( H, 2 ), V[2], V[6] );
casti_m128i( H, 3 ) = mm128_xor3( casti_m128i( H, 3 ), V[3], V[7] );
casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V[0], V[4] ) );
casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V[1], V[5] ) );
casti_v128( H, 2 ) = v128_xor( casti_v128( H, 2 ), v128_xor( V[2], V[6] ) );
casti_v128( H, 3 ) = v128_xor( casti_v128( H, 3 ), v128_xor( V[3], V[7] ) );
}
#endif
@@ -611,7 +611,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
VD = v512_64( T0 ^ CB5 ); \
VE = v512_64( T1 ^ CB6 ); \
VF = v512_64( T1 ^ CB7 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
@@ -679,7 +679,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
VE = v512_64( sc->T1 ^ CB6 );
VF = v512_64( sc->T1 ^ CB7 );
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
@@ -1347,7 +1347,7 @@ blake512_8way_close(void *cc, void *dst)
VD = v256_64( T0 ^ CB5 ); \
VE = v256_64( T1 ^ CB6 ); \
VF = v256_64( T1 ^ CB7 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
@@ -1419,7 +1419,7 @@ void blake512_4way_compress( blake_4way_big_context *sc )
v256_64( CB6 ) );
VF = _mm256_xor_si256( v256_64( sc->T1 ),
v256_64( CB7 ) );
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );

View File

@@ -177,7 +177,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
HTarget = 0x7f;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );

View File

@@ -118,15 +118,15 @@ static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_
return 0;
}
static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] )
static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[8] )
{
memcpy( P->salt, salt, BLAKE2S_SALTBYTES );
memcpy( P->salt, salt, 8 );
return 0;
}
static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] )
static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[8] )
{
memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES );
memcpy( P->personal, personal, 8 );
return 0;
}
@@ -159,7 +159,7 @@ int blake2s_init( blake2s_state *S, const uint8_t outlen )
blake2s_param P[1];
/* Move interval verification here? */
if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
if ( ( !outlen ) || ( outlen > 32 ) ) return -1;
P->digest_length = outlen;
P->key_length = 0;
@@ -179,9 +179,9 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
{
blake2s_param P[1];
if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
if ( ( !outlen ) || ( outlen > 32 ) ) return -1;
if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1;
if ( !key || !keylen || keylen > 8 ) return -1;
P->digest_length = outlen;
P->key_length = keylen;
@@ -198,16 +198,16 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
if( blake2s_init_param( S, P ) < 0 ) return -1;
{
uint8_t block[BLAKE2S_BLOCKBYTES];
memset( block, 0, BLAKE2S_BLOCKBYTES );
uint8_t block[64];
memset( block, 0, 64 );
memcpy( block, key, keylen );
blake2s_update( S, block, BLAKE2S_BLOCKBYTES );
secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
blake2s_update( S, block, 64 );
secure_zero_memory( block, 64 ); /* Burn the key from stack */
}
return 0;
}
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
{
uint32_t _ALIGN(32) m[16];
uint32_t _ALIGN(32) v[16];
@@ -329,16 +329,16 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen )
while( inlen > 0 )
{
size_t left = S->buflen;
size_t fill = 2 * BLAKE2S_BLOCKBYTES - left;
size_t fill = 2 * 64 - left;
if( inlen > fill )
{
memcpy( S->buf + left, in, fill ); // Fill buffer
S->buflen += fill;
blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
blake2s_increment_counter( S, 64 );
blake2s_compress( S, S->buf ); // Compress
memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left
S->buflen -= BLAKE2S_BLOCKBYTES;
memcpy( S->buf, S->buf + 64, 64 ); // Shift buffer left
S->buflen -= 64;
in += fill;
inlen -= fill;
}
@@ -356,19 +356,19 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen )
int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
{
uint8_t buffer[BLAKE2S_OUTBYTES];
uint8_t buffer[32];
if( S->buflen > BLAKE2S_BLOCKBYTES )
if( S->buflen > 64 )
{
blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
blake2s_increment_counter( S, 64 );
blake2s_compress( S, S->buf );
S->buflen -= BLAKE2S_BLOCKBYTES;
memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen );
S->buflen -= 64;
memcpy( S->buf, S->buf + 64, S->buflen );
}
blake2s_increment_counter( S, ( uint32_t )S->buflen );
blake2s_set_lastblock( S );
memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
memset( S->buf + S->buflen, 0, 2 * 64 - S->buflen ); /* Padding */
blake2s_compress( S, S->buf );
for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
@@ -408,10 +408,10 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen
#include "blake2-kat.h" /* test data not included */
int main( int argc, char **argv )
{
uint8_t key[BLAKE2S_KEYBYTES];
uint8_t key[8];
uint8_t buf[KAT_LENGTH];
for( size_t i = 0; i < BLAKE2S_KEYBYTES; ++i )
for( size_t i = 0; i < 8; ++i )
key[i] = ( uint8_t )i;
for( size_t i = 0; i < KAT_LENGTH; ++i )
@@ -419,10 +419,10 @@ int main( int argc, char **argv )
for( size_t i = 0; i < KAT_LENGTH; ++i )
{
uint8_t hash[BLAKE2S_OUTBYTES];
blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
uint8_t hash[32];
blake2s( hash, buf, key, 32, i, );
if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) )
if( 0 != memcmp( hash, blake2s_keyed_kat[i], 32 ) )
{
puts( "error" );
return -1;

View File

@@ -87,19 +87,6 @@ static inline void secure_zero_memory(void *v, size_t n)
/* blake2.h */
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
BLAKE2S_OUTBYTES = 32,
BLAKE2S_KEYBYTES = 32,
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
#pragma pack(push, 1)
typedef struct __blake2s_param
{
@@ -112,8 +99,8 @@ extern "C" {
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_param;
typedef struct ALIGN( 64 ) __blake2s_state
@@ -121,13 +108,13 @@ extern "C" {
uint32_t h[8];
uint32_t t[2];
uint32_t f[2];
uint8_t buf[2 * BLAKE2S_BLOCKBYTES];
uint8_t buf[2 * 64];
size_t buflen;
uint8_t last_node;
} blake2s_state ;
#pragma pack(pop)
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] );
int blake2s_compress( blake2s_state *S, const uint8_t block[64] );
// Streaming API
int blake2s_init( blake2s_state *S, const uint8_t outlen );

View File

@@ -95,6 +95,43 @@
}
*/
#elif defined(__SSE2__) || defined(__NEON__) // ready for NEON
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_shuflr64_24( v128_xor( Vb, Vc ) ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m128i *V = (__m128i*)v; \
__m128i V2, V3, V6, V7; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = v128_alignr64( V[3], V[2], 1 ); \
V3 = v128_alignr64( V[2], V[3], 1 ); \
V6 = v128_alignr64( V[6], V[7], 1 ); \
V7 = v128_alignr64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = v128_alignr64( V2, V3, 1 ); \
V[3] = v128_alignr64( V3, V2, 1 ); \
V[6] = v128_alignr64( V7, V6, 1 ); \
V[7] = v128_alignr64( V6, V7, 1 ); \
}
/*
#elif defined(__SSE2__)
// always true
@@ -131,6 +168,7 @@
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
*/
#else
// never used, SSE2 is always available