Compare commits

...

4 Commits

Author     SHA1        Message  Date
Jay D Dee  1321ac474c  v3.20.1  2022-07-26 18:36:40 -04:00
Jay D Dee  40d07c0097  v3.20.0  2022-07-17 13:30:50 -04:00
Jay D Dee  f552f2b1e8  v3.19.9  2022-07-10 11:04:00 -04:00
Jay D Dee  26b8927632  v3.19.8  2022-05-27 18:12:30 -04:00
50 changed files with 2928 additions and 2961 deletions

View File

@@ -289,7 +289,7 @@ cpuminer_SOURCES = \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/blake2b-yp.c \
algo/yespower/crypto/hmac-blake2b.c \
algo/yespower/yescrypt-r8g.c \
algo/yespower/yespower-opt.c

View File

@@ -22,7 +22,7 @@ required.
Compile Instructions
--------------------
See INSTALL_LINUX or INSTALL_WINDOWS for compile instruuctions
See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
@@ -65,6 +65,36 @@ If not what makes it happen or not happen?
Change Log
----------
v3.20.1
sph_blake2b optimized 1-way SSSE3 & AVX2.
Removed duplicate Blake2b used by Power2b algo, will now use optimized sph_blake2b.
Removed imprecise hash & target display from rejected share log.
Share and target difficulty are now displayed only for low difficulty shares.
Updated configure.ac to check for AVX512 asm support.
Small optimization to Lyra2 SSE2.
v3.20.0
#375 Fixed segfault in algos using Groestl VAES due to use of uninitialized data.
v3.19.9
More Blake256, Blake512, Luffa & Cubehash prehash optimizations.
Relaxed some excessively strict data alignment that was negatively affecting performance.
v3.19.8
#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
url protocol specifier for requesting a secure stratum connection.
The full url, including the protocol, is now displayed in the stratum connect
log and the periodic summary log.
Small optimizations to Cubehash, AVX2 & AVX512.
Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
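For example, either of the following requests a secure stratum connection
(pool host, port and wallet are placeholders):

    cpuminer -a x11 -o stratum+tcps://pool.example.com:443 -u WALLET -p x
    cpuminer -a x11 -o stratum+ssl://pool.example.com:443 -u WALLET -p x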
v3.19.7
#369 Fixed time limited mining, --time-limit.

View File

@@ -49,6 +49,20 @@ extern "C"{
#define SPH_SIZE_blake512 512
/////////////////////////
//
// Blake-256 1 way SSE2
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1 );
/////////////////////////
//
// Blake-512 1 way SSE2
void blake512_transform_le( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 );
//////////////////////////
//
// Blake-256 4 way SSE2
@@ -98,6 +112,12 @@ typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
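// Split hash of the second block: round0_prehash_le is nonce independent
// and runs once per job, final_rounds_le runs once per nonce using the
// precalculated midstate.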
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
@@ -128,6 +148,12 @@ void blake512_4way_update( void *cc, const void *data, size_t len );
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data );
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -148,6 +174,14 @@ typedef blake_16way_small_context blake256_16way_context;
void blake256_16way_init(void *cc);
void blake256_16way_update(void *cc, const void *data, size_t len);
void blake256_16way_close(void *cc, void *dst);
// Expects data in little endian order, no byte swap needed
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
// 14 rounds, blake, decred
typedef blake_16way_small_context blake256r14_16way_context;
@@ -180,7 +214,12 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_hash_le80( void *hash, const void *data );
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data );
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate );
#endif // AVX512
#endif // AVX2

View File

@@ -5,6 +5,7 @@
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
* 2016-2022 JayDDee246@gmail.com
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@@ -304,6 +305,98 @@ static const sph_u32 CS[16] = {
#endif
/////////////////////////////////////////
//
// Blake-256 1 way SIMD
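// One round of Blake-256 on a 4x4 state held in four vectors, one row each.
// The first half computes G0-G3 down the columns in parallel; the row
// rotations (shufll/swap/shuflr) then align the diagonals as columns so the
// second half can compute G4-G7, and the final rotations restore row order.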
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
CSx( r, 5 ) ^ Mx( r, 4 ), \
CSx( r, 3 ) ^ Mx( r, 2 ), \
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
CSx( r, 4 ) ^ Mx( r, 5 ), \
CSx( r, 2 ) ^ Mx( r, 3 ), \
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V3 = mm128_shufll_32( V3 ); \
V2 = mm128_swap_64( V2 ); \
V1 = mm128_shuflr_32( V1 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, F ) ^ Mx( r, E ), \
CSx( r, D ) ^ Mx( r, C ), \
CSx( r, B ) ^ Mx( r, A ), \
CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, E ) ^ Mx( r, F ), \
CSx( r, C ) ^ Mx( r, D ), \
CSx( r, A ) ^ Mx( r, B ), \
CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V3 = mm128_shuflr_32( V3 ); \
V2 = mm128_swap_64( V2 ); \
V1 = mm128_shufll_32( V1 ); \
}
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1 )
{
__m128i V0, V1, V2, V3;
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V0 = casti_m128i( H, 0 );
V1 = casti_m128i( H, 1 );
V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );
M0 = buf[ 0];
M1 = buf[ 1];
M2 = buf[ 2];
M3 = buf[ 3];
M4 = buf[ 4];
M5 = buf[ 5];
M6 = buf[ 6];
M7 = buf[ 7];
M8 = buf[ 8];
M9 = buf[ 9];
MA = buf[10];
MB = buf[11];
MC = buf[12];
MD = buf[13];
ME = buf[14];
MF = buf[15];
BLAKE256_ROUND( 0 );
BLAKE256_ROUND( 1 );
BLAKE256_ROUND( 2 );
BLAKE256_ROUND( 3 );
BLAKE256_ROUND( 4 );
BLAKE256_ROUND( 5 );
BLAKE256_ROUND( 6 );
BLAKE256_ROUND( 7 );
BLAKE256_ROUND( 8 );
BLAKE256_ROUND( 9 );
BLAKE256_ROUND( 0 );
BLAKE256_ROUND( 1 );
BLAKE256_ROUND( 2 );
BLAKE256_ROUND( 3 );
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
}
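// Usage sketch (hypothetical, not part of this diff): compress one 64 byte
// message block, already in little endian order, with T = 512 bits counted:
//
//   uint32_t H[8];     // preloaded with the Blake-256 initial values
//   uint32_t buf[16];  // the message block
//   blake256_transform_le( H, buf, 512, 0 );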
////////////////////////////////////////////
//
// Blake-256 4 way
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
@@ -508,14 +601,10 @@ do { \
V9 = m128_const1_64( 0x85A308D385A308D3 ); \
VA = m128_const1_64( 0x13198A2E13198A2E ); \
VB = m128_const1_64( 0x0370734403707344 ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
m128_const1_64( 0xA4093822A4093822 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
m128_const1_64( 0x299F31D0299F31D0 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \
m128_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \
m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
VC = _mm_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm_set1_epi32( T1 ^ 0xEC4E6C89 ); \
BLAKE256_4WAY_BLOCK_BSWAP32; \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
@@ -548,6 +637,8 @@ do { \
#if defined (__AVX2__)
/////////////////////////////////
//
// Blake-256 8 way
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
@@ -626,14 +717,10 @@ do { \
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
VA = m256_const1_64( 0x13198A2E13198A2E ); \
VB = m256_const1_64( 0x0370734403707344 ); \
VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
m256_const1_64( 0xA4093822A4093822 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
m256_const1_64( 0x299F31D0299F31D0 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
m256_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
@@ -679,13 +766,247 @@ do { \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
#define COMPRESS32_8WAY_LE( rounds ) \
do { \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
VA = m256_const1_64( 0x13198A2E13198A2E ); \
VB = m256_const1_64( 0x0370734403707344 ); \
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
M0 = buf[ 0]; \
M1 = buf[ 1]; \
M2 = buf[ 2]; \
M3 = buf[ 3]; \
M4 = buf[ 4]; \
M5 = buf[ 5]; \
M6 = buf[ 6]; \
M7 = buf[ 7]; \
M8 = buf[ 8]; \
M9 = buf[ 9]; \
MA = buf[10]; \
MB = buf[11]; \
MC = buf[12]; \
MD = buf[13]; \
ME = buf[14]; \
MF = buf[15]; \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
ROUND_S_8WAY(3); \
ROUND_S_8WAY(4); \
ROUND_S_8WAY(5); \
ROUND_S_8WAY(6); \
ROUND_S_8WAY(7); \
if (rounds == 14) \
{ \
ROUND_S_8WAY(8); \
ROUND_S_8WAY(9); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
ROUND_S_8WAY(3); \
} \
H0 = mm256_xor3( V8, V0, H0 ); \
H1 = mm256_xor3( V9, V1, H1 ); \
H2 = mm256_xor3( VA, V2, H2 ); \
H3 = mm256_xor3( VB, V3, H3 ); \
H4 = mm256_xor3( VC, V4, H4 ); \
H5 = mm256_xor3( VD, V5, H5 ); \
H6 = mm256_xor3( VE, V6, H6 ); \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
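// As with the 16-way version below, the Blake-256 prehash of the second
// block is split into 2 parts. The first part is constant for every nonce
// and only needs to be run once per job. The second part is run for each
// nonce using the precalculated midstate and the hash from the first block.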
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
{
const __m256i *M = (const __m256i*)data;
__m256i *V = (__m256i*)midstate;
const __m256i *H = (const __m256i*)midhash;
V[ 0] = H[0];
V[ 1] = H[1];
V[ 2] = H[2];
V[ 3] = H[3];
V[ 4] = H[4];
V[ 5] = H[5];
V[ 6] = H[6];
V[ 7] = H[7];
V[ 8] = m256_const1_32( CS0 );
V[ 9] = m256_const1_32( CS1 );
V[10] = m256_const1_32( CS2 );
V[11] = m256_const1_32( CS3 );
V[12] = m256_const1_32( CS4 ^ 0x280 );
V[13] = m256_const1_32( CS5 ^ 0x280 );
V[14] = m256_const1_32( CS6 );
V[15] = m256_const1_32( CS7 );
// G0
GS_8WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
// G1
V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ),
_mm256_xor_si256( _mm256_set1_epi32( CS3 ), M[ 2] ) );
V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 );
V[ 9] = _mm256_add_epi32( V[ 9], V[13] );
V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G4
V[ 0] = _mm256_add_epi32( V[ 0],
_mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) );
// G6
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 7] ),
_mm256_xor_si256( _mm256_set1_epi32( CSD ), M[12] ) );
// G7
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
_mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 );
V[ 3] = _mm256_add_epi32( V[ 3],
_mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
}
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data )
{
__m256i *H = (__m256i*)final_hash;
const __m256i *h = (const __m256i*)midhash;
const __m256i *v= (const __m256i*)midstate;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
M0 = casti_m256i( data, 0 );
M1 = casti_m256i( data, 1 );
M2 = casti_m256i( data, 2 );
M3 = casti_m256i( data, 3 );
M4 = casti_m256i( data, 4 );
M5 = casti_m256i( data, 5 );
M6 = casti_m256i( data, 6 );
M7 = casti_m256i( data, 7 );
M8 = casti_m256i( data, 8 );
M9 = casti_m256i( data, 9 );
MA = casti_m256i( data, 10 );
MB = casti_m256i( data, 11 );
MC = casti_m256i( data, 12 );
MD = casti_m256i( data, 13 );
ME = casti_m256i( data, 14 );
MF = casti_m256i( data, 15 );
// Finish round 0
// G1
V1 = _mm256_add_epi32( V1,
_mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 );
V9 = _mm256_add_epi32( V9, VD );
V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 );
// G4
V0 = _mm256_add_epi32( V0, V5 );
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 );
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
_mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 );
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
// G5
GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// G6
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
_mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
// G7
V9 = _mm256_add_epi32( V9, VE );
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 );
V3 = _mm256_add_epi32( V3, V4 );
VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 );
V9 = _mm256_add_epi32( V9, VE );
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
ROUND_S_8WAY( 4 );
ROUND_S_8WAY( 5 );
ROUND_S_8WAY( 6 );
ROUND_S_8WAY( 7 );
ROUND_S_8WAY( 8 );
ROUND_S_8WAY( 9 );
ROUND_S_8WAY( 0 );
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
const __m256i shuf_bswap32 =
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
///////////////////////////////////////
//
// Blake-256 16 way AVX512
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
@@ -763,14 +1084,10 @@ do { \
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
VA = m512_const1_64( 0x13198A2E13198A2E ); \
VB = m512_const1_64( 0x0370734403707344 ); \
VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
m512_const1_64( 0xA4093822A4093822 ) ); \
VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
m512_const1_64( 0x299F31D0299F31D0 ) ); \
VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
m512_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
@@ -818,6 +1135,264 @@ do { \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
#define COMPRESS32_16WAY_LE( rounds ) \
do { \
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
VA = m512_const1_64( 0x13198A2E13198A2E ); \
VB = m512_const1_64( 0x0370734403707344 ); \
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
M0 = buf[ 0]; \
M1 = buf[ 1]; \
M2 = buf[ 2]; \
M3 = buf[ 3]; \
M4 = buf[ 4]; \
M5 = buf[ 5]; \
M6 = buf[ 6]; \
M7 = buf[ 7]; \
M8 = buf[ 8]; \
M9 = buf[ 9]; \
MA = buf[10]; \
MB = buf[11]; \
MC = buf[12]; \
MD = buf[13]; \
ME = buf[14]; \
MF = buf[15]; \
ROUND_S_16WAY(0); \
ROUND_S_16WAY(1); \
ROUND_S_16WAY(2); \
ROUND_S_16WAY(3); \
ROUND_S_16WAY(4); \
ROUND_S_16WAY(5); \
ROUND_S_16WAY(6); \
ROUND_S_16WAY(7); \
if (rounds == 14) \
{ \
ROUND_S_16WAY(8); \
ROUND_S_16WAY(9); \
ROUND_S_16WAY(0); \
ROUND_S_16WAY(1); \
ROUND_S_16WAY(2); \
ROUND_S_16WAY(3); \
} \
H0 = mm512_xor3( V8, V0, H0 ); \
H1 = mm512_xor3( V9, V1, H1 ); \
H2 = mm512_xor3( VA, V2, H2 ); \
H3 = mm512_xor3( VB, V3, H3 ); \
H4 = mm512_xor3( VC, V4, H4 ); \
H5 = mm512_xor3( VD, V5, H5 ); \
H6 = mm512_xor3( VE, V6, H6 ); \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
// Blake-256 prehash of the second block is split onto 2 parts. The first part
// is constant for every nonce and only needs to be run once per job. The
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
{
const __m512i *M = (const __m512i*)data;
__m512i *V = (__m512i*)midstate;
const __m512i *H = (const __m512i*)midhash;
V[ 0] = H[0];
V[ 1] = H[1];
V[ 2] = H[2];
V[ 3] = H[3];
V[ 4] = H[4];
V[ 5] = H[5];
V[ 6] = H[6];
V[ 7] = H[7];
V[ 8] = m512_const1_32( CS0 );
V[ 9] = m512_const1_32( CS1 );
V[10] = m512_const1_32( CS2 );
V[11] = m512_const1_32( CS3 );
V[12] = m512_const1_32( CS4 ^ 0x280 );
V[13] = m512_const1_32( CS5 ^ 0x280 );
V[14] = m512_const1_32( CS6 );
V[15] = m512_const1_32( CS7 );
// G0
GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
// G1, nonce is in M[3]
// GS_16WAY( M[ 2], M[ 3], CS2, CS3, V1, V5, V9, VD );
V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ),
_mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) );
V[13] = mm512_ror_32( _mm512_xor_si512( V[13], V[ 1] ), 16 );
V[ 9] = _mm512_add_epi32( V[ 9], V[13] );
V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G4
// GS_16WAY( M[ 8], M[ 9], CS8, CS9, V0, V5, VA, VF );
V[ 0] = _mm512_add_epi32( V[ 0],
_mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) );
// G5
// GS_16WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
// G6
// GS_16WAY( M[12], M[13], CSC, CSD, V2, V7, V8, VD );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 7] ),
_mm512_xor_si512( _mm512_set1_epi32( CSD ), M[12] ) );
// G7
// GS_16WAY( M[14], M[15], CSE, CSF, V3, V4, V9, VE );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 4] ),
_mm512_xor_si512( _mm512_set1_epi32( CSF ), M[14] ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 3] ), 16 );
V[ 3] = _mm512_add_epi32( V[ 3],
_mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
}
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data )
{
__m512i *H = (__m512i*)final_hash;
const __m512i *h = (const __m512i*)midhash;
const __m512i *v= (const __m512i*)midstate;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
M0 = casti_m512i( data, 0 );
M1 = casti_m512i( data, 1 );
M2 = casti_m512i( data, 2 );
M3 = casti_m512i( data, 3 );
M4 = casti_m512i( data, 4 );
M5 = casti_m512i( data, 5 );
M6 = casti_m512i( data, 6 );
M7 = casti_m512i( data, 7 );
M8 = casti_m512i( data, 8 );
M9 = casti_m512i( data, 9 );
MA = casti_m512i( data, 10 );
MB = casti_m512i( data, 11 );
MC = casti_m512i( data, 12 );
MD = casti_m512i( data, 13 );
ME = casti_m512i( data, 14 );
MF = casti_m512i( data, 15 );
// Finish round 0 with the nonce (M3) now available
// G0
// GS_16WAY( M0, M1, CS0, CS1, V0, V4, V8, VC );
// G1
// GS_16WAY( M2, M3, CS2, CS3, V1, V5, V9, VD );
V1 = _mm512_add_epi32( V1,
_mm512_xor_si512( _mm512_set1_epi32( CS2 ), M3 ) );
VD = mm512_ror_32( _mm512_xor_si512( VD, V1 ), 8 );
V9 = _mm512_add_epi32( V9, VD );
V5 = mm512_ror_32( _mm512_xor_si512( V5, V9 ), 7 );
// G2,G3
// GS_16WAY( M4, M5, CS4, CS5, V2, V6, VA, VE );
// GS_16WAY( M6, M7, CS6, CS7, V3, V7, VB, VF );
// G4
// GS_16WAY( M8, M9, CS8, CS9, V0, V5, VA, VF );
V0 = _mm512_add_epi32( V0, V5 );
VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 16 );
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 12 );
V0 = _mm512_add_epi32( V0, _mm512_add_epi32( V5,
_mm512_xor_si512( _mm512_set1_epi32( CS8 ), M9 ) ) );
VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 8 );
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 );
// G5
GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// G6
// GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 16 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 12 );
V2 = _mm512_add_epi32( _mm512_add_epi32( V2, V7 ),
_mm512_xor_si512( _mm512_set1_epi32( CSC ), MD ) );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 8 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 7 );
// G7
// GS_16WAY( ME, MF, CSE, CSF, V3, V4, V9, VE );
V9 = _mm512_add_epi32( V9, VE );
V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 12 );
V3 = _mm512_add_epi32( V3, V4 );
VE = mm512_ror_32( _mm512_xor_si512( VE, V3 ), 8 );
V9 = _mm512_add_epi32( V9, VE );
V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
ROUND_S_16WAY( 4 );
ROUND_S_16WAY( 5 );
ROUND_S_16WAY( 6 );
ROUND_S_16WAY( 7 );
ROUND_S_16WAY( 8 );
ROUND_S_16WAY( 9 );
ROUND_S_16WAY( 0 );
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
// Byte swap final hash
const __m512i shuf_bswap32 =
m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233,
0x2c2d2e2f28292a2b, 0x2425262720212223,
0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
}
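// Usage sketch (hypothetical miner loop, not part of this diff): vdata holds
// the 16-lane interleaved second block of an 80 byte header, little endian,
// with the 16 nonces in element 3 (see "nonce is in M[3]" above):
//
//   __m512i vdata[16], midstate[16], midhash[8], hash[8];
//   // ... interleave the header, hash the first block into midhash ...
//   blake256_16way_round0_prehash_le( midstate, midhash, vdata ); // per job
//   do
//   {
//      blake256_16way_final_rounds_le( hash, midstate, midhash, vdata );
//      // ... test the 16 lanes of hash against the target ...
//      vdata[3] = _mm512_add_epi32( vdata[3], _mm512_set1_epi32( 16 ) );
//   } while ( more nonces );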
#endif
// Blake-256 4 way
@@ -913,8 +1488,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
memset_zero_128( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
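// byte swapping the scalar once before broadcasting avoids a vector shuffle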
blake32_4way( ctx, buf + vptr, 64 - ptr );
}
else
@@ -926,8 +1501,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
memset_zero_128( buf, 56>>2 );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
blake32_4way( ctx, buf, 64 );
}
@@ -1033,22 +1608,117 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_or_si256( buf[52>>2],
m256_const1_64( 0x0100000001000000ULL ) );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, buf, 64 );
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
blake32_8way( sc, buf, 64 );
}
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
}
static void
blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_8WAY
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
{
memcpy_256( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32_8WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_8WAY_LE( sc->rounds );
ptr = 0;
}
}
WRITE_STATE32_8WAY(sc);
sc->ptr = ptr;
}
static void
blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m256_const1_32( 0x80000000 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
}
else
sc->T0 -= 512 - bit_len;
if ( ptr <= 52 )
{
memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_or_si256( buf[52>>2], m256_one_32 );
*(buf+(56>>2)) = _mm256_set1_epi32( th );
*(buf+(60>>2)) = _mm256_set1_epi32( tl );
blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m256_one_32;
*(buf+(56>>2)) = _mm256_set1_epi32( th );
*(buf+(60>>2)) = _mm256_set1_epi32( tl );
blake32_8way_le( sc, buf, 64 );
}
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
}
@@ -1117,7 +1787,6 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
WRITE_STATE32_16WAY(sc);
sc->ptr = ptr;
}
static void
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
@@ -1152,22 +1821,116 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
if ( out_size_w32 == 8 )
buf[52>>2] = _mm512_or_si512( buf[52>>2],
m512_const1_64( 0x0100000001000000ULL ) );
buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
memset_zero_512( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
blake32_16way( sc, buf, 64 );
}
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
}
static void
blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_16WAY
buf = sc->buf;
ptr = sc->ptr;
// only used when calling update with 80 bytes
if ( len < buf_size - ptr )
{
memcpy_512( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32_16WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_512( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = T0 + 512 ) < 512 )
T1 = T1 + 1;
COMPRESS32_16WAY_LE( sc->rounds );
ptr = 0;
}
}
WRITE_STATE32_16WAY(sc);
sc->ptr = ptr;
}
static void
blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m512_const1_32( 0x80000000 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr == 0 )
{
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
memset_zero_512( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
blake32_16way( sc, buf, 64 );
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFE00UL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 512 - bit_len;
if ( ptr <= 52 )
{
memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
buf[52>>2] = _mm512_or_si512( buf[52>>2], m512_one_32 );
buf[56>>2] = _mm512_set1_epi32( th );
buf[60>>2] = _mm512_set1_epi32( tl );
blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
memset_zero_512( buf, 56>>2 );
buf[52>>2] = m512_one_32;
buf[56>>2] = _mm512_set1_epi32( th );
buf[60>>2] = _mm512_set1_epi32( tl );
blake32_16way_le( sc, buf, 64 );
}
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
}
@@ -1190,6 +1953,18 @@ blake256_16way_close(void *cc, void *dst)
blake32_16way_close(cc, 0, 0, dst, 8);
}
void
blake256_16way_update_le(void *cc, const void *data, size_t len)
{
blake32_16way_le(cc, data, len);
}
void
blake256_16way_close_le(void *cc, void *dst)
{
blake32_16way_close_le(cc, 0, 0, dst, 8);
}
void blake256r14_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
@@ -1271,6 +2046,18 @@ blake256_8way_close(void *cc, void *dst)
blake32_8way_close(cc, 0, 0, dst, 8);
}
void
blake256_8way_update_le(void *cc, const void *data, size_t len)
{
blake32_8way_le(cc, data, len);
}
void
blake256_8way_close_le(void *cc, void *dst)
{
blake32_8way_close_le(cc, 0, 0, dst, 8);
}
#endif
// 14 rounds Blake, Decred

View File

@@ -52,6 +52,180 @@ static const uint8_t sigma[12][16] =
};
#define Z00 0
#define Z01 1
#define Z02 2
#define Z03 3
#define Z04 4
#define Z05 5
#define Z06 6
#define Z07 7
#define Z08 8
#define Z09 9
#define Z0A A
#define Z0B B
#define Z0C C
#define Z0D D
#define Z0E E
#define Z0F F
#define Z10 E
#define Z11 A
#define Z12 4
#define Z13 8
#define Z14 9
#define Z15 F
#define Z16 D
#define Z17 6
#define Z18 1
#define Z19 C
#define Z1A 0
#define Z1B 2
#define Z1C B
#define Z1D 7
#define Z1E 5
#define Z1F 3
#define Z20 B
#define Z21 8
#define Z22 C
#define Z23 0
#define Z24 5
#define Z25 2
#define Z26 F
#define Z27 D
#define Z28 A
#define Z29 E
#define Z2A 3
#define Z2B 6
#define Z2C 7
#define Z2D 1
#define Z2E 9
#define Z2F 4
#define Z30 7
#define Z31 9
#define Z32 3
#define Z33 1
#define Z34 D
#define Z35 C
#define Z36 B
#define Z37 E
#define Z38 2
#define Z39 6
#define Z3A 5
#define Z3B A
#define Z3C 4
#define Z3D 0
#define Z3E F
#define Z3F 8
#define Z40 9
#define Z41 0
#define Z42 5
#define Z43 7
#define Z44 2
#define Z45 4
#define Z46 A
#define Z47 F
#define Z48 E
#define Z49 1
#define Z4A B
#define Z4B C
#define Z4C 6
#define Z4D 8
#define Z4E 3
#define Z4F D
#define Z50 2
#define Z51 C
#define Z52 6
#define Z53 A
#define Z54 0
#define Z55 B
#define Z56 8
#define Z57 3
#define Z58 4
#define Z59 D
#define Z5A 7
#define Z5B 5
#define Z5C F
#define Z5D E
#define Z5E 1
#define Z5F 9
#define Z60 C
#define Z61 5
#define Z62 1
#define Z63 F
#define Z64 E
#define Z65 D
#define Z66 4
#define Z67 A
#define Z68 0
#define Z69 7
#define Z6A 6
#define Z6B 3
#define Z6C 9
#define Z6D 2
#define Z6E 8
#define Z6F B
#define Z70 D
#define Z71 B
#define Z72 7
#define Z73 E
#define Z74 C
#define Z75 1
#define Z76 3
#define Z77 9
#define Z78 5
#define Z79 0
#define Z7A F
#define Z7B 4
#define Z7C 8
#define Z7D 6
#define Z7E 2
#define Z7F A
#define Z80 6
#define Z81 F
#define Z82 E
#define Z83 9
#define Z84 B
#define Z85 3
#define Z86 0
#define Z87 8
#define Z88 C
#define Z89 2
#define Z8A D
#define Z8B 7
#define Z8C 1
#define Z8D 4
#define Z8E A
#define Z8F 5
#define Z90 A
#define Z91 2
#define Z92 8
#define Z93 4
#define Z94 7
#define Z95 6
#define Z96 1
#define Z97 5
#define Z98 F
#define Z99 B
#define Z9A 9
#define Z9B E
#define Z9C 3
#define Z9D C
#define Z9E D
#define Z9F 0
#define Mx(r, i) Mx_(Z ## r ## i)
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
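// Mx(r,i) expands to the message word selected by sigma for round r, column
// i, e.g. Mx(1,0) -> Z10 -> E -> ME. The two-level expansion forces the Z
// token to be fully substituted before it is pasted onto the M prefix.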
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define B2B8W_G(a, b, c, d, x, y) \

View File

@@ -361,14 +361,10 @@ static const sph_u64 CB[16] = {
V9 = m512_const1_64( CB1 ); \
VA = m512_const1_64( CB2 ); \
VB = m512_const1_64( CB3 ); \
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
m512_const1_64( CB4 ) ); \
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
m512_const1_64( CB5 ) ); \
VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
m512_const1_64( CB6 ) ); \
VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
m512_const1_64( CB7 ) ); \
VC = _mm512_set1_epi64( T0 ^ CB4 ); \
VD = _mm512_set1_epi64( T0 ^ CB5 ); \
VE = _mm512_set1_epi64( T1 ^ CB6 ); \
VF = _mm512_set1_epi64( T1 ^ CB7 ); \
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
@@ -435,14 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB4 ) );
VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB5 ) );
VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB6 ) );
VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB7 ) );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
@@ -493,6 +485,307 @@ void blake512_8way_compress( blake_8way_big_context *sc )
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
}
// Won't be needed once prehash is implemented.
void blake512_8way_compress_le( blake_8way_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = sc->buf[ 9];
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
ROUND_B_8WAY(6);
ROUND_B_8WAY(7);
ROUND_B_8WAY(8);
ROUND_B_8WAY(9);
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
}
// With final_le this forms a full hash in 2 parts from little endian data.
// All variables are hard coded for 80 bytes/lane.
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data )
{
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
sc->buf[10] = m512_const1_64( 0x8000000000000000ULL );
sc->buf[11] =
sc->buf[12] = m512_zero;
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_zero;
sc->buf[15] = m512_const1_64( 80*8 );
// build working variables
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
VE = _mm512_set1_epi64( CB6 );
VF = _mm512_set1_epi64( CB7 );
// round 0
GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
GB_8WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
GB_8WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
GB_8WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
// Do half of G4, skip the nonce
// GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF );
V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
V0 = _mm512_add_epi64( V0, V5 );
GB_8WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
GB_8WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
GB_8WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
// round 1
// G1
// GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD);
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( CB8 ),
sc->buf[ 4] ) );
// G2
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
V2 = _mm512_add_epi64( V2, V6 );
// G3
// GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF);
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
midstate[ 1] = V1;
midstate[ 2] = V2;
midstate[ 3] = V3;
midstate[ 4] = V4;
midstate[ 5] = V5;
midstate[ 6] = V6;
midstate[ 7] = V7;
midstate[ 8] = V8;
midstate[ 9] = V9;
midstate[10] = VA;
midstate[11] = VB;
midstate[12] = VC;
midstate[13] = VD;
midstate[14] = VE;
midstate[15] = VF;
}
// pick up where we left off, need the nonce now.
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i h[8] __attribute__ ((aligned (64)));
// Load data with new nonce
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = nonce;
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
V0 = midstate[ 0];
V1 = midstate[ 1];
V2 = midstate[ 2];
V3 = midstate[ 3];
V4 = midstate[ 4];
V5 = midstate[ 5];
V6 = midstate[ 6];
V7 = midstate[ 7];
V8 = midstate[ 8];
V9 = midstate[ 9];
VA = midstate[10];
VB = midstate[11];
VC = midstate[12];
VD = midstate[13];
VE = midstate[14];
VF = midstate[15];
// finish round 0 with the nonce now available
V0 = _mm512_add_epi64( V0, _mm512_xor_si512(
_mm512_set1_epi64( CB8 ), M9 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 );
// Round 1
// G0
GB_8WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
// G1
// GB_8WAY(Mx(1, 2), Mx(1, 3), CBx(1, 2), CBx(1, 3), V1, V5, V9, VD);
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
V1 = _mm512_add_epi64( V1, V5 );
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 16 );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 11 );
// G2
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
// V2 = _mm512_add_epi64( V2, V6 );
V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
_mm512_set1_epi64( CBF ), M9 ) );
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), MF ), V6 ) );
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 16 );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 11 );
// G3
// GB_8WAY(Mx(1, 6), Mx(1, 7), CBx(1, 6), CBx(1, 7), V3, V7, VB, VF);
// V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
// _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 16 );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 11 );
// G4, G5, G6, G7
GB_8WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
GB_8WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
GB_8WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
GB_8WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
// remaining rounds
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
ROUND_B_8WAY(6);
ROUND_B_8WAY(7);
ROUND_B_8WAY(8);
ROUND_B_8WAY(9);
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
h[0] = mm512_xor3( V8, V0, sc->H[0] );
h[1] = mm512_xor3( V9, V1, sc->H[1] );
h[2] = mm512_xor3( VA, V2, sc->H[2] );
h[3] = mm512_xor3( VB, V3, sc->H[3] );
h[4] = mm512_xor3( VC, V4, sc->H[4] );
h[5] = mm512_xor3( VD, V5, sc->H[5] );
h[6] = mm512_xor3( VE, V6, sc->H[6] );
h[7] = mm512_xor3( VF, V7, sc->H[7] );
// bswap final hash
mm512_block_bswap_64( (__m512i*)hash, h );
}
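// Usage sketch (hypothetical, not part of this diff): hash 80 bytes per
// lane of little endian data, with the 8 nonces supplied as a vector.
// The same context must be passed to both calls: prehash_le fills sc->buf
// and sc->H, which final_le reads back.
//
//   __m512i midstate[16] __attribute__ ((aligned (64)));
//   blake512_8way_prehash_le( &ctx, midstate, vdata );      // once per job
//   // then, for each group of 8 nonces:
//   blake512_8way_final_le( &ctx, hash, vnonce, midstate );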
void blake512_8way_init( blake_8way_big_context *sc )
{
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
@@ -678,6 +971,73 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len )
{
// init
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
// update
memcpy_512( sc->buf, (__m512i*)data, len>>3 );
sc->ptr = len;
if ( len == 128 )
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
sc->ptr = 0;
}
// close
size_t ptr64 = sc->ptr >> 3;
unsigned bit_len;
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = m512_const1_64( 0x8000000000000000ULL );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr64 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 1024 - bit_len;
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_const1_64( th );
sc->buf[15] = m512_const1_64( tl );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
void
blake512_8way_update(void *cc, const void *data, size_t len)
{
@@ -741,14 +1101,10 @@ blake512_8way_close(void *cc, void *dst)
V9 = m256_const1_64( CB1 ); \
VA = m256_const1_64( CB2 ); \
VB = m256_const1_64( CB3 ); \
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
m256_const1_64( CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
m256_const1_64( CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
m256_const1_64( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
m256_const1_64( CB7 ) ); \
VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
@@ -869,6 +1225,221 @@ void blake512_4way_compress( blake_4way_big_context *sc )
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
}
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data )
{
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
sc->buf[10] = m256_const1_64( 0x8000000000000000ULL );
sc->buf[11] = m256_zero;
sc->buf[12] = m256_zero;
sc->buf[13] = m256_one_64;
sc->buf[14] = m256_zero;
sc->buf[15] = m256_const1_64( 80*8 );
// build working variables
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m256_const1_64( CB0 );
V9 = m256_const1_64( CB1 );
VA = m256_const1_64( CB2 );
VB = m256_const1_64( CB3 );
VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
VE = _mm256_set1_epi64x( CB6 );
VF = _mm256_set1_epi64x( CB7 );
// round 0
GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
GB_4WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
GB_4WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
GB_4WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
// G4 skip nonce
V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
V0 = _mm256_add_epi64( V0, V5 );
GB_4WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
GB_4WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
GB_4WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
// round 1
// G1
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ),
sc->buf[ 4] ) );
// G2
V2 = _mm256_add_epi64( V2, V6 );
// G3
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
midstate[ 1] = V1;
midstate[ 2] = V2;
midstate[ 3] = V3;
midstate[ 4] = V4;
midstate[ 5] = V5;
midstate[ 6] = V6;
midstate[ 7] = V7;
midstate[ 8] = V8;
midstate[ 9] = V9;
midstate[10] = VA;
midstate[11] = VB;
midstate[12] = VC;
midstate[13] = VD;
midstate[14] = VE;
midstate[15] = VF;
}
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i h[8] __attribute__ ((aligned (64)));
// Load data with new nonce
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = nonce;
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
V0 = midstate[ 0];
V1 = midstate[ 1];
V2 = midstate[ 2];
V3 = midstate[ 3];
V4 = midstate[ 4];
V5 = midstate[ 5];
V6 = midstate[ 6];
V7 = midstate[ 7];
V8 = midstate[ 8];
V9 = midstate[ 9];
VA = midstate[10];
VB = midstate[11];
VC = midstate[12];
VD = midstate[13];
VE = midstate[14];
VF = midstate[15];
// finish round 0, with the nonce now available
V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
_mm256_set1_epi64x( CB8 ), M9 ) );
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
// Round 1
// G0
GB_4WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
// G1
V1 = _mm256_add_epi64( V1, V5 );
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
// G2
V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
_mm256_set1_epi64x( CBF ), M9 ) );
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), MF ), V6 ) );
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
// G3
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );
// G4, G5, G6, G7
GB_4WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
GB_4WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
GB_4WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
GB_4WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
ROUND_B_4WAY(6);
ROUND_B_4WAY(7);
ROUND_B_4WAY(8);
ROUND_B_4WAY(9);
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
h[0] = mm256_xor3( V8, V0, sc->H[0] );
h[1] = mm256_xor3( V9, V1, sc->H[1] );
h[2] = mm256_xor3( VA, V2, sc->H[2] );
h[3] = mm256_xor3( VB, V3, sc->H[3] );
h[4] = mm256_xor3( VC, V4, sc->H[4] );
h[5] = mm256_xor3( VD, V5, sc->H[5] );
h[6] = mm256_xor3( VE, V6, sc->H[6] );
h[7] = mm256_xor3( VF, V7, sc->H[7] );
// bswap final hash
mm256_block_bswap_64( (__m256i*)hash, h );
}
void blake512_4way_init( blake_4way_big_context *sc )
{
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );

View File

@@ -630,6 +630,69 @@ static const sph_u64 CB[16] = {
H7 ^= S3 ^ V7 ^ VF; \
} while (0)
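// Same as COMPRESS32 but loads the message words directly, without the big
// endian byte swap; the input must already be in little endian order.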
#define COMPRESS32_LE do { \
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = S0 ^ CS0; \
V9 = S1 ^ CS1; \
VA = S2 ^ CS2; \
VB = S3 ^ CS3; \
VC = T0 ^ CS4; \
VD = T0 ^ CS5; \
VE = T1 ^ CS6; \
VF = T1 ^ CS7; \
M0 = *((uint32_t*)(buf + 0)); \
M1 = *((uint32_t*)(buf + 4)); \
M2 = *((uint32_t*)(buf + 8)); \
M3 = *((uint32_t*)(buf + 12)); \
M4 = *((uint32_t*)(buf + 16)); \
M5 = *((uint32_t*)(buf + 20)); \
M6 = *((uint32_t*)(buf + 24)); \
M7 = *((uint32_t*)(buf + 28)); \
M8 = *((uint32_t*)(buf + 32)); \
M9 = *((uint32_t*)(buf + 36)); \
MA = *((uint32_t*)(buf + 40)); \
MB = *((uint32_t*)(buf + 44)); \
MC = *((uint32_t*)(buf + 48)); \
MD = *((uint32_t*)(buf + 52)); \
ME = *((uint32_t*)(buf + 56)); \
MF = *((uint32_t*)(buf + 60)); \
ROUND_S(0); \
ROUND_S(1); \
ROUND_S(2); \
ROUND_S(3); \
ROUND_S(4); \
ROUND_S(5); \
ROUND_S(6); \
ROUND_S(7); \
if (BLAKE32_ROUNDS == 14) { \
ROUND_S(8); \
ROUND_S(9); \
ROUND_S(0); \
ROUND_S(1); \
ROUND_S(2); \
ROUND_S(3); \
} \
H0 ^= S0 ^ V0 ^ V8; \
H1 ^= S1 ^ V1 ^ V9; \
H2 ^= S2 ^ V2 ^ VA; \
H3 ^= S3 ^ V3 ^ VB; \
H4 ^= S0 ^ V4 ^ VC; \
H5 ^= S1 ^ V5 ^ VD; \
H6 ^= S2 ^ V6 ^ VE; \
H7 ^= S3 ^ V7 ^ VF; \
} while (0)
#endif
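COMPRESS32_LE above differs from COMPRESS32 only in the message load: the M words are read directly instead of being decoded big-endian, saving a byte swap per word when the caller already holds the block in little-endian order. A hedged one-word comparison, assuming sph_dec32be_aligned is the decoder used by the big-endian path:

M0 = sph_dec32be_aligned(buf + 0);  /* COMPRESS32: byte-swapping decode */
M0 = *((uint32_t*)(buf + 0));       /* COMPRESS32_LE: direct load */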
#if SPH_64
@@ -843,6 +906,45 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)
sc->ptr = ptr;
}
static void
blake32_le(sph_blake_small_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
DECL_STATE32
buf = sc->buf;
ptr = sc->ptr;
if (len < (sizeof sc->buf) - ptr) {
memcpy(buf + ptr, data, len);
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32(sc);
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
ptr += clen;
data = (const unsigned char *)data + clen;
len -= clen;
if (ptr == sizeof sc->buf) {
if ((T0 = SPH_T32(T0 + 512)) < 512)
T1 = SPH_T32(T1 + 1);
COMPRESS32_LE;
ptr = 0;
}
}
WRITE_STATE32(sc);
sc->ptr = ptr;
}
static void
blake32_close(sph_blake_small_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
@@ -1050,6 +1152,12 @@ sph_blake256(void *cc, const void *data, size_t len)
blake32(cc, data, len);
}
void
sph_blake256_update_le(void *cc, const void *data, size_t len)
{
blake32_le(cc, data, len);
}
/* see sph_blake.h */
void
sph_blake256_close(void *cc, void *dst)

View File

@@ -198,6 +198,7 @@ void sph_blake256_init(void *cc);
* @param len the input data length (in bytes)
*/
void sph_blake256(void *cc, const void *data, size_t len);
void sph_blake256_update_le(void *cc, const void *data, size_t len);
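A hedged usage sketch of the new little-endian update, where data80 is a hypothetical 80 byte block already in little-endian word order and hash32 a 32 byte output buffer:

sph_blake256_context ctx;
sph_blake256_init( &ctx );
sph_blake256_update_le( &ctx, data80, 80 );
sph_blake256_close( &ctx, hash32 );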
/**
* Terminate the current BLAKE-256 computation and output the result into

View File

@@ -30,16 +30,10 @@
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "algo/sha/sph_types.h"
#include "sph_blake2b.h"
// Cyclic right rotation.
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
// Little-endian byte access.
#define B2B_GET64(p) \
@@ -54,45 +48,131 @@
// G Mixing function.
#define B2B_G(a, b, c, d, x, y) { \
v[a] = v[a] + v[b] + x; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 24); \
v[a] = v[a] + v[b] + y; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 63); }
#if defined(__AVX2__)
#define BLAKE2B_G( R, Sa, Sb, Sc, Sd, Na, Nb ) \
{ \
V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
_mm256_set_epi64x( m[ sigma[R][Sd] ], m[ sigma[R][Sc] ], \
m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), Na ); \
V[2] = _mm256_add_epi64( V[2], V[3] ); \
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), Nb ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m256i *V = (__m256i*)v; \
BLAKE2B_G( R, 0, 2, 4, 6, 32, 24 ); \
BLAKE2B_G( R, 1, 3, 5, 7, 16, 63 ); \
V[3] = mm256_shufll_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shuflr_64( V[1] ); \
BLAKE2B_G( R, 8, 10, 12, 14, 32, 24 ); \
BLAKE2B_G( R, 9, 11, 13, 15, 16, 63 ); \
V[3] = mm256_shuflr_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shufll_64( V[1] ); \
}
#elif defined(__SSSE3__)
#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb, Na, Nb ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
Vd = mm128_ror_64( _mm_xor_si128( Vd, Va ), Na ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), Nb ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m128i *V = (__m128i*)v; \
__m128i V2, V3, V6, V7; \
BLAKE2B_G( R, V[0], V[2], V[4], V[6], 0, 2, 32, 24 ); \
BLAKE2B_G( R, V[0], V[2], V[4], V[6], 1, 3, 16, 63 ); \
BLAKE2B_G( R, V[1], V[3], V[5], V[7], 4, 6, 32, 24 ); \
BLAKE2B_G( R, V[1], V[3], V[5], V[7], 5, 7, 16, 63 ); \
V2 = mm128_shufl2r_64( V[2], V[3] ); \
V3 = mm128_shufl2r_64( V[3], V[2] ); \
V6 = mm128_shufl2l_64( V[6], V[7] ); \
V7 = mm128_shufl2l_64( V[7], V[6] ); \
BLAKE2B_G( R, V[0], V2, V[5], V6, 8, 10, 32, 24 ); \
BLAKE2B_G( R, V[0], V2, V[5], V6, 9, 11, 16, 63 ); \
BLAKE2B_G( R, V[1], V3, V[4], V7, 12, 14, 32, 24 ); \
BLAKE2B_G( R, V[1], V3, V[4], V7, 13, 15, 16, 63 ); \
V[2] = mm128_shufl2l_64( V2, V3 ); \
V[3] = mm128_shufl2l_64( V3, V2 ); \
V[6] = mm128_shufl2r_64( V6, V7 ); \
V[7] = mm128_shufl2r_64( V7, V6 ); \
}
#else
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb ) \
{ \
Va = Va + Vb + m[ sigma[R][Sa] ]; \
Vd = ROTR64( Vd ^ Va, 32 ); \
Vc = Vc + Vd; \
Vb = ROTR64( Vb ^ Vc, 24 ); \
Va = Va + Vb + m[ sigma[R][Sb] ]; \
Vd = ROTR64( Vd ^ Va, 16 ); \
Vc = Vc + Vd; \
Vb = ROTR64( Vb ^ Vc, 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
BLAKE2B_G( R, v[ 0], v[ 4], v[ 8], v[12], 0, 1 ); \
BLAKE2B_G( R, v[ 1], v[ 5], v[ 9], v[13], 2, 3 ); \
BLAKE2B_G( R, v[ 2], v[ 6], v[10], v[14], 4, 5 ); \
BLAKE2B_G( R, v[ 3], v[ 7], v[11], v[15], 6, 7 ); \
BLAKE2B_G( R, v[ 0], v[ 5], v[10], v[15], 8, 9 ); \
BLAKE2B_G( R, v[ 1], v[ 6], v[11], v[12], 10, 11 ); \
BLAKE2B_G( R, v[ 2], v[ 7], v[ 8], v[13], 12, 13 ); \
BLAKE2B_G( R, v[ 3], v[ 4], v[ 9], v[14], 14, 15 ); \
}
#endif
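The Blake2b rotation counts favour shuffles over shift-and-or: rotating each 64-bit lane by 32 is a 32-bit shuffle, 24 and 16 are byte permutes with pshufb on SSSE3, and only 63 needs genuine shifts. A minimal sketch of the cheapest case:

#include <immintrin.h>
// Hedged sketch: rotate both 64-bit lanes right by 32 with one shuffle.
static inline __m128i ror64x32_sketch( __m128i x )
{
   return _mm_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}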
// Initialization Vector.
static const uint64_t blake2b_iv[8] = {
static const uint64_t blake2b_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
static const uint8_t sigma[12][16] __attribute__ ((aligned (32))) =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
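// Hoisted to file scope, aligned, so the BLAKE2B_ROUND macros can
// reference it; previously it was a local array rebuilt on every call
// to blake2b_compress, as the removed copy below shows.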
// Compression function. "last" flag indicates last block.
static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
{
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
uint64_t v[16], m[16];
uint64_t v[16] __attribute__ ((aligned (32)));
uint64_t m[16] __attribute__ ((aligned (32)));
int i;
for (i = 0; i < 8; i++) { // init work variables
v[i] = ctx->h[i];
@@ -106,16 +186,8 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
for (i = 0; i < 16; i++) // get little-endian words
m[i] = B2B_GET64(&ctx->b[8 * i]);
for (i = 0; i < 12; i++) { // twelve rounds
B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
}
for (i = 0; i < 12; i++)
BLAKE2B_ROUND( i );
for( i = 0; i < 8; ++i )
ctx->h[i] ^= v[i] ^ v[i + 8];

View File

@@ -54,14 +54,12 @@ static void transform_4way( cube_4way_context *sp )
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
x7 = _mm512_add_epi32( x3, x7 );
y0 = x0;
y1 = x1;
x0 = mm512_rol_32( x2, 7 );
x1 = mm512_rol_32( x3, 7 );
x2 = mm512_rol_32( y0, 7 );
x3 = mm512_rol_32( y1, 7 );
x0 = _mm512_xor_si512( x0, x4 );
x1 = _mm512_xor_si512( x1, x5 );
y0 = mm512_rol_32( x2, 7 );
y1 = mm512_rol_32( x3, 7 );
x2 = mm512_rol_32( x0, 7 );
x3 = mm512_rol_32( x1, 7 );
x0 = _mm512_xor_si512( y0, x4 );
x1 = _mm512_xor_si512( y1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap128_64( x4 );
@@ -72,15 +70,13 @@ static void transform_4way( cube_4way_context *sp )
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
x7 = _mm512_add_epi32( x3, x7 );
y0 = x0;
y1 = x2;
x0 = mm512_rol_32( x1, 11 );
x1 = mm512_rol_32( y0, 11 );
x2 = mm512_rol_32( x3, 11 );
x3 = mm512_rol_32( y1, 11 );
x0 = _mm512_xor_si512( x0, x4 );
y0 = mm512_rol_32( x1, 11 );
x1 = mm512_rol_32( x0, 11 );
y1 = mm512_rol_32( x3, 11 );
x3 = mm512_rol_32( x2, 11 );
x0 = _mm512_xor_si512( y0, x4 );
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x2 = _mm512_xor_si512( y1, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap64_32( x4 );
x5 = mm512_swap64_32( x5 );
@@ -131,83 +127,67 @@ static void transform_4way_2buf( cube_4way_2buf_context *sp )
{
x4 = _mm512_add_epi32( x0, x4 );
y4 = _mm512_add_epi32( y0, y4 );
tx0 = x0;
ty0 = y0;
x5 = _mm512_add_epi32( x1, x5 );
y5 = _mm512_add_epi32( y1, y5 );
tx1 = x1;
ty1 = y1;
x0 = mm512_rol_32( x2, 7 );
y0 = mm512_rol_32( y2, 7 );
tx0 = mm512_rol_32( x2, 7 );
ty0 = mm512_rol_32( y2, 7 );
tx1 = mm512_rol_32( x3, 7 );
ty1 = mm512_rol_32( y3, 7 );
x6 = _mm512_add_epi32( x2, x6 );
y6 = _mm512_add_epi32( y2, y6 );
x1 = mm512_rol_32( x3, 7 );
y1 = mm512_rol_32( y3, 7 );
y6 = _mm512_add_epi32( y2, y6 );
x7 = _mm512_add_epi32( x3, x7 );
y7 = _mm512_add_epi32( y3, y7 );
x2 = mm512_rol_32( tx0, 7 );
y2 = mm512_rol_32( ty0, 7 );
x0 = _mm512_xor_si512( x0, x4 );
y0 = _mm512_xor_si512( y0, y4 );
x2 = mm512_rol_32( x0, 7 );
y2 = mm512_rol_32( y0, 7 );
x3 = mm512_rol_32( x1, 7 );
y3 = mm512_rol_32( y1, 7 );
x0 = _mm512_xor_si512( tx0, x4 );
y0 = _mm512_xor_si512( ty0, y4 );
x1 = _mm512_xor_si512( tx1, x5 );
y1 = _mm512_xor_si512( ty1, y5 );
x4 = mm512_swap128_64( x4 );
x3 = mm512_rol_32( tx1, 7 );
y3 = mm512_rol_32( ty1, 7 );
y4 = mm512_swap128_64( y4 );
x1 = _mm512_xor_si512( x1, x5 );
y1 = _mm512_xor_si512( y1, y5 );
x5 = mm512_swap128_64( x5 );
y5 = mm512_swap128_64( y5 );
x2 = _mm512_xor_si512( x2, x6 );
y2 = _mm512_xor_si512( y2, y6 );
y5 = mm512_swap128_64( y5 );
x3 = _mm512_xor_si512( x3, x7 );
y3 = _mm512_xor_si512( y3, y7 );
x6 = mm512_swap128_64( x6 );
y6 = mm512_swap128_64( y6 );
x7 = mm512_swap128_64( x7 );
y7 = mm512_swap128_64( y7 );
x4 = _mm512_add_epi32( x0, x4 );
y4 = _mm512_add_epi32( y0, y4 );
y6 = mm512_swap128_64( y6 );
x5 = _mm512_add_epi32( x1, x5 );
y5 = _mm512_add_epi32( y1, y5 );
x7 = mm512_swap128_64( x7 );
tx0 = mm512_rol_32( x1, 11 );
ty0 = mm512_rol_32( y1, 11 );
tx1 = mm512_rol_32( x3, 11 );
ty1 = mm512_rol_32( y3, 11 );
x6 = _mm512_add_epi32( x2, x6 );
y6 = _mm512_add_epi32( y2, y6 );
tx0 = x0;
ty0 = y0;
y7 = mm512_swap128_64( y7 );
tx1 = x2;
ty1 = y2;
x0 = mm512_rol_32( x1, 11 );
y0 = mm512_rol_32( y1, 11 );
x7 = _mm512_add_epi32( x3, x7 );
y7 = _mm512_add_epi32( y3, y7 );
x1 = mm512_rol_32( tx0, 11 );
y1 = mm512_rol_32( ty0, 11 );
x0 = _mm512_xor_si512( x0, x4 );
x4 = mm512_swap64_32( x4 );
y0 = _mm512_xor_si512( y0, y4 );
x2 = mm512_rol_32( x3, 11 );
y4 = mm512_swap64_32( y4 );
y2 = mm512_rol_32( y3, 11 );
x1 = mm512_rol_32( x0, 11 );
y1 = mm512_rol_32( y0, 11 );
x3 = mm512_rol_32( x2, 11 );
y3 = mm512_rol_32( y2, 11 );
x0 = _mm512_xor_si512( tx0, x4 );
y0 = _mm512_xor_si512( ty0, y4 );
x1 = _mm512_xor_si512( x1, x5 );
x5 = mm512_swap64_32( x5 );
y1 = _mm512_xor_si512( y1, y5 );
x3 = mm512_rol_32( tx1, 11 );
x4 = mm512_swap64_32( x4 );
y4 = mm512_swap64_32( y4 );
x5 = mm512_swap64_32( x5 );
y5 = mm512_swap64_32( y5 );
y3 = mm512_rol_32( ty1, 11 );
x2 = _mm512_xor_si512( x2, x6 );
x6 = mm512_swap64_32( x6 );
y2 = _mm512_xor_si512( y2, y6 );
y6 = mm512_swap64_32( y6 );
x2 = _mm512_xor_si512( tx1, x6 );
y2 = _mm512_xor_si512( ty1, y6 );
x3 = _mm512_xor_si512( x3, x7 );
x7 = mm512_swap64_32( x7 );
y3 = _mm512_xor_si512( y3, y7 );
x6 = mm512_swap64_32( x6 );
y6 = mm512_swap64_32( y6 );
x7 = mm512_swap64_32( x7 );
y7 = mm512_swap64_32( y7 );
}
@@ -241,14 +221,6 @@ int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
sp->rounds = rounds;
sp->pos = 0;
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
h[ 3] = m512_const1_128( iv[3] );
h[ 4] = m512_const1_128( iv[4] );
h[ 5] = m512_const1_128( iv[5] );
h[ 6] = m512_const1_128( iv[6] );
h[ 7] = m512_const1_128( iv[7] );
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
@@ -489,33 +461,29 @@ static void transform_2way( cube_2way_context *sp )
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x0;
y1 = x1;
ROL2( x0, x1, x2, x3, 7 );
ROL2( x2, x3, y0, y1, 7 );
x0 = _mm256_xor_si256( x0, x4 );
ROL2( y0, y1, x2, x3, 7 );
ROL2( x2, x3, x0, x1, 7 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( y1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap128_64( x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x5 = mm256_swap128_64( x5 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = _mm256_add_epi32( x0, x4 );
x6 = mm256_swap128_64( x6 );
y0 = x0;
x5 = _mm256_add_epi32( x1, x5 );
x7 = mm256_swap128_64( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
y1 = x2;
ROL2( x0, x1, x1, y0, 11 );
x7 = _mm256_add_epi32( x3, x7 );
ROL2( x2, x3, x3, y1, 11 );
x0 = _mm256_xor_si256( x0, x4 );
x4 = mm256_swap64_32( x4 );
ROL2( y0, x1, x1, x0, 11 );
ROL2( y1, x3, x3, x2, 11 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x5 = mm256_swap64_32( x5 );
x2 = _mm256_xor_si256( x2, x6 );
x6 = mm256_swap64_32( x6 );
x2 = _mm256_xor_si256( y1, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_32( x4 );
x5 = mm256_swap64_32( x5 );
x6 = mm256_swap64_32( x6 );
x7 = mm256_swap64_32( x7 );
}
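As in the 4-way transform, the rotations now write the temporaries directly instead of saving the inputs first, eliminating one register copy per pair. In scalar terms the renaming is:

/* before:  y0 = x0;  x0 = ROL( x2, 7 );  x2 = ROL( y0, 7 );  x0 ^= x4;
   after:   y0 = ROL( x2, 7 );  x2 = ROL( x0, 7 );  x0 = y0 ^ x4;     */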
@@ -540,14 +508,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
sp->rounds = rounds;
sp->pos = 0;
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
@@ -560,7 +520,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
return 0;
}
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
{
const int len = size >> 4;

View File

@@ -15,11 +15,11 @@
struct _cubehashParam
{
__m128i _ALIGN(64) x[8]; // aligned for __m512i
int hashlen; // __m128i
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
__m128i _ALIGN(64) x[8]; // aligned for __m256i
};
typedef struct _cubehashParam cubehashParam;

View File

@@ -156,14 +156,12 @@ int groestl512_full( hashState_groestl* ctx, void* output,
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
@@ -175,8 +173,8 @@ int groestl512_full( hashState_groestl* ctx, void* output,
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---

View File

@@ -227,12 +227,10 @@ int groestl256_full( hashState_groestl256* ctx,
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256( ctx->chaining );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m128i* in = (__m128i*)input;
@@ -245,7 +243,7 @@ int groestl256_full( hashState_groestl256* ctx,
// Cryptonight has a 200 byte input, which is not a whole number of
// __m128i; the remainder is only 8 bytes, i.e. a u64.
if ( databitlen % 128 !=0 )
if ( databitlen % 128 != 0 )
{
// must be cryptonight, copy 64 bits of data
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
@@ -255,8 +253,8 @@ int groestl256_full( hashState_groestl256* ctx,
{
// Copy any remaining data to buffer for final transform
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
}
//--- final ---

View File

@@ -50,7 +50,6 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
const int len = (int)datalen >> 4;
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE256;
__m512i* in = (__m512i*)input;
int i;
@@ -67,7 +66,6 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -76,11 +74,10 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
// copy any remaining data to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---
@@ -206,7 +203,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
const int len = (int)datalen >> 4;
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE256;
__m256i* in = (__m256i*)input;
int i;
@@ -223,7 +219,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -232,11 +227,10 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
TF512_2way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
// copy any remaining data to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---

View File

@@ -99,7 +99,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
memset_zero_512( ctx->buffer, SIZE512 );
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -108,8 +107,7 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
i += ctx->rem_ptr;
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// --- close ---
@@ -222,7 +220,6 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
memset_zero_256( ctx->buffer, SIZE512 );
ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -231,8 +228,7 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
i += ctx->rem_ptr;
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// --- close ---

View File

@@ -13,8 +13,7 @@
#if defined (ALLIUM_16WAY)
typedef struct {
blake256_16way_context blake;
typedef union {
keccak256_8way_context keccak;
cube_4way_2buf_context cube;
skein256_8way_context skein;
@@ -25,41 +24,31 @@ typedef struct {
#endif
} allium_16way_ctx_holder;
static __thread allium_16way_ctx_holder allium_16way_ctx;
bool init_allium_16way_ctx()
{
keccak256_8way_init( &allium_16way_ctx.keccak );
skein256_8way_init( &allium_16way_ctx.skein );
return true;
}
void allium_16way_hash( void *state, const void *input )
static void allium_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t hash8[8] __attribute__ ((aligned (32)));
uint32_t hash9[8] __attribute__ ((aligned (32)));
uint32_t hash10[8] __attribute__ ((aligned (32)));
uint32_t hash11[8] __attribute__ ((aligned (32)));
uint32_t hash12[8] __attribute__ ((aligned (32)));
uint32_t hash13[8] __attribute__ ((aligned (32)));
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -69,6 +58,7 @@ void allium_16way_hash( void *state, const void *input )
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhashA);
keccak256_8way_init( &ctx.keccak );
@@ -151,6 +141,7 @@ void allium_16way_hash( void *state, const void *input )
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashA, 32 );
skein256_8way_close( &ctx.skein, vhashA );
skein256_8way_init( &ctx.skein );
@@ -198,6 +189,7 @@ void allium_16way_hash( void *state, const void *input )
groestl256_full( &ctx.groestl, state+416, hash13, 256 );
groestl256_full( &ctx.groestl, state+448, hash14, 256 );
groestl256_full( &ctx.groestl, state+480, hash15, 256 );
#endif
}
@@ -205,35 +197,72 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Prehash first block.
blake256_transform_le( phash, pdata, 512, 0 );
blake256_16way_init( &allium_16way_ctx.blake );
blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
// Interleave hash for second block prehash.
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces, and add padding.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
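// This is standard Blake-256 padding for an 80 byte message: 0x80000000
// terminates the data, word 13 carries the 1 bit that precedes the
// length, and word 15 is the message length in bits, 80*8 = 640.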
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_16way_hash( hash, vdata );
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, hash+(lane<<3), mythr );
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
@@ -243,8 +272,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
#elif defined (ALLIUM_8WAY)
typedef struct {
blake256_8way_context blake;
typedef union {
keccak256_4way_context keccak;
cube_2way_context cube;
skein256_4way_context skein;
@@ -255,19 +283,11 @@ typedef struct {
#endif
} allium_8way_ctx_holder;
static __thread allium_8way_ctx_holder allium_8way_ctx;
bool init_allium_8way_ctx()
{
keccak256_4way_init( &allium_8way_ctx.keccak );
skein256_4way_init( &allium_8way_ctx.skein );
return true;
}
void allium_8way_hash( void *hash, const void *input )
static void allium_8way_hash( void *hash, const void *midstate_vars,
const void *midhash, const void *block )
{
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (32)));
uint64_t *hash0 = (uint64_t*)hash;
uint64_t *hash1 = (uint64_t*)hash+ 4;
uint64_t *hash2 = (uint64_t*)hash+ 8;
@@ -278,15 +298,14 @@ void allium_8way_hash( void *hash, const void *input )
uint64_t *hash7 = (uint64_t*)hash+28;
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhashA );
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
keccak256_4way_close( &ctx.keccak, vhashA );
keccak256_4way_init( &ctx.keccak );
@@ -305,7 +324,6 @@ void allium_8way_hash( void *hash, const void *input )
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
intrlv_2x128( vhashA, hash0, hash1, 256 );
intrlv_2x128( vhashB, hash2, hash3, 256 );
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
@@ -332,6 +350,7 @@ void allium_8way_hash( void *hash, const void *input )
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashA, 32 );
skein256_4way_close( &ctx.skein, vhashA );
skein256_4way_init( &ctx.skein );
@@ -340,8 +359,8 @@ void allium_8way_hash( void *hash, const void *input )
#if defined(__VAES__)
uint64_t vhashC[4*2] __attribute__ ((aligned (64)));
uint64_t vhashD[4*2] __attribute__ ((aligned (64)));
uint64_t vhashC[4*2] __attribute__ ((aligned (32)));
uint64_t vhashD[4*2] __attribute__ ((aligned (32)));
rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 );
groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
@@ -376,36 +395,72 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_8way_hash( hash, vdata );
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 8; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;

View File

@@ -132,11 +132,11 @@ bool register_lyra2z_algo( algo_gate_t* gate )
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
gate->hash = (void*)&lyra2z_16way_hash;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
@@ -175,13 +175,9 @@ bool register_lyra2h_algo( algo_gate_t* gate )
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_16WAY)
gate->miner_thread_init = (void*)&init_allium_16way_ctx;
gate->scanhash = (void*)&scanhash_allium_16way;
gate->hash = (void*)&allium_16way_hash;
#elif defined (ALLIUM_8WAY)
gate->miner_thread_init = (void*)&init_allium_8way_ctx;
gate->scanhash = (void*)&scanhash_allium_8way;
gate->hash = (void*)&allium_8way_hash;
#else
gate->miner_thread_init = (void*)&init_allium_ctx;
gate->scanhash = (void*)&scanhash_allium;

View File

@@ -99,14 +99,14 @@ bool init_lyra2rev2_ctx();
#if defined(LYRA2Z_16WAY)
void lyra2z_16way_hash( void *state, const void *input );
//void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_16way_thread_init();
#elif defined(LYRA2Z_8WAY)
void lyra2z_8way_hash( void *state, const void *input );
//void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_8way_thread_init();
@@ -163,17 +163,13 @@ bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_16WAY)
void allium_16way_hash( void *state, const void *input );
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_16way_ctx();
#elif defined(ALLIUM_8WAY)
void allium_8way_hash( void *state, const void *input );
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_8way_ctx();
#else

View File

@@ -14,38 +14,28 @@ bool lyra2z_16way_thread_init()
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_16way_context l2z_16way_blake_mid;
void lyra2z_16way_midstate( const void* input )
{
blake256_16way_init( &l2z_16way_blake_mid );
blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
}
void lyra2z_16way_hash( void *state, const void *input )
static void lyra2z_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t hash8[8] __attribute__ ((aligned (32)));
uint32_t hash9[8] __attribute__ ((aligned (32)));
uint32_t hash10[8] __attribute__ ((aligned (32)));
uint32_t hash11[8] __attribute__ ((aligned (32)));
uint32_t hash12[8] __attribute__ ((aligned (32)));
uint32_t hash13[8] __attribute__ ((aligned (32)));
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
blake256_16way_update( &ctx_blake, input + (64*16), 16 );
blake256_16way_close( &ctx_blake, vhash );
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -97,40 +87,74 @@ void lyra2z_16way_hash( void *state, const void *input )
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (64))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
if ( bench ) ptarget[7] = 0x0000ff;
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
lyra2z_16way_midstate( vdata );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_16way_hash( hash, vdata );
for ( int lane = 0; lane < 16; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr );
}
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
@@ -145,30 +169,20 @@ bool lyra2z_8way_thread_init()
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
static void lyra2z_8way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way_update( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -182,7 +196,6 @@ void lyra2z_8way_hash( void *state, const void *input )
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
@@ -197,43 +210,78 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
lyra2z_8way_midstate( vdata );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_8way_hash( hash, vdata );
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 8; lane++ )
{
for ( int lane = 0; lane < 8; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
}
n += 8;
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(LYRA2Z_4WAY)

View File

@@ -150,12 +150,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G_2X64( s1, s3, s5, s7 ); \
mm128_vrol256_64( s6, s7 ); \
mm128_vror256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
mm128_vror256_64( s6, s7 ); \
mm128_vrol256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 );
mm128_vrol256_64( s2, s3 );
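/* Passing s5 and s4 to G_2X64 in exchanged positions swaps the lanes
   logically, removing the two physical mm128_swap256_128 calls the
   round previously needed. */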
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -64,14 +64,14 @@ extern void hmq1725_8way_hash(void *state, const void *input)
uint32_t vhashA[16<<3] __attribute__ ((aligned (64)));
uint32_t vhashB[16<<3] __attribute__ ((aligned (64)));
uint32_t vhashC[16<<3] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (64)));
uint32_t hash1 [16] __attribute__ ((aligned (64)));
uint32_t hash2 [16] __attribute__ ((aligned (64)));
uint32_t hash3 [16] __attribute__ ((aligned (64)));
uint32_t hash4 [16] __attribute__ ((aligned (64)));
uint32_t hash5 [16] __attribute__ ((aligned (64)));
uint32_t hash6 [16] __attribute__ ((aligned (64)));
uint32_t hash7 [16] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (32)));
uint32_t hash1 [16] __attribute__ ((aligned (32)));
uint32_t hash2 [16] __attribute__ ((aligned (32)));
uint32_t hash3 [16] __attribute__ ((aligned (32)));
uint32_t hash4 [16] __attribute__ ((aligned (32)));
uint32_t hash5 [16] __attribute__ ((aligned (32)));
uint32_t hash6 [16] __attribute__ ((aligned (32)));
uint32_t hash7 [16] __attribute__ ((aligned (32)));
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
__mmask8 vh_mask;
const __m512i vmask = m512_const1_64( 24 );
@@ -639,13 +639,13 @@ typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
extern void hmq1725_4way_hash(void *state, const void *input)
{
uint32_t hash0 [16] __attribute__ ((aligned (64)));
uint32_t hash1 [16] __attribute__ ((aligned (64)));
uint32_t hash2 [16] __attribute__ ((aligned (64)));
uint32_t hash3 [16] __attribute__ ((aligned (64)));
uint32_t vhash [16<<2] __attribute__ ((aligned (64)));
uint32_t vhashA[16<<2] __attribute__ ((aligned (64)));
uint32_t vhashB[16<<2] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (32)));
uint32_t hash1 [16] __attribute__ ((aligned (32)));
uint32_t hash2 [16] __attribute__ ((aligned (32)));
uint32_t hash3 [16] __attribute__ ((aligned (32)));
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
__m256i vh_mask;
int h_mask;

View File

@@ -16,7 +16,8 @@
#if defined (X16R_8WAY)
// Perform midstate prehash of hash functions with block size <= 72 bytes.
// Perform midstate prehash of hash functions with block size <= 72 bytes,
// or 76 bytes for hash functions that operate on 32 bit data.
void x16r_8way_prehash( void *vdata, void *pdata )
{
@@ -44,18 +45,36 @@ void x16r_8way_prehash( void *vdata, void *pdata )
skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
break;
case LUFFA:
{
hashState_luffa ctx_luffa;
mm128_bswap32_80( edata, pdata );
intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
luffa_4way_init( &x16r_ctx.luffa, 512 );
luffa_4way_update( &x16r_ctx.luffa, vdata2, 64 );
rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
init_luffa( &ctx_luffa, 512 );
update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
intrlv_4x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
ctx_luffa.buffer, ctx_luffa.buffer, ctx_luffa.buffer, 512 );
intrlv_4x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
ctx_luffa.chainv, ctx_luffa.chainv, ctx_luffa.chainv, 1280 );
x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
}
break;
case CUBEHASH:
{
cubehashParam ctx_cube;
mm128_bswap32_80( edata, pdata );
intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
cube_4way_init( &x16r_ctx.cube, 512, 16, 32 );
cube_4way_update( &x16r_ctx.cube, vdata2, 64 );
rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
cubehashInit( &ctx_cube, 512, 16, 32 );
cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
x16r_ctx.cube.hashlen = ctx_cube.hashlen;
x16r_ctx.cube.rounds = ctx_cube.rounds;
x16r_ctx.cube.blocksize = ctx_cube.blocksize;
x16r_ctx.cube.pos = ctx_cube.pos;
intrlv_4x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, ctx_cube.x,
ctx_cube.x, 1024 );
}
break;
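// Both cases follow the same pattern: run the scalar update once to
// compute the 64 byte midstate, then interleave the resulting state
// into every lane of the vectored context, instead of running the
// wide update on duplicated input lanes.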
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
@@ -94,14 +113,14 @@ void x16r_8way_prehash( void *vdata, void *pdata )
int x16r_8way_hash_generic( void* output, const void* input, int thrid )
{
uint32_t vhash[20*8] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t hash4[20] __attribute__ ((aligned (64)));
uint32_t hash5[20] __attribute__ ((aligned (64)));
uint32_t hash6[20] __attribute__ ((aligned (64)));
uint32_t hash7[20] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (16)));
uint32_t hash1[20] __attribute__ ((aligned (16)));
uint32_t hash2[20] __attribute__ ((aligned (16)));
uint32_t hash3[20] __attribute__ ((aligned (16)));
uint32_t hash4[20] __attribute__ ((aligned (16)));
uint32_t hash5[20] __attribute__ ((aligned (16)));
uint32_t hash6[20] __attribute__ ((aligned (16)));
uint32_t hash7[20] __attribute__ ((aligned (16)));
x16r_8way_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -476,7 +495,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -500,7 +519,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}
x16r_8way_prehash( vdata, pdata );
@@ -552,18 +571,33 @@ void x16r_4way_prehash( void *vdata, void *pdata )
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
break;
case LUFFA:
{
hashState_luffa ctx_luffa;
mm128_bswap32_80( edata, pdata );
intrlv_2x128( vdata2, edata, edata, 640 );
luffa_2way_init( &x16r_ctx.luffa, 512 );
luffa_2way_update( &x16r_ctx.luffa, vdata2, 64 );
rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
break;
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
init_luffa( &ctx_luffa, 512 );
update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
intrlv_2x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
ctx_luffa.buffer, 512 );
intrlv_2x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
ctx_luffa.chainv, 1280 );
x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
}
break;
case CUBEHASH:
{
cubehashParam ctx_cube;
mm128_bswap32_80( edata, pdata );
intrlv_2x128( vdata2, edata, edata, 640 );
cube_2way_init( &x16r_ctx.cube, 512, 16, 32 );
cube_2way_update( &x16r_ctx.cube, vdata2, 64 );
rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
cubehashInit( &ctx_cube, 512, 16, 32 );
cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
x16r_ctx.cube.hashlen = ctx_cube.hashlen;
x16r_ctx.cube.rounds = ctx_cube.rounds;
x16r_ctx.cube.blocksize = ctx_cube.blocksize;
x16r_ctx.cube.pos = ctx_cube.pos;
intrlv_2x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, 1024 );
}
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
@@ -596,10 +630,10 @@ void x16r_4way_prehash( void *vdata, void *pdata )
int x16r_4way_hash_generic( void* output, const void* input, int thrid )
{
uint32_t vhash[20*4] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
uint32_t hash2[20] __attribute__ ((aligned (32)));
uint32_t hash3[20] __attribute__ ((aligned (32)));
x16r_4way_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -890,7 +924,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -913,7 +947,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}
x16r_4way_prehash( vdata, pdata );

View File

@@ -30,8 +30,8 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}
@@ -84,8 +84,8 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}

View File

@@ -45,14 +45,14 @@ static __thread x16rv2_8way_context_overlay x16rv2_ctx;
int x16rv2_8way_hash( void* output, const void* input, int thrid )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t hash4[24] __attribute__ ((aligned (64)));
uint32_t hash5[24] __attribute__ ((aligned (64)));
uint32_t hash6[24] __attribute__ ((aligned (64)));
uint32_t hash7[24] __attribute__ ((aligned (64)));
uint32_t hash0[24] __attribute__ ((aligned (32)));
uint32_t hash1[24] __attribute__ ((aligned (32)));
uint32_t hash2[24] __attribute__ ((aligned (32)));
uint32_t hash3[24] __attribute__ ((aligned (32)));
uint32_t hash4[24] __attribute__ ((aligned (32)));
uint32_t hash5[24] __attribute__ ((aligned (32)));
uint32_t hash6[24] __attribute__ ((aligned (32)));
uint32_t hash7[24] __attribute__ ((aligned (32)));
x16rv2_8way_context_overlay ctx;
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -706,11 +706,11 @@ inline void padtiger512( uint32_t* hash )
int x16rv2_4way_hash( void* output, const void* input, int thrid )
{
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t vhash[20*4] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
uint32_t hash2[20] __attribute__ ((aligned (32)));
uint32_t hash3[20] __attribute__ ((aligned (32)));
x16rv2_4way_context_overlay ctx;
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -1054,8 +1054,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t edata[20];
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -1068,7 +1068,6 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0fff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );

View File

@@ -63,14 +63,14 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
uint64_t hash4[8] __attribute__ ((aligned (32)));
uint64_t hash5[8] __attribute__ ((aligned (32)));
uint64_t hash6[8] __attribute__ ((aligned (32)));
uint64_t hash7[8] __attribute__ ((aligned (32)));
sonoa_8way_context_overlay ctx;
// 1
@@ -1150,13 +1150,13 @@ typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
int sonoa_4way_hash( void *state, const void *input, int thr_id )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
sonoa_4way_context_overlay ctx;
// 1

View File

@@ -58,23 +58,27 @@ union _x17_8way_context_overlay
} __attribute__ ((aligned (64)));
typedef union _x17_8way_context_overlay x17_8way_context_overlay;
static __thread __m512i x17_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
int x17_8way_hash( void *state, const void *input, int thr_id )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
uint64_t hash4[8] __attribute__ ((aligned (32)));
uint64_t hash5[8] __attribute__ ((aligned (32)));
uint64_t hash6[8] __attribute__ ((aligned (32)));
uint64_t hash7[8] __attribute__ ((aligned (32)));
x17_8way_context_overlay ctx;
blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
x17_8way_midstate );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -122,9 +126,6 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
#if defined(__VAES__)
shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
@@ -237,6 +238,61 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
return 1;
}
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash32[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = &(hash32[7*8]);
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m512i eight = m512_const1_64( 8 );
const bool bench = opt_benchmark;
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = mm512_intrlv_blend_32( *noncev,
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
0, n+3, 0, n+2, 0, n+1, 0, n ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
do
{
if ( likely( x17_8way_hash( hash32, vdata, thr_id ) ) )
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
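The new scanhash pre-swaps the header to little-endian once, broadcasts it to all eight lanes, and absorbs the nonce-invariant words with blake512_8way_prehash_le; x17_8way_hash then only finishes the first Blake-512 block per nonce via blake512_8way_final_le. A scalar sketch of this split, with compress_prefix and compress_last as hypothetical stand-ins for the prehash/final pair:

#include <stdint.h>
#include <string.h>

typedef struct { uint64_t h[8]; } mid_t;

/* hypothetical stand-ins for blake512_8way_prehash_le / _final_le */
extern void compress_prefix( mid_t *s, const uint64_t hdr[10] );
extern void compress_last( mid_t *s, uint64_t last_word );

static mid_t mid;                     /* midstate, computed once per job */

void job_prehash( const uint64_t hdr[10] )      /* 80-byte block header */
{
    compress_prefix( &mid, hdr );       /* absorb nonce-invariant words */
}

void nonce_hash( uint64_t out[8], uint64_t word9_with_nonce )
{
    mid_t s = mid;                              /* cheap copy per nonce */
    compress_last( &s, word9_with_nonce );
    memcpy( out, s.h, sizeof s.h );
}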
#elif defined(X17_4WAY)
union _x17_4way_context_overlay
@@ -271,10 +327,10 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
x17_4way_context_overlay ctx;
blake512_4way_full( &ctx.blake, vhash, input, 80 );

View File

@@ -3,7 +3,7 @@
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_8WAY)
gate->scanhash = (void*)&scanhash_8way_64in_32out;
gate->scanhash = (void*)&scanhash_x17_8way;
gate->hash = (void*)&x17_8way_hash;
#elif defined (X17_4WAY)
gate->scanhash = (void*)&scanhash_4way_64in_32out;

View File

@@ -14,10 +14,15 @@ bool register_x17_algo( algo_gate_t* gate );
#if defined(X17_8WAY)
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int x17_8way_hash( void *state, const void *input, int thr_id );
#elif defined(X17_4WAY)
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int x17_4way_hash( void *state, const void *input, int thr_id );
#endif

View File

@@ -62,14 +62,14 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t hash4[16] __attribute__ ((aligned (64)));
uint64_t hash5[16] __attribute__ ((aligned (64)));
uint64_t hash6[16] __attribute__ ((aligned (64)));
uint64_t hash7[16] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (32)));
uint64_t hash1[16] __attribute__ ((aligned (32)));
uint64_t hash2[16] __attribute__ ((aligned (32)));
uint64_t hash3[16] __attribute__ ((aligned (32)));
uint64_t hash4[16] __attribute__ ((aligned (32)));
uint64_t hash5[16] __attribute__ ((aligned (32)));
uint64_t hash6[16] __attribute__ ((aligned (32)));
uint64_t hash7[16] __attribute__ ((aligned (32)));
const int dataLen = 128;
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
@@ -430,13 +430,13 @@ typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;
int xevan_4way_hash( void *output, const void *input, int thr_id )
{
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashA[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<2] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (32)));
uint64_t hash1[16] __attribute__ ((aligned (32)));
uint64_t hash2[16] __attribute__ ((aligned (32)));
uint64_t hash3[16] __attribute__ ((aligned (32)));
const int dataLen = 128;
xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));

View File

@@ -21,7 +21,6 @@
#include "algo/tiger/sph_tiger.h"
#include "algo/lyra2/lyra2.h"
#include "algo/gost/sph_gost.h"
#include "algo/swifftx/swifftx.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"

View File

@@ -50,6 +50,7 @@ bool register_x25x_algo( algo_gate_t* gate )
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
AVX512_OPT | VAES_OPT;
InitializeSWIFFTX();
return true;
};

View File

@@ -5,6 +5,7 @@
#include "simd-utils.h"
#include <stdint.h>
#include <unistd.h>
#include "algo/swifftx/swifftx.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X22I_8WAY 1

View File

@@ -24,7 +24,6 @@
#include "algo/tiger/sph_tiger.h"
#include "algo/lyra2/lyra2.h"
#include "algo/gost/sph_gost.h"
#include "algo/swifftx/swifftx.h"
#include "algo/panama/panama-hash-4way.h"
#include "algo/lanehash/lane.h"
#if defined(__VAES__)
@@ -102,6 +101,9 @@ union _x25x_8way_ctx_overlay
};
typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay;
static __thread __m512i x25x_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
int x25x_8way_hash( void *output, const void *input, int thrid )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
@@ -118,9 +120,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64)));
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
x25x_8way_midstate );
dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0],
hash4[0], hash5[0], hash6[0], hash7[0], vhash );
@@ -271,7 +273,6 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10],
hash4[10], hash5[10], hash6[10], hash7[10] );
#else
init_echo( &ctx.echo, 512 );
@@ -558,6 +559,7 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &(hash[7*8]);
uint32_t *pdata = work->data;
@@ -569,15 +571,22 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const uint32_t targ32 = ptarget[7];
const bool bench = opt_benchmark;
const __m512i eight = m512_const1_64( 8 );
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = mm512_intrlv_blend_32( *noncev,
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
0, n+3, 0, n+2, 0, n+1, 0, n ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x25x_8way_hash( hash, vdata, thr_id ) )
@@ -588,12 +597,11 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -637,8 +645,12 @@ union _x25x_4way_ctx_overlay
panama_4way_context panama;
blake2s_4way_state blake2s;
};
typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay;
static __thread __m256i x25x_4way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
int x25x_4way_hash( void *output, const void *input, int thrid )
{
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
@@ -651,7 +663,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));
blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
x25x_4way_midstate );
dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );
bmw512_4way_init( &ctx.bmw );
@@ -905,6 +919,7 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &(hash[ 7*4 ]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -914,15 +929,23 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32 = ptarget[7];
const __m256i four = m256_const1_64( 4 );
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
mm256_intrlv80_4x64( vdata, edata );
*noncev = mm256_intrlv_blend_32( *noncev,
_mm256_set_epi32( 0, n+3, 0, n+2, 0, n+1, 0, n ) );
blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata );
do
{
if ( x25x_4way_hash( hash, vdata, thr_id ) )
@@ -932,12 +955,11 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
*noncev = _mm256_add_epi32( *noncev, four );
n += 4;
} while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -1,323 +0,0 @@
/*
* Copyright 2009 Colin Percival, 2014 savale
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include <algo/yespower/crypto/sph_types.h>
#include "blake2b-yp.h"
// Cyclic right rotation.
//#ifndef ROTR64
//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
//#endif
#define ROTR64(x, y) ror64( x, y )
// Little-endian byte access.
#define B2B_GET64(p) \
(((uint64_t) ((uint8_t *) (p))[0]) ^ \
(((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
(((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
(((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
(((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
(((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
(((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
(((uint64_t) ((uint8_t *) (p))[7]) << 56))
// G Mixing function.
#define B2B_G(a, b, c, d, x, y) { \
v[a] = v[a] + v[b] + x; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 24); \
v[a] = v[a] + v[b] + y; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 63); }
// Initialization Vector.
static const uint64_t blake2b_iv[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
// Compression function. "last" flag indicates last block.
static void blake2b_compress(blake2b_yp_ctx *ctx, int last)
{
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
uint64_t v[16], m[16];
// init work variables
for (i = 0; i < 8; i++) {
v[i] = ctx->h[i];
v[i + 8] = blake2b_iv[i];
}
v[12] ^= ctx->t[0]; // low 64 bits of offset
v[13] ^= ctx->t[1]; // high 64 bits
// last block flag set ?
if (last) {
v[14] = ~v[14];
}
// get little-endian words
for (i = 0; i < 16; i++) {
m[i] = B2B_GET64(&ctx->b[8 * i]);
}
// twelve rounds
for (i = 0; i < 12; i++) {
B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
}
for(i = 0; i < 8; ++i) {
ctx->h[i] ^= v[i] ^ v[i + 8];
}
}
// Initialize the hashing context "ctx" with optional key "key".
// 1 <= outlen <= 64 gives the digest size in bytes.
// Secret key (also <= 64 bytes) is optional (keylen = 0).
int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen,
const void *key, size_t keylen) // (keylen=0: no key)
{
size_t i;
// illegal parameters
if (outlen == 0 || outlen > 64 || keylen > 64) {
return -1;
}
// state, "param block"
for (i = 0; i < 8; i++) {
ctx->h[i] = blake2b_iv[i];
}
ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
ctx->t[0] = 0; // input count low word
ctx->t[1] = 0; // input count high word
ctx->c = 0; // pointer within buffer
ctx->outlen = outlen;
// zero input block
for (i = keylen; i < 128; i++) {
ctx->b[i] = 0;
}
if (keylen > 0) {
blake2b_yp_update(ctx, key, keylen);
ctx->c = 128; // at the end
}
return 0;
}
// Add "inlen" bytes from "in" into the hash.
void blake2b_yp_update(blake2b_yp_ctx *ctx,
const void *in, size_t inlen) // data bytes
{
size_t i;
for (i = 0; i < inlen; i++) {
if (ctx->c == 128) { // buffer full ?
ctx->t[0] += ctx->c; // add counters
if (ctx->t[0] < ctx->c) // carry overflow ?
ctx->t[1]++; // high word
blake2b_compress(ctx, 0); // compress (not last)
ctx->c = 0; // counter to zero
}
ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
}
}
// Generate the message digest (size given in init).
// Result placed in "out".
void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out)
{
size_t i;
ctx->t[0] += ctx->c; // mark last block offset
// carry overflow
if (ctx->t[0] < ctx->c) {
ctx->t[1]++; // high word
}
// fill up with zeros
while (ctx->c < 128) {
ctx->b[ctx->c++] = 0;
}
blake2b_compress(ctx, 1); // final block flag = 1
// little endian convert and store
for (i = 0; i < ctx->outlen; i++) {
((uint8_t *) out)[i] =
(ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
}
}
// inlen = number of bytes
void blake2b_yp_hash(void *out, const void *in, size_t inlen) {
blake2b_yp_ctx ctx;
blake2b_yp_init(&ctx, 32, NULL, 0);
blake2b_yp_update(&ctx, in, inlen);
blake2b_yp_final(&ctx, out);
}
// keylen = number of bytes
void hmac_blake2b_yp_init(hmac_yp_ctx *hctx, const void *_key, size_t keylen) {
const uint8_t *key = _key;
uint8_t keyhash[32];
uint8_t pad[64];
uint64_t i;
if (keylen > 64) {
blake2b_yp_hash(keyhash, key, keylen);
key = keyhash;
keylen = 32;
}
blake2b_yp_init(&hctx->inner, 32, NULL, 0);
memset(pad, 0x36, 64);
for (i = 0; i < keylen; ++i) {
pad[i] ^= key[i];
}
blake2b_yp_update(&hctx->inner, pad, 64);
blake2b_yp_init(&hctx->outer, 32, NULL, 0);
memset(pad, 0x5c, 64);
for (i = 0; i < keylen; ++i) {
pad[i] ^= key[i];
}
blake2b_yp_update(&hctx->outer, pad, 64);
memset(keyhash, 0, 32);
}
// datalen = number of bytes
void hmac_blake2b_yp_update(hmac_yp_ctx *hctx, const void *data, size_t datalen) {
// update the inner state
blake2b_yp_update(&hctx->inner, data, datalen);
}
void hmac_blake2b_yp_final(hmac_yp_ctx *hctx, uint8_t *digest) {
uint8_t ihash[32];
blake2b_yp_final(&hctx->inner, ihash);
blake2b_yp_update(&hctx->outer, ihash, 32);
blake2b_yp_final(&hctx->outer, digest);
memset(ihash, 0, 32);
}
// keylen = number of bytes; inlen = number of bytes
void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen) {
hmac_yp_ctx hctx;
hmac_blake2b_yp_init(&hctx, key, keylen);
hmac_blake2b_yp_update(&hctx, in, inlen);
hmac_blake2b_yp_final(&hctx, out);
}
void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
hmac_yp_ctx PShctx, hctx;
size_t i;
uint32_t ivec;
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Compute HMAC state after processing P and S. */
hmac_blake2b_yp_init(&PShctx, passwd, passwdlen);
hmac_blake2b_yp_update(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
ivec = bswap_32( i+1 );
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(hmac_yp_ctx));
hmac_blake2b_yp_update(&hctx, &ivec, 4);
hmac_blake2b_yp_final(&hctx, U);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
hmac_blake2b_yp_init(&hctx, passwd, passwdlen);
hmac_blake2b_yp_update(&hctx, U, 32);
hmac_blake2b_yp_final(&hctx, U);
/* ... xor U_j ... */
for (k = 0; k < 32; k++) {
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32) {
clen = 32;
}
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
memset(&PShctx, 0, sizeof(hmac_yp_ctx));
}

View File

@@ -1,42 +0,0 @@
#pragma once
#ifndef __BLAKE2B_H__
#define __BLAKE2B_H__
#include <stddef.h>
#include <stdint.h>
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
#define NATIVE_LITTLE_ENDIAN
#endif
// state context
typedef struct {
uint8_t b[128]; // input buffer
uint64_t h[8]; // chained state
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_yp_ctx;
typedef struct {
blake2b_yp_ctx inner;
blake2b_yp_ctx outer;
} hmac_yp_ctx;
#if defined(__cplusplus)
extern "C" {
#endif
int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen, const void *key, size_t keylen);
void blake2b_yp_update(blake2b_yp_ctx *ctx, const void *in, size_t inlen);
void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out);
void blake2b_yp_hash(void *out, const void *in, size_t inlen);
void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen);
void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen);
#if defined(__cplusplus)
}
#endif
#endif

View File

@@ -0,0 +1,150 @@
/*
* Copyright 2009 Colin Percival, 2014 savale
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "hmac-blake2b.h"
// keylen = number of bytes
void hmac_blake2b_init( hmac_blake2b_ctx *hctx, const void *_key,
size_t keylen )
{
const uint8_t *key = _key;
uint8_t keyhash[32];
uint8_t pad[64];
uint64_t i;
if (keylen > 64)
{
sph_blake2b_ctx ctx;
sph_blake2b_init( &ctx, 32, NULL, 0 );
sph_blake2b_update( &ctx, key, keylen );
sph_blake2b_final( &ctx, keyhash );
key = keyhash;
keylen = 32;
}
sph_blake2b_init( &hctx->inner, 32, NULL, 0 );
memset( pad, 0x36, 64 );
for ( i = 0; i < keylen; ++i )
pad[i] ^= key[i];
sph_blake2b_update( &hctx->inner, pad, 64 );
sph_blake2b_init( &hctx->outer, 32, NULL, 0 );
memset( pad, 0x5c, 64 );
for ( i = 0; i < keylen; ++i )
pad[i] ^= key[i];
sph_blake2b_update( &hctx->outer, pad, 64 );
memset( keyhash, 0, 32 );
}
// datalen = number of bytes
void hmac_blake2b_update( hmac_blake2b_ctx *hctx, const void *data,
size_t datalen )
{
// update the inner state
sph_blake2b_update( &hctx->inner, data, datalen );
}
void hmac_blake2b_final( hmac_blake2b_ctx *hctx, uint8_t *digest )
{
uint8_t ihash[32];
sph_blake2b_final( &hctx->inner, ihash );
sph_blake2b_update( &hctx->outer, ihash, 32 );
sph_blake2b_final( &hctx->outer, digest );
memset( ihash, 0, 32 );
}
// keylen = number of bytes; inlen = number of bytes
void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
const void *in, size_t inlen )
{
hmac_blake2b_ctx hctx;
hmac_blake2b_init( &hctx, key, keylen );
hmac_blake2b_update( &hctx, in, inlen );
hmac_blake2b_final( &hctx, out );
}
void pbkdf2_blake2b( const uint8_t *passwd, size_t passwdlen,
const uint8_t *salt, size_t saltlen, uint64_t c,
uint8_t *buf, size_t dkLen )
{
hmac_blake2b_ctx PShctx, hctx;
size_t i;
uint32_t ivec;
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Compute HMAC state after processing P and S. */
hmac_blake2b_init( &PShctx, passwd, passwdlen );
hmac_blake2b_update( &PShctx, salt, saltlen );
/* Iterate through the blocks. */
for ( i = 0; i * 32 < dkLen; i++ )
{
/* Generate INT(i + 1). */
ivec = bswap_32( i+1 );
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy( &hctx, &PShctx, sizeof(hmac_blake2b_ctx) );
hmac_blake2b_update( &hctx, &ivec, 4 );
hmac_blake2b_final( &hctx, U );
/* T_i = U_1 ... */
memcpy( T, U, 32 );
for ( j = 2; j <= c; j++ )
{
/* Compute U_j. */
hmac_blake2b_init( &hctx, passwd, passwdlen );
hmac_blake2b_update( &hctx, U, 32 );
hmac_blake2b_final( &hctx, U );
/* ... xor U_j ... */
for ( k = 0; k < 32; k++ )
T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy( &buf[i * 32], T, clen );
}
/* Clean PShctx, since we never called _Final on it. */
memset( &PShctx, 0, sizeof(hmac_blake2b_ctx) );
}
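A minimal usage sketch of the new API, assuming only the header above; the printed digest is whatever PBKDF2-BLAKE2b produces for these inputs:

#include <stdio.h>
#include <stdint.h>
#include "hmac-blake2b.h"

int main(void)
{
    const uint8_t passwd[] = "password";
    const uint8_t salt[]   = "salt";
    uint8_t dk[32];

    /* 1 iteration, 32-byte derived key, the same shape yespower_b2b uses */
    pbkdf2_blake2b( passwd, sizeof passwd - 1,
                    salt,   sizeof salt  - 1, 1, dk, sizeof dk );

    for ( int i = 0; i < 32; i++ ) printf( "%02x", dk[i] );
    printf( "\n" );
    return 0;
}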

View File

@@ -0,0 +1,34 @@
#pragma once
#ifndef __HMAC_BLAKE2B_H__
#define __HMAC_BLAKE2B_H__
#include <stddef.h>
#include <stdint.h>
#include "algo/blake/sph_blake2b.h"
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
#define NATIVE_LITTLE_ENDIAN
#endif
typedef struct
{
sph_blake2b_ctx inner;
sph_blake2b_ctx outer;
} hmac_blake2b_ctx;
#if defined(__cplusplus)
extern "C" {
#endif
void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
const void *in, size_t inlen );
void pbkdf2_blake2b( const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen, uint64_t c,
uint8_t * buf, size_t dkLen );
#if defined(__cplusplus)
}
#endif
#endif

File diff suppressed because it is too large

View File

@@ -95,7 +95,7 @@
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "crypto/blake2b-yp.h"
#include "crypto/hmac-blake2b.h"
#include "yespower.h"
#ifdef __unix__
@@ -1136,6 +1136,7 @@ int yespower_b2b(yespower_local_t *local,
salsa20_blk_t *V, *XY;
pwxform_ctx_t ctx;
uint8_t init_hash[32];
sph_blake2b_ctx blake2b_ctx;
/* Sanity-check parameters */
if ((N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
@@ -1167,7 +1168,9 @@ int yespower_b2b(yespower_local_t *local,
ctx.S0 = S;
ctx.S1 = S + Swidth_to_Sbytes1(Swidth);
blake2b_yp_hash(init_hash, src, srclen);
sph_blake2b_init( &blake2b_ctx, 32, NULL, 0 );
sph_blake2b_update( &blake2b_ctx, src, srclen );
sph_blake2b_final( &blake2b_ctx, init_hash );
ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth);
ctx.w = 0;
@@ -1181,7 +1184,7 @@ int yespower_b2b(yespower_local_t *local,
if ( work_restart[thrid].restart ) return false;
pbkdf2_blake2b_yp(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);
pbkdf2_blake2b(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);
if ( work_restart[thrid].restart ) return false;
@@ -1190,7 +1193,7 @@ int yespower_b2b(yespower_local_t *local,
if ( work_restart[thrid].restart ) return false;
hmac_blake2b_yp_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));
hmac_blake2b_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));
/* Success! */
return 1;

View File

@@ -249,7 +249,7 @@ bool register_power2b_algo( algo_gate_t* gate )
applog( LOG_NOTICE,"Key= \"%s\"", yespower_params.pers );
applog( LOG_NOTICE,"Key length= %d\n", yespower_params.perslen );
gate->optimizations = SSE2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_yespower_b2b;
gate->hash = (void*)&yespower_b2b_hash;
opt_target_factor = 65536.0;

configure
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.7.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.19.7'
PACKAGE_STRING='cpuminer-opt 3.19.7'
PACKAGE_VERSION='3.20.1'
PACKAGE_STRING='cpuminer-opt 3.20.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.19.7 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.20.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.19.7:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.20.1:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.19.7
cpuminer-opt configure 3.20.1
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.19.7, which was
It was created by cpuminer-opt $as_me 3.20.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.19.7'
VERSION='3.20.1'
cat >>confdefs.h <<_ACEOF
@@ -5820,6 +5820,34 @@ $as_echo "#define USE_AVX2 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
$as_echo_n "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define USE_AVX512 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
$as_echo "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
@@ -6690,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.19.7, which was
This file was extended by cpuminer-opt $as_me 3.20.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.19.7
cpuminer-opt config.status 3.20.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.19.7])
AC_INIT([cpuminer-opt], [3.20.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -93,6 +93,14 @@ then
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile AVX512 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
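When the probe succeeds, USE_AVX512 is defined; code can then gate 512-bit paths on it. A hedged sketch (the function is illustrative, not from the tree):

#include <immintrin.h>

#ifdef USE_AVX512
/* compiled only when configure verified the assembler accepts EVEX */
static inline __m512i add_8x64( const __m512i a, const __m512i b )
{   return _mm512_add_epi64( a, b );   }
#endif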

View File

@@ -1099,7 +1099,7 @@ void report_summary_log( bool force )
sprintf_et( et_str, et.tv_sec );
sprintf_et( upt_str, uptime.tv_sec );
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], rpc_url );
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
submit_rate, safe_div( (double)submitted_share_count*60.,
@@ -1300,6 +1300,7 @@ static int share_result( int result, struct work *work,
my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
bres, CL_N, share_time, latency );
/*
if ( unlikely( opt_debug || !result || solved ) )
{
if ( have_stratum )
@@ -1309,14 +1310,27 @@ static int share_result( int result, struct work *work,
applog2( LOG_INFO, "Diff %.5g, Block %d",
my_stats.share_diff, work ? work->height : last_block_height );
}
*/
if ( unlikely( !( opt_quiet || result || stale ) ) )
{
uint32_t str[8];
uint32_t *targ;
// uint32_t str[8];
// uint32_t *targ;
if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason );
if ( reason ) applog2( LOG_MINR, "Reject reason: %s", reason );
{
// The exact hash is not available here, only an imprecise
// approximation calculated from the share difficulty. It's useless
// for anything other than low diff rejects. Until and unless a
// solution is implemented to make the hash and target available,
// don't bother displaying them. In the meantime display the diff for
// low diff rejects.
if ( strstr( reason, "difficulty" ) )
applog2( LOG_MINR, "Share diff: %.5g, Target: %.5g",
my_stats.share_diff, my_stats.target_diff );
/*
diff_to_hash( str, my_stats.share_diff );
applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6],
str[5], str[4], str[3],str[2], str[1], str[0] );
@@ -1330,6 +1344,8 @@ static int share_result( int result, struct work *work,
}
applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6],
targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] );
*/
}
}
return 1;
}
@@ -2754,7 +2770,7 @@ static void *stratum_thread(void *userdata )
stratum.url = (char*) tq_pop(mythr->q, NULL);
if (!stratum.url)
goto out;
applog( LOG_BLUE, "Stratum connect %s", short_url );
applog( LOG_BLUE, "Stratum connect %s", stratum.url );
while (1)
{
@@ -3335,6 +3351,7 @@ void parse_arg(int key, char *arg )
if ( strncasecmp( arg, "http://", 7 )
&& strncasecmp( arg, "https://", 8 )
&& strncasecmp( arg, "stratum+tcp://", 14 )
&& strncasecmp( arg, "stratum+ssl://", 14 )
&& strncasecmp( arg, "stratum+tcps://", 15 ) )
{
fprintf(stderr, "unknown protocol -- '%s'\n", arg);
@@ -3768,6 +3785,7 @@ int main(int argc, char *argv[])
flags = CURL_GLOBAL_ALL;
if ( !opt_benchmark )
if ( strncasecmp( rpc_url, "https:", 6 )
&& strncasecmp( rpc_url, "stratum+ssl://", 14 )
&& strncasecmp( rpc_url, "stratum+tcps://", 15 ) )
flags &= ~CURL_GLOBAL_SSL;

View File

@@ -812,7 +812,7 @@ Options:\n\
lyra2z330 Lyra2 330 rows\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
minotaur Ringcoin (RNG)\n\
minotaur\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\

View File

@@ -508,6 +508,32 @@ static inline void mm128_bswap32_80( void *d, void *s )
#endif
static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
{
uint32_t *s = (uint32_t*)src;
casti_m128i( d, 0 ) = _mm_set1_epi32( bswap_32( s[ 0] ) );
casti_m128i( d, 1 ) = _mm_set1_epi32( bswap_32( s[ 1] ) );
casti_m128i( d, 2 ) = _mm_set1_epi32( bswap_32( s[ 2] ) );
casti_m128i( d, 3 ) = _mm_set1_epi32( bswap_32( s[ 3] ) );
casti_m128i( d, 4 ) = _mm_set1_epi32( bswap_32( s[ 4] ) );
casti_m128i( d, 5 ) = _mm_set1_epi32( bswap_32( s[ 5] ) );
casti_m128i( d, 6 ) = _mm_set1_epi32( bswap_32( s[ 6] ) );
casti_m128i( d, 7 ) = _mm_set1_epi32( bswap_32( s[ 7] ) );
casti_m128i( d, 8 ) = _mm_set1_epi32( bswap_32( s[ 8] ) );
casti_m128i( d, 9 ) = _mm_set1_epi32( bswap_32( s[ 9] ) );
casti_m128i( d,10 ) = _mm_set1_epi32( bswap_32( s[10] ) );
casti_m128i( d,11 ) = _mm_set1_epi32( bswap_32( s[11] ) );
casti_m128i( d,12 ) = _mm_set1_epi32( bswap_32( s[12] ) );
casti_m128i( d,13 ) = _mm_set1_epi32( bswap_32( s[13] ) );
casti_m128i( d,14 ) = _mm_set1_epi32( bswap_32( s[14] ) );
casti_m128i( d,15 ) = _mm_set1_epi32( bswap_32( s[15] ) );
casti_m128i( d,16 ) = _mm_set1_epi32( bswap_32( s[16] ) );
casti_m128i( d,17 ) = _mm_set1_epi32( bswap_32( s[17] ) );
casti_m128i( d,18 ) = _mm_set1_epi32( bswap_32( s[18] ) );
casti_m128i( d,19 ) = _mm_set1_epi32( bswap_32( s[19] ) );
}
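The function broadcasts each byte-swapped header word into all four 32-bit lanes, which can be stated as a simple invariant. A hedged property check, assuming the codebase's bswap_32:

#include <assert.h>
#include <stdint.h>

static void check_bswap_intrlv80_4x32( const uint32_t *d, const uint32_t *s )
{
    for ( int k = 0; k < 20; k++ )             /* 20 header words        */
        for ( int lane = 0; lane < 4; lane++ ) /* broadcast into 4 lanes */
            assert( d[ 4*k + lane ] == bswap_32( s[k] ) );
}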
/*
static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
{
__m128i s0 = casti_m128i( src,0 );
@@ -561,6 +587,7 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
casti_m128i( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
}
*/
// 8x32
/*
@@ -1110,6 +1137,31 @@ static inline void extr_lane_8x32( void *d, const void *s,
#if defined(__AVX2__)
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
uint32_t *s = (uint32_t*)src;
casti_m256i( d, 0 ) = _mm256_set1_epi32( bswap_32( s[ 0] ) );
casti_m256i( d, 1 ) = _mm256_set1_epi32( bswap_32( s[ 1] ) );
casti_m256i( d, 2 ) = _mm256_set1_epi32( bswap_32( s[ 2] ) );
casti_m256i( d, 3 ) = _mm256_set1_epi32( bswap_32( s[ 3] ) );
casti_m256i( d, 4 ) = _mm256_set1_epi32( bswap_32( s[ 4] ) );
casti_m256i( d, 5 ) = _mm256_set1_epi32( bswap_32( s[ 5] ) );
casti_m256i( d, 6 ) = _mm256_set1_epi32( bswap_32( s[ 6] ) );
casti_m256i( d, 7 ) = _mm256_set1_epi32( bswap_32( s[ 7] ) );
casti_m256i( d, 8 ) = _mm256_set1_epi32( bswap_32( s[ 8] ) );
casti_m256i( d, 9 ) = _mm256_set1_epi32( bswap_32( s[ 9] ) );
casti_m256i( d,10 ) = _mm256_set1_epi32( bswap_32( s[10] ) );
casti_m256i( d,11 ) = _mm256_set1_epi32( bswap_32( s[11] ) );
casti_m256i( d,12 ) = _mm256_set1_epi32( bswap_32( s[12] ) );
casti_m256i( d,13 ) = _mm256_set1_epi32( bswap_32( s[13] ) );
casti_m256i( d,14 ) = _mm256_set1_epi32( bswap_32( s[14] ) );
casti_m256i( d,15 ) = _mm256_set1_epi32( bswap_32( s[15] ) );
casti_m256i( d,16 ) = _mm256_set1_epi32( bswap_32( s[16] ) );
casti_m256i( d,17 ) = _mm256_set1_epi32( bswap_32( s[17] ) );
casti_m256i( d,18 ) = _mm256_set1_epi32( bswap_32( s[18] ) );
casti_m256i( d,19 ) = _mm256_set1_epi32( bswap_32( s[19] ) );
}
/*
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
@@ -1170,6 +1222,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
casti_m128i( d,38 ) =
casti_m128i( d,39 ) = _mm_shuffle_epi32( s4 , 0xff );
}
*/
#endif // AVX2
@@ -1718,6 +1771,31 @@ static inline void extr_lane_16x32( void *d, const void *s,
#if defined(__AVX512F__) && defined(__AVX512VL__)
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
uint32_t *s = (uint32_t*)src;
casti_m512i( d, 0 ) = _mm512_set1_epi32( bswap_32( s[ 0] ) );
casti_m512i( d, 1 ) = _mm512_set1_epi32( bswap_32( s[ 1] ) );
casti_m512i( d, 2 ) = _mm512_set1_epi32( bswap_32( s[ 2] ) );
casti_m512i( d, 3 ) = _mm512_set1_epi32( bswap_32( s[ 3] ) );
casti_m512i( d, 4 ) = _mm512_set1_epi32( bswap_32( s[ 4] ) );
casti_m512i( d, 5 ) = _mm512_set1_epi32( bswap_32( s[ 5] ) );
casti_m512i( d, 6 ) = _mm512_set1_epi32( bswap_32( s[ 6] ) );
casti_m512i( d, 7 ) = _mm512_set1_epi32( bswap_32( s[ 7] ) );
casti_m512i( d, 8 ) = _mm512_set1_epi32( bswap_32( s[ 8] ) );
casti_m512i( d, 9 ) = _mm512_set1_epi32( bswap_32( s[ 9] ) );
casti_m512i( d,10 ) = _mm512_set1_epi32( bswap_32( s[10] ) );
casti_m512i( d,11 ) = _mm512_set1_epi32( bswap_32( s[11] ) );
casti_m512i( d,12 ) = _mm512_set1_epi32( bswap_32( s[12] ) );
casti_m512i( d,13 ) = _mm512_set1_epi32( bswap_32( s[13] ) );
casti_m512i( d,14 ) = _mm512_set1_epi32( bswap_32( s[14] ) );
casti_m512i( d,15 ) = _mm512_set1_epi32( bswap_32( s[15] ) );
casti_m512i( d,16 ) = _mm512_set1_epi32( bswap_32( s[16] ) );
casti_m512i( d,17 ) = _mm512_set1_epi32( bswap_32( s[17] ) );
casti_m512i( d,18 ) = _mm512_set1_epi32( bswap_32( s[18] ) );
casti_m512i( d,19 ) = _mm512_set1_epi32( bswap_32( s[19] ) );
}
/*
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
@@ -1818,6 +1896,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
casti_m128i( d,78 ) =
casti_m128i( d,79 ) = _mm_shuffle_epi32( s4 , 0xff );
}
*/
#endif // AVX512
@@ -2470,6 +2549,25 @@ static inline void extr_lane_8x64( void *d, const void *s,
#if defined(__AVX512F__) && defined(__AVX512VL__)
// broadcast to all lanes
static inline void mm512_intrlv80_8x64( void *dst, const void *src )
{
__m128i *d = (__m128i*)dst;
const __m128i *s = (const __m128i*)src;
d[ 0] = d[ 1] = d[ 2] = d[ 3] = _mm_shuffle_epi32( s[0], 0x44 );
d[ 4] = d[ 5] = d[ 6] = d[ 7] = _mm_shuffle_epi32( s[0], 0xee );
d[ 8] = d[ 9] = d[10] = d[11] = _mm_shuffle_epi32( s[1], 0x44 );
d[12] = d[13] = d[14] = d[15] = _mm_shuffle_epi32( s[1], 0xee );
d[16] = d[17] = d[18] = d[19] = _mm_shuffle_epi32( s[2], 0x44 );
d[20] = d[21] = d[22] = d[23] = _mm_shuffle_epi32( s[2], 0xee );
d[24] = d[25] = d[26] = d[27] = _mm_shuffle_epi32( s[3], 0x44 );
d[28] = d[29] = d[30] = d[31] = _mm_shuffle_epi32( s[3], 0xee );
d[32] = d[33] = d[34] = d[35] = _mm_shuffle_epi32( s[4], 0x44 );
d[36] = d[37] = d[38] = d[39] = _mm_shuffle_epi32( s[4], 0xee );
}
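The 0x44 / 0xee immediates duplicate one 64-bit half of the source into both halves of a __m128i, so four identical stores fill a full 512-bit row:

/* _mm_shuffle_epi32( x, 0x44 ) -> { x.q0, x.q0 }  (low  qword twice)
   _mm_shuffle_epi32( x, 0xee ) -> { x.q1, x.q1 }  (high qword twice)
   Writing one such value to four consecutive __m128i slots broadcasts
   the header qword across all eight 64-bit lanes of the 8x64 layout. */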
// byte swap and broadcast to all lanes
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
@@ -2556,6 +2654,10 @@ static inline void intrlv_2x128( void *dst, const void *src0,
d[10] = s0[5]; d[11] = s1[5];
d[12] = s0[6]; d[13] = s1[6];
d[14] = s0[7]; d[15] = s1[7];
if ( bit_len <= 1024 ) return;
d[16] = s0[8]; d[17] = s1[8];
d[18] = s0[9]; d[19] = s1[9];
// if ( bit_len <= 1280 ) return;
}
static inline void intrlv_2x128_512( void *dst, const void *src0,
@@ -2623,6 +2725,10 @@ static inline void intrlv_4x128( void *dst, const void *src0,
d[20] = s0[5]; d[21] = s1[5]; d[22] = s2[5]; d[23] = s3[5];
d[24] = s0[6]; d[25] = s1[6]; d[26] = s2[6]; d[27] = s3[6];
d[28] = s0[7]; d[29] = s1[7]; d[30] = s2[7]; d[31] = s3[7];
if ( bit_len <= 1024 ) return;
d[32] = s0[8]; d[33] = s1[8]; d[34] = s2[8]; d[35] = s3[8];
d[36] = s0[9]; d[37] = s1[9]; d[38] = s2[9]; d[39] = s3[9];
// if ( bit_len <= 1280 ) return;
}
static inline void intrlv_4x128_512( void *dst, const void *src0,

View File

@@ -411,7 +411,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
// Limited 2 input shuffle
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
#define mm128_shuffle2_64( a, b, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
_mm_castsi128_pd( b ), c ) );
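A hedged illustration of the selection rule (q0/q1 are the low/high 64-bit elements):

/* mm128_shuffle2_64( a, b, 0 ) -> { a.q0, b.q0 }
   mm128_shuffle2_64( a, b, 3 ) -> { a.q1, b.q1 }
   bit 0 of c selects the qword taken from a, bit 1 the qword from b. */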
@@ -545,14 +546,13 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
// Two input shuffle-rotate.
// Concatenate v1 & v2 and rotate as one 256 bit vector.
// Continue to use vror/vrol for now to avoid confusion with
// shufl2r/shufl2l function macros available with AVX512.
// Concatenate v1 & v2 and bit rotate as one 256 bit vector.
#if defined(__SSSE3__)
// Function macro with two inputs and one output, inputs are preserved.
// Two input functions are not available without SSSE3. Use procedure
// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, ie updated v1.
// These two-input functions are not available without SSSE3. Use procedure
// macros below instead.
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
@@ -567,12 +567,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
// Procedure macros with 2 inputs and 2 outputs, inputs args are overwritten.
// These macros retain the vrol/vror name for now to avoid
// confusion with the shufl2r/shuffle2l function macros above.
// These may be renamed to something like shufl2r2 for 2 inputs and
// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// Deprecated for SSSE3 and above; they exist only for compatibility with
// existing code. The function macros above can be used more efficiently.
#define mm128_vror256_64( v1, v2 ) \
do { \

View File

@@ -442,8 +442,14 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
//
// Swap bytes in vector elements, endian bswap.
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask can be used if
// a shuffle that crosses 128 bit lanes is needed. BSWAP doesn't cross lanes,
// so the AVX2 version works here. The bswap control vector is coded to work
// with both versions; bit 4 is ignored in AVX2.
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \

View File

@@ -15,13 +15,14 @@
// AVX512 intrinsics have a few changes from previous conventions.
//
// cmp instruction now returns a bitmask isnstead of a vector mask.
// cmp instruction now returns a bitmask instead of a vector mask.
// This eliminates the need for the blendv instruction.
//
// The new rotate instructions require the count to be an 8 bit
// immediate value only. Compilation fails if a variable is used.
// The documentation is the same as for shift and it works with
// variables.
// variables. The inconsistency is likely due to compiler optimizations
// that can eliminate the variable in some instances.
//
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
// usually shuffles accross all lanes.
@@ -317,6 +318,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
// elements and can be called directly. But they only accept immediate 8
// for control arg.
// The workaround is a fraud, just a fluke of the compiler's optimizer.
// It fails without -O3. The compiler seems to unroll shift loops, eliminating
// the variable control, better than rotate loops.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
@@ -429,21 +433,9 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
} while(0)
//
// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
//
// rename plan change ror to vror for Vector ROtate Right,
// and vrol for Vector ROtate Left, not to be confused with
//variable rotate rorv, rolv,
// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate
// operation. 1xNN notation is also removed and replaced with simpler NN.
// Swap will still have its own mnemonic and will be aliased as both
// left and right shuffles.
// Shift elements right or left in 512 bit vector, filling with zeros.
// Multiple element shifts can be combined into a single larger
// element shift.
// Cross-lane shuffles implementing rotate & shift of elements within a vector.
//
#define mm512_shiftr_256( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 4 )
@@ -529,7 +521,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_shuflr256_128 mm512_swap256_128
#define mm512_shufll256_128 mm512_swap256_128
@@ -583,7 +575,9 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
// Limited 2 input, 1 output shuffle within 128 bit lanes.
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source.
#define mm512_shuffle2_64( a, b, c ) \
_mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
_mm512_castsi512_pd( b ), c ) );
@@ -620,11 +614,7 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
// Drop macros? They can easily be rebuilt using shufl2 functions
// 2 input, 1 output
// Shuffle concatenated { v1, v2 ) right or left by 256 bits and return
// rotated v1
// visually confusing for shufl2r because of arg order. First arg is always
// the target for modification, either update by reference or by function
// return.
// Rotate concatenated ( v1, v2 ) right or left and return v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )

View File

@@ -502,6 +502,28 @@ static inline bool has_vaes()
#endif
}
static inline bool has_vbmi()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512VBMI_Flag;
#endif
}
static inline bool has_vbmi2()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512VBMI2_Flag;
#endif
}
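These follow the pattern of the surrounding has_* helpers; a hedged usage sketch, assuming this header is included:

#include <stdio.h>

static void report_vbmi_support( void )
{
    if ( has_vbmi()  ) printf( "AVX512-VBMI  available\n" );
    if ( has_vbmi2() ) printf( "AVX512-VBMI2 available\n" );
}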
// AMD only
static inline bool has_xop()
{

util.c
View File

@@ -1542,11 +1542,20 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
free(sctx->url);
sctx->url = strdup(url);
}
free(sctx->curl_url);
free(sctx->curl_url);
sctx->curl_url = (char*) malloc( strlen(url) + 1 );  // +1 for the terminating NUL
sprintf( sctx->curl_url, "http%s", strstr( url, "s://" )
? strstr( url, "s://" )
: strstr (url, "://" ) );
// replace the stratum protocol prefix with http, https for ssl
sprintf( sctx->curl_url, "%s%s",
( strstr( url, "s://" ) || strstr( url, "ssl://" ) )
? "https" : "http", strstr( url, "://" ) );
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
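The rewrite maps any secure stratum scheme to https and the rest to http, which is the form libcurl expects. A standalone sketch of the same rule (the function name is illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *stratum_to_curl_url( const char *url )
{
    const char *rest = strstr( url, "://" );          /* scheme tail    */
    if ( !rest ) return NULL;
    const int ssl = strstr( url, "s://" ) != NULL
                 || strstr( url, "ssl://" ) != NULL;
    char *out = malloc( strlen( rest ) + 5 + 1 );     /* "https" + NUL  */
    if ( out ) sprintf( out, "%s%s", ssl ? "https" : "http", rest );
    return out;   /* e.g. stratum+tcps://pool:4444 -> https://pool:4444 */
}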