Compare commits

...

6 Commits

Author      SHA1         Message   Date
Jay D Dee   9d3a46c355   v23.15    2023-11-30 14:36:47 -05:00
Jay D Dee   4e3f1b926f   v23.14    2023-11-28 00:58:43 -05:00
Jay D Dee   045b42babf   v23.13    2023-11-21 14:18:15 -05:00
Jay D Dee   fc696dbbe5   v23.12    2023-11-20 11:51:57 -05:00
Jay D Dee   f3fde95f27   v23.10    2023-11-15 11:05:41 -05:00
Jay D Dee   0a78013cbe   v23.9     2023-11-12 18:48:50 -05:00
97 changed files with 3946 additions and 2790 deletions

View File

@@ -250,6 +250,7 @@ cpuminer_SOURCES = \
algo/x16/x16rt.c \
algo/x16/x16rt-4way.c \
algo/x16/hex.c \
algo/x16/x20r.c \
algo/x16/x21s-4way.c \
algo/x16/x21s.c \
algo/x16/minotaur.c \

View File

@@ -87,7 +87,6 @@ Supported Algorithms
groestl Groestl coin
hex x16r-hex
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
@@ -115,9 +114,11 @@ Supported Algorithms
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256dt
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
sha512256d
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -145,6 +146,7 @@ Supported Algorithms
x16rt-veil veil
x16s
x17
x20r
x21s
x22i
x25x

View File

@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
- An x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
- Arm CPU supporting AArch64 and NEON.
64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.
32 bit CPUs are not supported.
ARM requirements (Beta):
Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.
CPU: Armv8 and NEON, SHA2 & AES are optional
OS: Linux distribution built for AArch64.
Packages: source code only.
Mining on mobile devices that meet the requirements is not recommended due to the risk of
overheating and damaging the battery. Mining has unlimited demand; it will push any device
to or beyond its limits. There is also a fire risk with overheated lithium batteries.
Beware of apps claiming "mobile only mining". There is no such thing; they aren't miners.
If a mobile CPU can mine it, any CPU can.
See wiki for details.
@@ -73,6 +75,47 @@ If not what makes it happen or not happen?
Change Log
----------
v23.15
Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
ARM: Fugue AES optimizations enabled.
ARM: quark, qubit, x11gost algos optimized with NEON & AES.
v23.14
ARM: Groestl AES optimizations enabled.
All: Small optimization to Shabal 4way.
x86_64: Extended Shabal 4way support from SSE4.1 down to SSE2.
All: deleted some unused files.
v23.13
Added x20r algo.
Eliminated redundant hash order calculations for x16r family.
v23.12
Several bug fixes and speed improvements for the x16r family on all CPU architectures.
v23.11
This is a release candidate for full AArch64 support, marking the end of the Beta phase.
Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.
v23.10
x86_64: Fixed scrypt & scryptn2 algos (SSE2).
Fixed sha512256d algo (AVX2, SSE2, NEON).
Fixed a bug in Skein N-way that reduced performance.
ARM: Skein optimized for NEON, SHA2 & SSE2.
Skein2 algo 2-way optimized for NEON & SSE2.
v23.9
x86_64: Fixed minotaurx crash, broken in v23.7.
ARM: #407 Fixed a compile error due to incorrect type casting of the vrev instruction argument.
v23.8
Cpuminer-opt is no longer dependent on OpenSSL.
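
The bulk of the diffs below replace raw SSE2 intrinsics (_mm_*) and the older mm128_* helpers with portable v128_* wrappers, so the same hash code builds for both x86_64/SSE2 and AArch64/NEON. A minimal sketch of how such a wrapper layer could be defined is shown here; the mappings are illustrative assumptions covering a few of the operations used in these hunks, not the project's actual simd-utils definitions.

#include <stdint.h>
#if defined(__SSE2__)
  #include <emmintrin.h>
  typedef __m128i v128_t;                    // 128-bit vector, treated as 4 x uint32
  #define v128_xor( a, b )    _mm_xor_si128( a, b )
  #define v128_add32( a, b )  _mm_add_epi32( a, b )
  #define v128_sl32( a, c )   _mm_slli_epi32( a, c )
  #define v128_sr32( a, c )   _mm_srli_epi32( a, c )
  // rotate left: shift left, then OR back in the bits that fell off the top
  #define v128_rol32( a, c )  _mm_or_si128( _mm_slli_epi32( a, c ), _mm_srli_epi32( a, 32-(c) ) )
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  typedef uint32x4_t v128_t;
  #define v128_xor( a, b )    veorq_u32( a, b )
  #define v128_add32( a, b )  vaddq_u32( a, b )
  #define v128_sl32( a, c )   vshlq_n_u32( a, c )
  #define v128_sr32( a, c )   vshrq_n_u32( a, c )
  // NEON rotate idiom: shift left, then shift-right-and-insert the low bits
  #define v128_rol32( a, c )  vsriq_n_u32( vshlq_n_u32( a, c ), a, 32-(c) )
#endif

With a layer like this, the per-algorithm code (Blake, BMW, Fugue, Echo, etc.) is written once against the v128_* names and compiles unchanged on either architecture, which is what the rename-heavy hunks in this compare are doing.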

View File

@@ -368,6 +368,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break;
case ALGO_X16S: rc = register_x16s_algo ( gate ); break;
case ALGO_X17: rc = register_x17_algo ( gate ); break;
case ALGO_X20R: rc = register_x20r_algo ( gate ); break;
case ALGO_X21S: rc = register_x21s_algo ( gate ); break;
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;

View File

@@ -99,7 +99,7 @@ typedef uint32_t set_t;
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
// AVX10 does not have explicit algo features:
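
These defines are bit flags describing which instruction-set extensions an algorithm implementation can exploit; a gate ORs together the flags for the code paths it ships, as the bmw512 gate later in this compare does with SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT. A minimal sketch of the flag arithmetic, with assumed variable names and values used only for illustration:

uint32_t algo_features = SSE2_OPT | AVX2_OPT | SHA512_OPT;  // what the algo code can use
uint32_t cpu_features  = SSE2_OPT | SHA512_OPT;             // what the CPU reports (assumed)

// A code path is eligible only if both the algo and the CPU advertise the feature.
bool sha512_path = ( algo_features & cpu_features & SHA512_OPT ) != 0;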

View File

@@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
blakehash_4way( hash, vdata );

View File

@@ -429,7 +429,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
@@ -931,14 +931,14 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
#else

View File

@@ -475,11 +475,12 @@ void blake512_update(blake512_context *sc, const void *data, size_t len)
void blake512_close( blake512_context *sc, void *dst )
{
unsigned char buf[128] __attribute__((aligned(32)));
size_t ptr;
size_t ptr, k;
unsigned bit_len;
uint64_t th, tl;
ptr = sc->ptr;
memcpy( buf, sc->buf, ptr );
bit_len = ((unsigned)ptr << 3);
buf[ptr] = 0x80;
tl = sc->T0 + bit_len;
@@ -519,7 +520,8 @@ void blake512_close( blake512_context *sc, void *dst )
blake512_update( sc, buf, 128 );
}
v128_block_bswap64_512( dst, sc->H );
for ( k = 0; k < 8; k ++ )
((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
}
void blake512_full( blake512_context *sc, void *dst, const void *data,

View File

@@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
blakecoin_4way_hash( hash, vdata );

View File

@@ -131,47 +131,7 @@
V[7] = v128_alignr64( V6, V7, 1 ); \
}
/*
#elif defined(__SSE2__)
// always true
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
v128_t *V = (v128_t*)v; \
v128_t V2, V3, V6, V7; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3, 1 ); \
V[3] = mm128_alignr_64( V3, V2, 1 ); \
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
*/
#else
// never used, SSE2 is always available
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))

View File

@@ -62,78 +62,78 @@ static const uint32_t IV256[] = {
*/
#define ss0(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 3) ), \
_mm_xor_si128( mm128_rol_32( (x), 4), \
mm128_rol_32( (x), 19) ) )
v128_xor( v128_xor( v128_sr32( (x), 1), \
v128_sl32( (x), 3) ), \
v128_xor( v128_rol32( (x), 4), \
v128_rol32( (x), 19) ) )
#define ss1(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm128_rol_32( (x), 8), \
mm128_rol_32( (x), 23) ) )
v128_xor( v128_xor( v128_sr32( (x), 1), \
v128_sl32( (x), 2) ), \
v128_xor( v128_rol32( (x), 8), \
v128_rol32( (x), 23) ) )
#define ss2(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 1) ), \
_mm_xor_si128( mm128_rol_32( (x), 12), \
mm128_rol_32( (x), 25) ) )
v128_xor( v128_xor( v128_sr32( (x), 2), \
v128_sl32( (x), 1) ), \
v128_xor( v128_rol32( (x), 12), \
v128_rol32( (x), 25) ) )
#define ss3(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm128_rol_32( (x), 15), \
mm128_rol_32( (x), 29) ) )
v128_xor( v128_xor( v128_sr32( (x), 2), \
v128_sl32( (x), 2) ), \
v128_xor( v128_rol32( (x), 15), \
v128_rol32( (x), 29) ) )
#define ss4(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
v128_xor( (x), v128_sr32( (x), 1 ) )
#define ss5(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
v128_xor( (x), v128_sr32( (x), 2 ) )
#define rs1(x) mm128_rol_32( x, 3 )
#define rs2(x) mm128_rol_32( x, 7 )
#define rs3(x) mm128_rol_32( x, 13 )
#define rs4(x) mm128_rol_32( x, 16 )
#define rs5(x) mm128_rol_32( x, 19 )
#define rs6(x) mm128_rol_32( x, 23 )
#define rs7(x) mm128_rol_32( x, 27 )
#define rs1(x) v128_rol32( x, 3 )
#define rs2(x) v128_rol32( x, 7 )
#define rs3(x) v128_rol32( x, 13 )
#define rs4(x) v128_rol32( x, 16 )
#define rs5(x) v128_rol32( x, 19 )
#define rs6(x) v128_rol32( x, 23 )
#define rs7(x) v128_rol32( x, 27 )
#define rol_off_32( M, j, off ) \
mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
v128_rol32( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_s( M, H, j ) \
_mm_xor_si128( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
v128_xor( \
v128_add32( \
v128_sub32( v128_add32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
v128_32( ( (j)+16 ) * 0x05555555UL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1s( qt, M, H, i ) \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
v128_add32( v128_add4_32( \
v128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
v128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
v128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
v128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
#define expand2s( qt, M, H, i) \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
v128_add32( v128_add4_32( \
v128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
v128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
v128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
v128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
@@ -141,169 +141,169 @@ static const uint32_t IV256[] = {
// resulting in some sign changes compared to the reference code.
#define Ws0 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_add32( \
v128_add32( \
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_xor( M[10], H[10] ) ), \
v128_add32( v128_xor( M[13], H[13] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws1 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_add32( \
v128_add32( \
v128_sub32( v128_xor( M[ 6], H[ 6] ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_xor( M[11], H[11] ) ), \
v128_sub32( v128_xor( M[14], H[14] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws2 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_add32( \
v128_add32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_xor( M[ 9], H[ 9] ) ), \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws3 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 1], H[ 1] ) ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_sub32( v128_xor( M[10], H[10] ), \
v128_xor( M[13], H[13] ) ) )
#define Ws4 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_add32( \
v128_add32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_xor( M[ 9], H[ 9] ) ), \
v128_add32( v128_xor( M[11], H[11] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws5 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_xor( M[10], H[10] ) ), \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws6 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 4], H[ 4] ), \
v128_xor( M[ 0], H[ 0] ) ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_sub32( v128_xor( M[11], H[11] ), \
v128_xor( M[13], H[13] ) ) )
#define Ws7 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_add32( v128_xor( M[12], H[12] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws8 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_add32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 2], H[ 2] ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[13], H[13] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws9 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws10 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
v128_xor( M[ 1], H[ 1] ) ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws11 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 9], H[ 9] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
v128_xor( M[ 0], H[ 0] ) ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
v128_xor( M[ 9], H[ 9] ) ) )
#define Ws12 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[10], H[10] ) ) )
v128_sub32( \
v128_sub32( \
v128_add32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
v128_xor( M[10], H[10] ) ) )
#define Ws13 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[11], H[11] ) ) )
v128_add32( \
v128_add32( \
v128_add32( v128_xor( M[ 2], H[ 2] ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_add32( v128_xor( M[10], H[10] ), \
v128_xor( M[11], H[11] ) ) )
#define Ws14 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[12], H[12] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_add32( v128_xor( M[11], H[11] ), \
v128_xor( M[12], H[12] ) ) )
#define Ws15 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[ 4], H[4] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[ 4], H[4] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
v128_xor( M[13], H[13] ) ) )
void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
{
__m128i qt[32], xl, xh; \
v128u64_t qt[32], xl, xh; \
qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
qt[ 2] = v128_add32( ss2( Ws2 ), H[ 3] );
qt[ 3] = v128_add32( ss3( Ws3 ), H[ 4] );
qt[ 4] = v128_add32( ss4( Ws4 ), H[ 5] );
qt[ 5] = v128_add32( ss0( Ws5 ), H[ 6] );
qt[ 6] = v128_add32( ss1( Ws6 ), H[ 7] );
qt[ 7] = v128_add32( ss2( Ws7 ), H[ 8] );
qt[ 8] = v128_add32( ss3( Ws8 ), H[ 9] );
qt[ 9] = v128_add32( ss4( Ws9 ), H[10] );
qt[10] = v128_add32( ss0( Ws10), H[11] );
qt[11] = v128_add32( ss1( Ws11), H[12] );
qt[12] = v128_add32( ss2( Ws12), H[13] );
qt[13] = v128_add32( ss3( Ws13), H[14] );
qt[14] = v128_add32( ss4( Ws14), H[15] );
qt[15] = v128_add32( ss0( Ws15), H[ 0] );
qt[16] = expand1s( qt, M, H, 16 );
qt[17] = expand1s( qt, M, H, 17 );
qt[18] = expand2s( qt, M, H, 18 );
@@ -321,92 +321,92 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
qt[30] = expand2s( qt, M, H, 30 );
qt[31] = expand2s( qt, M, H, 31 );
xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm_xor_si128( xl, _mm_xor_si128(
mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
xl = v128_xor( v128_xor4( qt[16], qt[17], qt[18], qt[19] ),
v128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = v128_xor( xl, v128_xor(
v128_xor4( qt[24], qt[25], qt[26], qt[27] ),
v128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
dH[ 0] = _mm_add_epi32(
_mm_xor_si128( M[0],
_mm_xor_si128( _mm_slli_epi32( xh, 5 ),
_mm_srli_epi32( qt[16], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
dH[ 1] = _mm_add_epi32(
_mm_xor_si128( M[1],
_mm_xor_si128( _mm_srli_epi32( xh, 7 ),
_mm_slli_epi32( qt[17], 8 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
dH[ 2] = _mm_add_epi32(
_mm_xor_si128( M[2],
_mm_xor_si128( _mm_srli_epi32( xh, 5 ),
_mm_slli_epi32( qt[18], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
dH[ 3] = _mm_add_epi32(
_mm_xor_si128( M[3],
_mm_xor_si128( _mm_srli_epi32( xh, 1 ),
_mm_slli_epi32( qt[19], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
dH[ 4] = _mm_add_epi32(
_mm_xor_si128( M[4],
_mm_xor_si128( _mm_srli_epi32( xh, 3 ),
_mm_slli_epi32( qt[20], 0 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
dH[ 5] = _mm_add_epi32(
_mm_xor_si128( M[5],
_mm_xor_si128( _mm_slli_epi32( xh, 6 ),
_mm_srli_epi32( qt[21], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
dH[ 6] = _mm_add_epi32(
_mm_xor_si128( M[6],
_mm_xor_si128( _mm_srli_epi32( xh, 4 ),
_mm_slli_epi32( qt[22], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
dH[ 7] = _mm_add_epi32(
_mm_xor_si128( M[7],
_mm_xor_si128( _mm_srli_epi32( xh, 11 ),
_mm_slli_epi32( qt[23], 2 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[4], 9 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
_mm_xor_si128( _mm_slli_epi32( xl, 8 ),
_mm_xor_si128( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[5], 10 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
_mm_xor_si128( _mm_srli_epi32( xl, 6 ),
_mm_xor_si128( qt[16], qt[ 9] ) ) );
dH[10] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[6], 11 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
_mm_xor_si128( _mm_slli_epi32( xl, 6 ),
_mm_xor_si128( qt[17], qt[10] ) ) );
dH[11] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[7], 12 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
_mm_xor_si128( _mm_slli_epi32( xl, 4 ),
_mm_xor_si128( qt[18], qt[11] ) ) );
dH[12] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[0], 13 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
_mm_xor_si128( _mm_srli_epi32( xl, 3 ),
_mm_xor_si128( qt[19], qt[12] ) ) );
dH[13] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[1], 14 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
_mm_xor_si128( _mm_srli_epi32( xl, 4 ),
_mm_xor_si128( qt[20], qt[13] ) ) );
dH[14] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[2], 15 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
_mm_xor_si128( _mm_srli_epi32( xl, 7 ),
_mm_xor_si128( qt[21], qt[14] ) ) );
dH[15] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[3], 16 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
_mm_xor_si128( _mm_srli_epi32( xl, 2 ),
_mm_xor_si128( qt[22], qt[15] ) ) );
dH[ 0] = v128_add32(
v128_xor( M[0],
v128_xor( v128_sl32( xh, 5 ),
v128_sr32( qt[16], 5 ) ) ),
v128_xor( v128_xor( xl, qt[24] ), qt[ 0] ));
dH[ 1] = v128_add32(
v128_xor( M[1],
v128_xor( v128_sr32( xh, 7 ),
v128_sl32( qt[17], 8 ) ) ),
v128_xor( v128_xor( xl, qt[25] ), qt[ 1] ));
dH[ 2] = v128_add32(
v128_xor( M[2],
v128_xor( v128_sr32( xh, 5 ),
v128_sl32( qt[18], 5 ) ) ),
v128_xor( v128_xor( xl, qt[26] ), qt[ 2] ));
dH[ 3] = v128_add32(
v128_xor( M[3],
v128_xor( v128_sr32( xh, 1 ),
v128_sl32( qt[19], 5 ) ) ),
v128_xor( v128_xor( xl, qt[27] ), qt[ 3] ));
dH[ 4] = v128_add32(
v128_xor( M[4],
v128_xor( v128_sr32( xh, 3 ),
v128_sl32( qt[20], 0 ) ) ),
v128_xor( v128_xor( xl, qt[28] ), qt[ 4] ));
dH[ 5] = v128_add32(
v128_xor( M[5],
v128_xor( v128_sl32( xh, 6 ),
v128_sr32( qt[21], 6 ) ) ),
v128_xor( v128_xor( xl, qt[29] ), qt[ 5] ));
dH[ 6] = v128_add32(
v128_xor( M[6],
v128_xor( v128_sr32( xh, 4 ),
v128_sl32( qt[22], 6 ) ) ),
v128_xor( v128_xor( xl, qt[30] ), qt[ 6] ));
dH[ 7] = v128_add32(
v128_xor( M[7],
v128_xor( v128_sr32( xh, 11 ),
v128_sl32( qt[23], 2 ) ) ),
v128_xor( v128_xor( xl, qt[31] ), qt[ 7] ));
dH[ 8] = v128_add32( v128_add32(
v128_rol32( dH[4], 9 ),
v128_xor( v128_xor( xh, qt[24] ), M[ 8] )),
v128_xor( v128_sl32( xl, 8 ),
v128_xor( qt[23], qt[ 8] ) ) );
dH[ 9] = v128_add32( v128_add32(
v128_rol32( dH[5], 10 ),
v128_xor( v128_xor( xh, qt[25] ), M[ 9] )),
v128_xor( v128_sr32( xl, 6 ),
v128_xor( qt[16], qt[ 9] ) ) );
dH[10] = v128_add32( v128_add32(
v128_rol32( dH[6], 11 ),
v128_xor( v128_xor( xh, qt[26] ), M[10] )),
v128_xor( v128_sl32( xl, 6 ),
v128_xor( qt[17], qt[10] ) ) );
dH[11] = v128_add32( v128_add32(
v128_rol32( dH[7], 12 ),
v128_xor( v128_xor( xh, qt[27] ), M[11] )),
v128_xor( v128_sl32( xl, 4 ),
v128_xor( qt[18], qt[11] ) ) );
dH[12] = v128_add32( v128_add32(
v128_rol32( dH[0], 13 ),
v128_xor( v128_xor( xh, qt[28] ), M[12] )),
v128_xor( v128_sr32( xl, 3 ),
v128_xor( qt[19], qt[12] ) ) );
dH[13] = v128_add32( v128_add32(
v128_rol32( dH[1], 14 ),
v128_xor( v128_xor( xh, qt[29] ), M[13] )),
v128_xor( v128_sr32( xl, 4 ),
v128_xor( qt[20], qt[13] ) ) );
dH[14] = v128_add32( v128_add32(
v128_rol32( dH[2], 15 ),
v128_xor( v128_xor( xh, qt[30] ), M[14] )),
v128_xor( v128_sr32( xl, 7 ),
v128_xor( qt[21], qt[14] ) ) );
dH[15] = v128_add32( v128_add32(
v128_rol32( dH[3], 16 ),
v128_xor( v128_xor( xh, qt[31] ), M[15] )),
v128_xor( v128_sr32( xl, 2 ),
v128_xor( qt[22], qt[15] ) ) );
}
static const uint32_t final_s[16][4] =
@@ -429,7 +429,7 @@ static const uint32_t final_s[16][4] =
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
/*
static const __m128i final_s[16] =
static const v128u64_t final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
@@ -451,26 +451,26 @@ static const __m128i final_s[16] =
*/
void bmw256_4way_init( bmw256_4way_context *ctx )
{
ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
ctx->H[ 0] = v128_64( 0x4041424340414243 );
ctx->H[ 1] = v128_64( 0x4445464744454647 );
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = v128_64( 0x5051525350515253 );
ctx->H[ 5] = v128_64( 0x5455565754555657 );
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = v128_64( 0x6061626360616263 );
ctx->H[ 9] = v128_64( 0x6465666764656667 );
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = v128_64( 0x7071727370717273 );
ctx->H[13] = v128_64( 0x7475767774757677 );
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
// for ( int i = 0; i < 16; i++ )
// sc->H[i] = _mm_set1_epi32( iv[i] );
// sc->H[i] = v128_32( iv[i] );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -478,10 +478,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
__m128i *vdata = (__m128i*)data;
__m128i *buf;
__m128i htmp[16];
__m128i *h1, *h2;
v128u64_t *vdata = (v128u64_t*)data;
v128u64_t *buf;
v128u64_t htmp[16];
v128u64_t *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
@@ -497,13 +497,13 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
v128_memcpy( buf + (ptr>>2), vdata, clen >> 2 );
vdata += ( clen >> 2 );
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m128i *ht;
v128u64_t *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
@@ -513,46 +513,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
}
sc->ptr = ptr;
if ( h1 != sc->H )
memcpy_128( sc->H, h1, 16 );
v128_memcpy( sc->H, h1, 16 );
}
static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
__m128i *buf;
__m128i h1[16], h2[16], *h;
v128u64_t *buf;
v128u64_t h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
ptr += 4;
h = sc->H;
// assume bit_count fits in 32 bits
if ( ptr > buf_size - 4 )
{
memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
v128_memset_zero( buf + (ptr>>2), (buf_size - ptr) >> 2 );
compress_small( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = m128_zero;
v128_memset_zero( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = v128_32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = v128_zero;
compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, (__m128i*)final_s, h1 );
compress_small( buf, (v128u64_t*)final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_m128i( dst, u ) = h1[v];
casti_v128( dst, u ) = h1[v];
}
/*

View File

@@ -2,12 +2,11 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
//#include "sph_keccak.h"
#include "bmw-hash-4way.h"
#if defined(BMW512_8WAY)
void bmw512hash_8way(void *state, const void *input)
void bmw512hash_8way( void *state, const void *input )
{
bmw512_8way_context ctx;
bmw512_8way_init( &ctx );
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
__m512i *noncev = (__m512i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
const int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
#elif defined(BMW512_4WAY)
//#ifdef BMW512_4WAY
void bmw512hash_4way(void *state, const void *input)
void bmw512hash_4way( void *state, const void *input )
{
bmw512_4way_context ctx;
bmw512_4way_init( &ctx );
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(BMW512_2WAY)
void bmw512hash_2x64( void *state, const void *input )
{
bmw512_2x64_context ctx;
bmw512_2x64_init( &ctx );
bmw512_2x64_update( &ctx, input, 80 );
bmw512_2x64_close( &ctx, state );
}
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
v128_bswap32_intrlv80_2x64( vdata, pdata );
do {
*noncev = v128_intrlv_blend_32( v128_bswap32(
v128_set32( n+1, 0, n, 0 ) ), *noncev );
bmw512hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -2,7 +2,7 @@
bool register_bmw512_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 256.0;
#if defined (BMW512_8WAY)
gate->scanhash = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
#elif defined (BMW512_4WAY)
gate->scanhash = (void*)&scanhash_bmw512_4way;
gate->hash = (void*)&bmw512hash_4way;
#elif defined (BMW512_2WAY)
gate->scanhash = (void*)&scanhash_bmw512_2x64;
gate->hash = (void*)&bmw512hash_2x64;
#else
gate->scanhash = (void*)&scanhash_bmw512;
gate->hash = (void*)&bmw512hash;

View File

@@ -8,19 +8,27 @@
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define BMW512_2WAY 1
#endif
#if defined(BMW512_8WAY)
void bmw512hash_8way( void *state, const void *input );
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_4WAY)
void bmw512hash_4way( void *state, const void *input );
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_2WAY)
void bmw512hash_2x64( void *state, const void *input );
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else

View File

@@ -236,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
}
HashReturn init_echo(hashState_echo *ctx, int nHashSize)
HashReturn init_echo( hashState_echo *ctx, int nHashSize )
{
int i, j;
@@ -280,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
return SUCCESS;
}
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
HashReturn update_echo( hashState_echo *state, const void *data,
uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -330,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
return SUCCESS;
}
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
HashReturn final_echo( hashState_echo *state, void *hashval)
{
v128_t remainingbits;
@@ -407,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
return SUCCESS;
}
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -530,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
return SUCCESS;
}
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength datalen )
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t datalen )
{
int i, j;
@@ -578,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
data, state->uBlockLength - state->uBufferBytes );
// Process buffer
Compress( state, state->buffer, 1 );
@@ -601,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
if( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
memcpy(state->buffer, data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
@@ -689,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
#if 0
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
HashReturn hRet;
@@ -746,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
return SUCCESS;
}
#endif
#endif

View File

@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);
HashReturn reinit_echo(hashState_echo *state);
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);
HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
HashReturn final_echo(hashState_echo *state, void *hashval);
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength databitlen );
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen );
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t databitlen );
#endif // HASH_API_H

View File

@@ -36,7 +36,6 @@
#include "sph_echo.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif
#endif // !AES

View File

@@ -36,8 +36,6 @@
#ifndef SPH_ECHO_H__
#define SPH_ECHO_H__
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
#ifdef __cplusplus
}
#endif
#endif // !AES
#endif

View File

@@ -15,237 +15,176 @@
*
*/
#if defined(__AES__)
#include <x86intrin.h>
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#include <memory.h>
#include "fugue-aesni.h"
static const v128u64_t _supermix1a __attribute__ ((aligned (16))) =
{ 0x0202010807020100, 0x0a05000f06010c0b };
MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
static const v128u64_t _supermix1b __attribute__ ((aligned (16))) =
{ 0x0b0d080703060504, 0x0e0a090c050e0f0a };
static const v128u64_t _supermix1c __attribute__ ((aligned (16))) =
{ 0x0402060c070d0003, 0x090a060580808080 };
MYALIGN const unsigned int _IV512[] = {
0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
static const v128u64_t _supermix1d __attribute__ ((aligned (16))) =
{ 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
static const v128u64_t _supermix2a __attribute__ ((aligned (16))) =
{ 0x07020d0880808080, 0x0b06010c050e0f0a };
static const v128u64_t _supermix4a __attribute__ ((aligned (16))) =
{ 0x000f0a050c0b0601, 0x0302020404030e09 };
static const v128u64_t _supermix4b __attribute__ ((aligned (16))) =
{ 0x07020d08080e0d0d, 0x07070908050e0f0a };
static const v128u64_t _supermix4c __attribute__ ((aligned (16))) =
{ 0x0706050403020000, 0x0302000007060504 };
static const v128u64_t _supermix7a __attribute__ ((aligned (16))) =
{ 0x010c0b060d080702, 0x0904030e03000104 };
static const v128u64_t _supermix7b __attribute__ ((aligned (16))) =
{ 0x8080808080808080, 0x0504070605040f06 };
static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
{ 0x0b0e0104070a0d00, 0x0306090c0f020508 };
static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
{ 0x000000001b1b0000, 0x0000000000000000 };
static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
{ 0x000000002d361b00, 0x0000000000000000 };
static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
{ 0x0303030303030303, 0x0303030303030303 };
static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
{ 0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
};
#if defined(__SSE4_1__)
#if defined(__ARM_NEON)
#define PACK_S0(s0, s1, t1)\
s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
#define mask_1000(v) v128_put32( v, 0, 3 )
#define UNPACK_S0(s0, s1, t1)\
s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
s0 = mm128_mask_32( s0, 8 )
static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = s1;\
t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1);
static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };
#else // SSE2
static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };
#define PACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
s0 = _mm_xor_si128(s0, t1);
static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };
#define UNPACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
s0 = mm128_mask_32( s0, 8 )
#define shuffle_3303(v) vqtbl1q_u8( v, MASK_3303 )
#define shuffle_0321(v) vqtbl1q_u8( v, MASK_0321 )
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = _mm_shuffle_epi32(s1, 0xf9);\
t2 = _mm_shuffle_epi32(s2, 0xcf);\
t1 = _mm_xor_si128(t1, t2);\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1)
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = vqtbl1q_u8( s1, MASK_3321 ); \
t2 = vqtbl1q_u8( s2, MASK_3033 ); \
t1 = v128_xor( t1, t2 ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#elif defined(__SSE4_1__)
#define mask_1000(v) v128_mask32( v, 8 )
#define shuffle_3303(v) _mm_shuffle_epi32( v, 0xf3 )
#define shuffle_0321(v) _mm_shuffle_epi32( v, 0x39 )
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = s1; \
t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#endif
#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s10 = _mm_xor_si128(s10, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1)
#define PACK_S0( s0, s1, t1 ) \
s0 = v128_movlane32( s0, 3, s1, 0 )
#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s16 = _mm_xor_si128(s16, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1)
#define UNPACK_S0( s0, s1, t1 ) \
s1 = v128_movlane32( s1, 0, s0, 3 ); \
s0 = mask_1000( s0 )
#define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s22 = _mm_xor_si128(s22, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s7 = _mm_xor_si128(s7, t1)
t1 = shuffle_3303( s0 ); \
s22 = v128_xor(s22, t1);\
t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
s0 = v128_movlane32( s0, 0, t1, 0 ); \
t1 = v128_alignr64( t1, v128_zero, 1 ); \
s8 = v128_xor(s8, t1);\
t1 = shuffle_3303( s24 ); \
s0 = v128_xor(s0, t1);\
t1 = shuffle_3303( s27 ); \
s4 = v128_xor(s4, t1);\
t1 = shuffle_3303( s30 ); \
s7 = v128_xor(s7, t1)
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\
s1 = x;\
s2 = _mm_add_epi8(x, x);\
t2 = _mm_add_epi8(s2, s2);\
t1 = _mm_srli_epi16(x, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/
#define SUBSTITUTE(r0, _t2 )\
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
#define SUBSTITUTE( r0, _t2 ) \
_t2 = v128_shuffle8( r0, _inv_shift_rows ); \
_t2 = v128_aesenclast_nokey( _t2 )
#define SUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = mm128_xor3( t4, t1, t2 ); \
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t4 = mm128_xor3( t4, t2, t1 ); \
t0 = _mm_xor_si128(t0, t3);\
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
/*
#define SUPERMIX(t0, t1, t2, t3, t4)\
PRESUPERMIX(t0, t1, t2, t3, t4);\
POSTSUPERMIX(t0, t1, t2, t3, t4)
*/
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t4 = t1;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t4 = _mm_xor_si128(t4, t1);\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t4 = _mm_xor_si128(t4, t2);\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t4 = _mm_xor_si128(t4, t1);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t0 = _mm_xor_si128(t0, t3);\
t4 = _mm_xor_si128(t4, t0);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
t4 = _mm_xor_si128(t4, t0)
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
UNPACK_S0(r3c, r3a, _t3)
t3 = v128_add8( t0, t0 ); \
t4 = v128_add8( t3, t3 ); \
t1 = v128_sr16( t0, 6 ); \
t1 = v128_and( t1, _lsbmask2 ); \
t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
t4 = v128_shuffle8( t2, _supermix1b ); \
t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
t1 = v128_shuffle8( t4, _supermix1c ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t4, _supermix1d ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t2, _supermix1a ); \
t2 = v128_xor3( t2, t3, t0 ); \
t2 = v128_shuffle8( t2, _supermix7a ); \
t4 = v128_xor3( t4, t1, t2 ); \
t2 = v128_shuffle8( t2, _supermix7b ); \
t3 = v128_shuffle8( t3, _supermix2a ); \
t1 = v128_shuffle8( t0, _supermix4a ); \
t0 = v128_shuffle8( t0, _supermix4b ); \
t4 = v128_xor3( t4, t2, t1 ); \
t0 = v128_xor( t0, t3 ); \
t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE( r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
_t0 = shuffle_0321( r1c ); \
r2c = v128_xor(r2c, _t0);\
_t0 = mask_1000( _t0 ); \
r2d = v128_xor(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
_t0 = shuffle_0321( r2c ); \
r3c = v128_xor(r3c, _t0);\
_t0 = mask_1000( _t0 ); \
r3d = v128_xor(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE( r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
r4c = _mm_xor_si128(r4c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r4d = _mm_xor_si128(r4d, _t0);\
_t0 = shuffle_0321( r3c ); \
r4c = v128_xor(r4c, _t0);\
_t0 = mask_1000( _t0 ); \
r4d = v128_xor(r4d, _t0);\
UNPACK_S0(r3c, r3a, _t3);\
SUBSTITUTE( r4c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
block[1] = col[(base + a + 1) % s];\
block[2] = col[(base + a + 2) % s];\
block[3] = col[(base + a + 3) % s];\
x = _mm_load_si128((__m128i*)block)
x = v128_load( (v128_t*)block )
#define STORECOLUMN(x, s)\
_mm_store_si128((__m128i*)block, x);\
v128_store((v128_t*)block, x );\
col[(base + 0) % s] = block[0];\
col[(base + 1) % s] = block[1];\
col[(base + 2) % s] = block[2];\
col[(base + 3) % s] = block[3]
void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
unsigned int uBlockCount )
{
__m128i _t0, _t1, _t2, _t3;
v128_t _t0, _t1, _t2, _t3;
switch(ctx->base)
{
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
pmsg += 4;
uBlockCount--;
}
}
void Final512(hashState_fugue *ctx, BitSequence *hashval)
void Final512( hashState_fugue *ctx, uint8_t *hashval )
{
unsigned int block[4] __attribute__ ((aligned (32)));
unsigned int col[36] __attribute__ ((aligned (16)));
unsigned int i, base;
__m128i r0, _t0, _t1, _t2, _t3;
v128_t r0, _t0, _t1, _t2, _t3;
for(i = 0; i < 12; i++)
for( i = 0; i < 12; i++ )
{
_mm_store_si128((__m128i*)block, ctx->state[i]);
v128_store( (v128_t*)block, ctx->state[i] );
col[3 * i + 0] = block[0];
col[3 * i + 1] = block[1];
col[3 * i + 2] = block[2];
}
base = (36 - (12 * ctx->base)) % 36;
base = ( 36 - (12 * ctx->base) ) % 36;
for(i = 0; i < 32; i++)
for( i = 0; i < 32; i++ )
{
// ROR3
base = (base + 33) % 36;
// CMIX
col[(base + 0) % 36] ^= col[(base + 4) % 36];
col[(base + 1) % 36] ^= col[(base + 5) % 36];
col[(base + 2) % 36] ^= col[(base + 6) % 36];
col[(base + 18) % 36] ^= col[(base + 4) % 36];
col[(base + 19) % 36] ^= col[(base + 5) % 36];
col[(base + 20) % 36] ^= col[(base + 6) % 36];
col[ (base + 0) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 1) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 2) % 36 ] ^= col[ (base + 6) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
for(i = 0; i < 13; i++)
for( i = 0; i < 13; i++ )
{
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 28) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR8
base = (base + 28) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
LOADCOLUMN(r0, 36, 1);
_mm_store_si128((__m128i*)hashval, r0);
LOADCOLUMN( r0, 36, 1 );
v128_store( (v128_t*)hashval, r0 );
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
LOADCOLUMN(r0, 36, 9);
_mm_store_si128((__m128i*)hashval + 1, r0);
LOADCOLUMN( r0, 36, 9 );
v128_store( (v128_t*)hashval + 1, r0 );
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
LOADCOLUMN(r0, 36, 18);
_mm_store_si128((__m128i*)hashval + 2, r0);
LOADCOLUMN( r0, 36, 18 );
v128_store( (v128_t*)hashval + 2, r0 );
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
LOADCOLUMN(r0, 36, 27);
_mm_store_si128((__m128i*)hashval + 3, r0);
LOADCOLUMN( r0, 36, 27 );
v128_store( (v128_t*)hashval + 3, r0 );
}
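The ROR3 / ROR9 / ROR8 steps in Final512 never move data: the 36-word column state is rotated by advancing the base index modulo 36, so a right rotation by r becomes base = ( base + 36 - r ) % 36, which is where the +33, +27 and +28 above come from. A one-line scalar model of that bookkeeping:
// Rotating a circular buffer of N words right by r is an index shift:
// after base = ( base + N - r ) % N, element i is col[ ( base + i ) % N ].
static inline unsigned ror_base( unsigned base, unsigned r, unsigned N )
{
   return ( base + N - r ) % N;   // N = 36, r = 3  ->  base + 33
}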
HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
int fugue512_Init( hashState_fugue *ctx, int nHashSize )
{
int i;
ctx->processed_bits = 0;
@@ -485,20 +424,20 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
ctx->uBlockLength = 4;
for(i = 0; i < 6; i++)
ctx->state[i] = m128_zero;
ctx->state[i] = v128_zero;
ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
ctx->state[6] = casti_v128( _IV512, 0 );
ctx->state[7] = casti_v128( _IV512, 1 );
ctx->state[8] = casti_v128( _IV512, 2 );
ctx->state[9] = casti_v128( _IV512, 3 );
ctx->state[10] = casti_v128( _IV512, 4 );
ctx->state[11] = casti_v128( _IV512, 5 );
return SUCCESS;
return 0;
}
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
if(state->uBufferBytes != 0)
{
// Fill the buffer
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
memcpy( state->buffer + state->uBufferBytes, (void*)data,
state->uBlockLength - state->uBufferBytes );
// Process the buffer
Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
state->uBufferBytes += uByteLength;
}
return SUCCESS;
return 0;
}
HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
int fugue512_Final( hashState_fugue *state, void *hashval )
{
unsigned int i;
BitSequence lengthbuf[8] __attribute__((aligned(64)));
uint8_t lengthbuf[8] __attribute__((aligned(64)));
// Update message bit count
state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
// Finalization
Final512(state, hashval);
return SUCCESS;
return 0;
}
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen )
{
fugue512_Init(hs, 512);
fugue512_Update(hs, data, databitlen*8);
fugue512_Final(hs, hashval);
return SUCCESS;
fugue512_Init( hs, 512 );
fugue512_Update( hs, data, databitlen*8 );
fugue512_Final( hs, hashval );
return 0;
}
#endif // AES

View File

@@ -14,37 +14,31 @@
#ifndef FUGUE_HASH_API_H
#define FUGUE_HASH_API_H
#if defined(__AES__)
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#if !defined(__SSE4_1__)
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif
#include "compat/sha3_common.h"
#include "simd-utils.h"
typedef struct
{
__m128i state[12];
v128_t state[12];
unsigned int base;
unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
DataLength processed_bits;
BitSequence buffer[4];
uint64_t processed_bits;
uint8_t buffer[4];
} hashState_fugue __attribute__ ((aligned (64)));
// These functions are deprecated; use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
int fugue512_Init( hashState_fugue *state, int hashbitlen );
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen );
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
int fugue512_Final( hashState_fugue *state, void *hashval );
#define fugue512_init( state ) \
fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
fugue512_Final
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen);
#endif // AES
#endif // HASH_API_H

View File

@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
casti_m256i( b, 0 ) );
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
casti_m256i( b, 1 ) );
#elif defined(__SSE2__)
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
casti_m128i( b, 0 ) );
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
casti_m128i( b, 1 ) );
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
casti_m128i( b, 2 ) );
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
casti_m128i( b, 3 ) );
#elif defined(__SSE2__) || defined(__ARM_NEON)
casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
casti_v128( b, 0 ) );
casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
casti_v128( b, 1 ) );
casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
casti_v128( b, 2 ) );
casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
casti_v128( b, 3 ) );
#else
const unsigned long long *A=a, *B=b;
unsigned long long *C=c;

View File

@@ -60,21 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
//static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
//#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
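Both definitions of gr_shuffle32 perform the same dword permutation: the immediate 0xd8 = 0b11'01'10'00 reads source dwords 0, 2, 1, 3, and the NEON byte table 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c makes vqtbl1q_u8 gather bytes 0-3, 8-11, 4-7, 12-15, which is the same order. A scalar reference of the permutation, using nothing beyond the masks shown:
#include <stdint.h>
// Reference model of gr_shuffle32: dst dwords = src dwords { 0, 2, 1, 3 }.
static inline void gr_shuffle32_ref( uint32_t d[4], const uint32_t s[4] )
{
   d[0] = s[0];  d[1] = s[2];  d[2] = s[1];  d[3] = s[3];
}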
#define tos(a) #a
#define tostr(a) tos(a)
@@ -301,17 +297,16 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
*/
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes */\
b0 = v128_xor(b0, b0);\
a0 = v128_aesenclast(a0, b0);\
a1 = v128_aesenclast(a1, b0);\
a2 = v128_aesenclast(a2, b0);\
a3 = v128_aesenclast(a3, b0);\
a4 = v128_aesenclast(a4, b0);\
a5 = v128_aesenclast(a5, b0);\
a6 = v128_aesenclast(a6, b0);\
a7 = v128_aesenclast(a7, b0);\
a0 = v128_aesenclast_nokey( a0 ); \
a1 = v128_aesenclast_nokey( a1 ); \
a2 = v128_aesenclast_nokey( a2 ); \
a3 = v128_aesenclast_nokey( a3 ); \
a4 = v128_aesenclast_nokey( a4 ); \
a5 = v128_aesenclast_nokey( a5 ); \
a6 = v128_aesenclast_nokey( a6 ); \
a7 = v128_aesenclast_nokey( a7 ); \
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
}
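SubBytes only needs the AES byte substitution and row shift, so the explicit zeroed round key ( b0 = b0 ^ b0 ) is folded into a no-key helper. A plausible sketch of such a helper, assuming these are not the project's exact definitions:
// Hypothetical v128_aesenclast_nokey: one AES final round with an all-zero
// round key (AddRoundKey with zero is a no-op).
#if defined(__AES__)
  #define v128_aesenclast_nokey( x ) \
          _mm_aesenclast_si128( x, _mm_setzero_si128() )
#elif defined(__ARM_FEATURE_AES)
  // AESE = AddRoundKey, SubBytes, ShiftRows; with a zero key this equals the
  // x86 final round because SubBytes and ShiftRows commute.
  #define v128_aesenclast_nokey( x ) \
          vaeseq_u8( (uint8x16_t)(x), vdupq_n_u8( 0 ) )
#endif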
#define ROUNDS_P(){\
@@ -329,10 +324,9 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
/* SubBytes + MixBytes */\
/* SubBytes + MixBytes */\
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
\
/* AddRoundConstant P1024 */\
xmm0 = v128_xor( xmm0, \
casti_v128( round_const_p, round_counter+1 ) ); \
@@ -434,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
t1 = v128_unpackhi16(t1, i3);\
i2 = v128_unpacklo16(i2, i3);\
i0 = v128_unpacklo16(i0, i1);\
\
/* shuffle with immediate */\
t0 = gr_shuffle32( t0 ); \
t1 = gr_shuffle32( t1 ); \
@@ -444,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
i2 = gr_shuffle32( i2 ); \
i4 = gr_shuffle32( i4 ); \
i6 = gr_shuffle32( i6 ); \
\
/* continue with unpack */\
t4 = i0;\
i0 = v128_unpacklo32(i0, i2);\
@@ -551,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
/* transpose done */\
}/**/
#if 0
// not used
void INIT( v128_t* chaining )
{
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -580,6 +573,7 @@ void INIT( v128_t* chaining )
chaining[6] = xmm14;
chaining[7] = xmm15;
}
#endif
void TF1024( v128_t* chaining, const v128_t* message )
{

View File

@@ -1,3 +1,6 @@
#if !defined GROESTL256_INTR_AES_H__
#define GROESTL256_INTR_AES_H__
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
#define tos(a) #a
#define tostr(a) tos(a)
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
chaining[3] = xmm11;
}
#endif

View File

@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
const int hash_offset = SIZE512 - hashlen_m128i;
uint64_t blocks = len / SIZE512;
v128_t* in = (v128_t*)input;
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output

View File

@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
int groestl512( hashState_groestl*, void*, const void*, uint64_t );
#define groestl512_full groestl512
#define groestl512_ctx groestl512
#endif /* __hash_h */

View File

@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
a1 = _mm256_xor_si256( a1, b1 );\
a2 = _mm256_xor_si256( a2, b1 );\

View File

@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );
myriad_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -35,8 +35,6 @@
#include "sph_groestl.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif // !AES
#endif

View File

@@ -42,7 +42,6 @@ extern "C"{
#include <stddef.h>
#include "compat/sph_types.h"
#if !defined(__AES__)
/**
* Output size (in bits) for Groestl-224.
*/
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
}
#endif
#endif // !AES
#endif

View File

@@ -38,7 +38,7 @@
#include <stddef.h>
#include "simd-utils.h"
// SSE2 or NEON Hamsi-512 2x64
#if defined(__SSE4_2__) || defined(__ARM_NEON)
typedef struct
{
@@ -57,6 +57,8 @@ void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
size_t len );
void hamsi512_2x64( void *dst, const void *data, size_t len );
#endif
#if defined (__AVX2__)
// Hamsi-512 4x64

View File

@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_2WAY)
void keccakhash_2x64(void *state, const void *input)
{
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 32-bit word 7 of lane 0: high half of 64-bit word 3
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
keccakhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ))
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
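In the 2x64 kernels the two 80-byte headers are interleaved 64 bits at a time, so noncev (128-bit element 9 of vdata) holds 64-bit word 9 of both lanes, and the nonce, header word 19, sits in the upper 32 bits of each. Assuming v128_set32 lists elements high to low, the blend seeds the lanes with n and n+1, and the per-32-bit-lane add of the broadcast constant 0x0000000200000000 bumps both nonces by 2 per pass. A scalar model of just that bookkeeping:
// Scalar model of the 2-lane nonce update (illustrative only).
uint64_t lane0 = (uint64_t) n        << 32;   // nonce in the high 32 bits
uint64_t lane1 = (uint64_t)( n + 1 ) << 32;
while ( n < max_nonce - 2 )
{
   // hash both lane headers here ...
   lane0 += 0x0000000200000000ULL;            // n     -> n + 2
   lane1 += 0x0000000200000000ULL;            // n + 1 -> n + 3
   n += 2;
}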

View File

@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
bool register_sha3d_algo( algo_gate_t* gate )
{
hard_coded_eb = 6;
// opt_extranonce = false;
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
#if defined (SHA3D_8WAY)
gate->scanhash = (void*)&scanhash_sha3d_8way;
gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
#elif defined (SHA3D_4WAY)
gate->scanhash = (void*)&scanhash_sha3d_4way;
gate->hash = (void*)&sha3d_hash_4way;
#elif defined (SHA3D_2WAY)
gate->scanhash = (void*)&scanhash_sha3d_2x64;
gate->hash = (void*)&sha3d_hash_2x64;
#else
gate->scanhash = (void*)&scanhash_sha3d;
gate->hash = (void*)&sha3d_hash;

View File

@@ -8,6 +8,16 @@
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define KECCAK_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA3D_2WAY 1
#endif
extern int hard_coded_eb;
@@ -16,27 +26,47 @@ extern int hard_coded_eb;
void keccakhash_8way( void *state, const void *input );
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_4WAY)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_2WAY)
void keccakhash_2x64( void *state, const void *input );
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void keccakhash( void *state, const void *input );
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA3D_8WAY)
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64( void *state, const void *input );
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha3d_hash( void *state, const void *input );
int scanhash_sha3d( struct work *work, uint32_t max_nonce,

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)
#if defined(SHA3D_8WAY)
void sha3d_hash_8way(void *state, const void *input)
{
@@ -64,7 +64,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_4WAY)
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way(void *state, const void *input)
{
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, buffer );
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, buffer, 32 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 32-bit word 7 of lane 0: high half of 64-bit word 3
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
sha3d_hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -75,16 +75,16 @@
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
v128_t t = a0; \
a0 = mm128_xoror( a3, a0, a1 ); \
a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm128_xorand( a2, a3, t ); \
a2 = mm128_xorand( a1, a2, a0 ); \
a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = mm128_xnor( a1, a0 ); \
a1 = v128_xnor( a1, a0 ); \
a0 = t; \
}
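The AVX-512 form of SUBCRUMB folds a1 = a1 XNOR ( a3 & t ) into a single ternary-logic instruction; the 8-bit immediate is just the truth table of that expression over the eight (a1, a3, t) combinations, which evaluates to 0x87. A small stand-alone check of that derivation:
// Derive a VPTERNLOG immediate: bit ( a*4 + b*2 + c ) of the immediate is
// f( a, b, c ). For f = NOT( a ^ ( b & c ) ) the result is 0x87.
static inline unsigned ternlog_imm( void )
{
   unsigned imm = 0;
   for ( unsigned a = 0; a < 2; a++ )
   for ( unsigned b = 0; b < 2; b++ )
   for ( unsigned c = 0; c < 2; c++ )
      imm |= ( ~( a ^ ( b & c ) ) & 1u ) << ( a*4 + b*2 + c );
   return imm;   // 0x87
}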

View File

@@ -465,12 +465,8 @@ typedef union
{
keccak256_2x64_context keccak;
cubehashParam cube;
//#if defined(__x86_64__)
skein256_2x64_context skein;
//#else
// sph_skein512_context skein;
//#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
//#if defined(__x86_64__)
intrlv_2x64( vhashA, hash0, hash1, 256 );
skein256_2x64_init( &ctx.skein );
skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
skein256_2x64_update( &ctx.skein, vhashA, 32 );
skein256_2x64_close( &ctx.skein, vhashA );
dintrlv_2x64( hash2, hash3, vhashA, 256 );
/*
#else
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash0, 32 );
sph_skein256_close( &ctx.skein, hash0 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash1, 32 );
sph_skein256_close( &ctx.skein, hash1 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash2, 32 );
sph_skein256_close( &ctx.skein, hash2 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash3, 32 );
sph_skein256_close( &ctx.skein, hash3 );
#endif
*/
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
groestl256_full( &ctx.groestl, hash2, hash2, 256 );

View File

@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
lyra2h_4way_midstate( vdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )

View File

@@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );

View File

@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
init_hmq1725_ctx();
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
void hmq1725hash( void *state, const void *input );
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_hmq1725_ctx();
#endif

View File

@@ -4,367 +4,273 @@
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
typedef struct {
sph_blake512_context blake1, blake2;
sph_bmw512_context bmw1, bmw2, bmw3;
sph_skein512_context skein1, skein2;
sph_jh512_context jh1, jh2;
sph_keccak512_context keccak1, keccak2;
hashState_luffa luffa1, luffa2;
cubehashParam cube;
sph_shavite512_context shavite1, shavite2;
#if defined(__aarch64__)
sph_simd512_context simd1, simd2;
#else
hashState_sd simd1, simd2;
#endif
sph_hamsi512_context hamsi1;
sph_shabal512_context shabal1;
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
sph_sha512_context sha1, sha2;
sph_haval256_5_context haval1, haval2;
#if defined(__AES__)
hashState_echo echo1, echo2;
hashState_groestl groestl1, groestl2;
hashState_fugue fugue1, fugue2;
#else
sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2;
sph_fugue512_context fugue1, fugue2;
#endif
} hmq1725_ctx_holder;
static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
void init_hmq1725_ctx()
union _hmq1725_ctx_holder
{
sph_blake512_init(&hmq1725_ctx.blake1);
sph_blake512_init(&hmq1725_ctx.blake2);
sph_bmw512_init(&hmq1725_ctx.bmw1);
sph_bmw512_init(&hmq1725_ctx.bmw2);
sph_bmw512_init(&hmq1725_ctx.bmw3);
sph_skein512_init(&hmq1725_ctx.skein1);
sph_skein512_init(&hmq1725_ctx.skein2);
sph_jh512_init(&hmq1725_ctx.jh1);
sph_jh512_init(&hmq1725_ctx.jh2);
sph_keccak512_init(&hmq1725_ctx.keccak1);
sph_keccak512_init(&hmq1725_ctx.keccak2);
init_luffa( &hmq1725_ctx.luffa1, 512 );
init_luffa( &hmq1725_ctx.luffa2, 512 );
cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
sph_shavite512_init(&hmq1725_ctx.shavite1);
sph_shavite512_init(&hmq1725_ctx.shavite2);
#if defined(__aarch64__)
sph_simd512_init(&hmq1725_ctx.simd1);
sph_simd512_init(&hmq1725_ctx.simd2);
#else
init_sd( &hmq1725_ctx.simd1, 512 );
init_sd( &hmq1725_ctx.simd2, 512 );
#endif
sph_hamsi512_init(&hmq1725_ctx.hamsi1);
#if defined(__AES__)
fugue512_Init( &hmq1725_ctx.fugue1, 512 );
fugue512_Init( &hmq1725_ctx.fugue2, 512 );
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_init(&hmq1725_ctx.fugue1);
sph_fugue512_init(&hmq1725_ctx.fugue2);
sph_fugue512_context fugue;
#endif
sph_shabal512_init(&hmq1725_ctx.shabal1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
sph_sha512_init( &hmq1725_ctx.sha1 );
sph_sha512_init( &hmq1725_ctx.sha2 );
sph_haval256_5_init(&hmq1725_ctx.haval1);
sph_haval256_5_init(&hmq1725_ctx.haval2);
#if defined(__AES__)
init_echo( &hmq1725_ctx.echo1, 512 );
init_echo( &hmq1725_ctx.echo2, 512 );
init_groestl( &hmq1725_ctx.groestl1, 64 );
init_groestl( &hmq1725_ctx.groestl2, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_init( &hmq1725_ctx.groestl1 );
sph_groestl512_init( &hmq1725_ctx.groestl2 );
sph_echo512_init( &hmq1725_ctx.echo1 );
sph_echo512_init( &hmq1725_ctx.echo2 );
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
}
void hmq_bmw512_midstate( const void* input )
{
memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
sph_bmw512( &hmq_bmw_mid, input, 64 );
}
__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha;
sph_haval256_5_context haval;
};
typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
uint32_t hashA[32] __attribute__((aligned(64)));
uint32_t hashB[32] __attribute__((aligned(64)));
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
uint32_t hashA[32] __attribute__((aligned(32)));
uint32_t hashB[32] __attribute__((aligned(32)));
hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, input, 80 );
sph_bmw512_close( &ctx.bmw, hashA ); //1
memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
sph_bmw512_close(&h_ctx.bmw1, hashA); //1
sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0
sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //0
sph_whirlpool_close( &ctx.whirlpool, hashB ); //1
if ( hashB[0] & mask ) //1
{
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
(const char*)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashA, hashB, 512 );
#else
sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
sph_groestl512_close(&h_ctx.groestl1, hashA); //2
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashB, 64 ); //1
sph_groestl512_close( &ctx.groestl, hashA ); //2
#endif
}
else
{
sph_skein512 (&h_ctx.skein1, hashB, 64); //1
sph_skein512_close(&h_ctx.skein1, hashA); //2
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, hashB, 64 ); //1
sph_skein512_close( &ctx.skein, hashA ); //2
}
sph_jh512 (&h_ctx.jh1, hashA, 64); //3
sph_jh512_close(&h_ctx.jh1, hashB); //4
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashA, 64 ); //3
sph_jh512_close( &ctx.jh, hashB ); //4
sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2
sph_keccak512_close(&h_ctx.keccak1, hashA); //3
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //2
sph_keccak512_close( &ctx.keccak, hashA ); //3
if ( hashA[0] & mask ) //4
{
sph_blake512 (&h_ctx.blake1, hashA, 64); //
sph_blake512_close(&h_ctx.blake1, hashB); //5
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
}
else
{
sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4
sph_bmw512_close(&h_ctx.bmw2, hashB); //5
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashA, 64 ); //4
sph_bmw512_close( &ctx.bmw, hashB ); //5
}
update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );
if ( hashB[0] & mask ) //7
{
sph_keccak512 (&h_ctx.keccak2, hashB, 64); //
sph_keccak512_close(&h_ctx.keccak2, hashA); //8
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //
sph_keccak512_close( &ctx.keccak, hashA ); //8
}
else
{
sph_jh512 (&h_ctx.jh2, hashB, 64); //7
sph_jh512_close(&h_ctx.jh2, hashA); //8
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashB, 64 ); //7
sph_jh512_close( &ctx.jh, hashA ); //8
}
sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3
sph_shavite512_close(&h_ctx.shavite1, hashB); //4
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashA, 64 ); //3
sph_shavite512_close( &ctx.shavite, hashB ); //4
#if defined(__aarch64__)
sph_simd512 (&h_ctx.simd1, hashB, 64); //3
sph_simd512_close(&h_ctx.simd1, hashA); //4
#else
update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#endif
simd512_ctx( &ctx.simd, hashA, hashB, 64 );
if ( hashA[0] & mask ) //4
{
sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); //
sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
else
{
sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4
sph_haval256_5_close(&h_ctx.haval1, hashB); //5
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //4
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset(&hashB[8], 0, 32);
}
#if defined(__AES__)
update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashA, 512, hashB, 64 );
#else
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
sph_echo512_close(&h_ctx.echo1, hashA); //6
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashB, 64 ); //5
sph_echo512_close( &ctx.echo, hashA ); //6
#endif
sph_blake512 (&h_ctx.blake2, hashA, 64); //6
sph_blake512_close(&h_ctx.blake2, hashB); //7
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
if ( hashB[0] & mask ) //7
{
sph_shavite512 (&h_ctx.shavite2, hashB, 64); //
sph_shavite512_close(&h_ctx.shavite2, hashA); //8
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashB, 64 ); //
sph_shavite512_close( &ctx.shavite, hashA ); //8
}
else
{
update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
}
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3
sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
sph_hamsi512_close( &ctx.hamsi, hashB ); //4
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue1, hashB, 512 ); //2 ////
fugue512_Final( &h_ctx.fugue1, hashA ); //3
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 ////
sph_fugue512_close(&h_ctx.fugue1, hashA); //3
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //2 ////
sph_fugue512_close( &ctx.fugue, hashA ); //3
#endif
if ( hashA[0] & mask ) //4
{
#if defined(__AES__)
update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashB, 512, hashA, 64 );
#else
sph_echo512 (&h_ctx.echo2, hashA, 64); //
sph_echo512_close(&h_ctx.echo2, hashB); //5
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashA, 64 ); //
sph_echo512_close( &ctx.echo, hashB ); //5
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&h_ctx.simd2, hashA, 64); //6
sph_simd512_close(&h_ctx.simd2, hashB); //7
#else
update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, 64 );
sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5
sph_shabal512_close(&h_ctx.shabal1, hashA); //6
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hashB, 64 ); //5
sph_shabal512_close( &ctx.shabal, hashA ); //6
sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6
sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //6
sph_whirlpool_close( &ctx.whirlpool, hashB ); //7
if ( hashB[0] & mask ) //7
{
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue2, hashB, 512 ); //
fugue512_Final( &h_ctx.fugue2, hashA ); //8
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue2, hashB, 64); //
sph_fugue512_close(&h_ctx.fugue2, hashA); //8
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //
sph_fugue512_close( &ctx.fugue, hashA ); //8
#endif
}
else
{
sph_sha512( &h_ctx.sha1, hashB, 64 );
sph_sha512_close( &h_ctx.sha1, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
}
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
(const char*)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashB, hashA, 512 );
#else
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashA, 64 ); //3
sph_groestl512_close( &ctx.groestl, hashB ); //4
#endif
sph_sha512( &h_ctx.sha2, hashB, 64 );
sph_sha512_close( &h_ctx.sha2, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
if ( hashA[0] & mask ) //4
{
sph_haval256_5 (&h_ctx.haval2, hashA, 64); //
sph_haval256_5_close(&h_ctx.haval2, hashB); //5
memset(&hashB[8], 0, 32);
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset( &hashB[8], 0, 32 );
}
else
{
sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4
sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //4
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5
sph_bmw512_close(&h_ctx.bmw3, hashA); //6
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashB, 64 ); //5
sph_bmw512_close( &ctx.bmw, hashA ); //6
memcpy(state, hashA, 32);
memcpy( state, hashA, 32 );
}
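The rewritten hmq1725hash replaces the pre-initialized global context set (and the BMW midstate shortcut) with one on-stack union: only one primitive runs at a time, so overlapping the contexts keeps the working set to the size of the largest member rather than the sum of all of them, and each stage is initialized immediately before use. The footprint difference in miniature, with made-up sizes:
#include <stdio.h>
// Only one member is live at a time, so a union costs max(sizeof members)
// while a struct of the same members costs their sum.
union  ctx_union  { char blake[256]; char groestl[512]; char simd[1024]; };
struct ctx_struct { char blake[256]; char groestl[512]; char simd[1024]; };
int main( void )
{
   printf( "union %zu vs struct %zu bytes\n",
           sizeof( union ctx_union ), sizeof( struct ctx_struct ) );
   return 0;
}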
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
//const uint32_t Htarg = ptarget[7];
// we need big-endian data...
// for (int k = 0; k < 32; k++)
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
hmq_bmw512_midstate( endiandata );
// if (opt_debug)
// {
// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
// }
/* Too lazy to put the loop in an inline function, so it's a dirty copy-and-paste. */
/* A variable could be set instead, but it's unclear how the compiler would optimize that, and the CPU shouldn't have to reload the value into a register every time. */
if (ptarget[7]==0) {
do {
pdata[19] = ++n;

View File

@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -7,12 +7,12 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
void quark_hash(void *state, const void *input)
{
uint32_t hash[16] __attribute__((aligned(64)));
sph_blake512_context ctx_blake;
blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl ctx_groestl;
#else
sph_groestl512_context ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
sph_keccak512_context ctx_keccak;
uint32_t mask = 8;
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, input, 80 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, input, 80 );
sph_bmw512_init( &ctx_bmw );
sph_bmw512( &ctx_bmw, hash, 64 );
sph_bmw512_close( &ctx_bmw, hash );
if ( hash[0] & mask )
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
sph_skein512_close( &ctx_skein, hash );
}
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)
if ( hash[0] & mask )
{
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, hash, 64 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, hash, 64 );
}
else
{

View File

@@ -83,7 +83,7 @@ int scanhash_deep_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );

View File

@@ -236,7 +236,7 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );

View File

@@ -16,7 +16,8 @@ bool register_qubit_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubit_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -8,13 +8,9 @@
#include <stdio.h>
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
@@ -25,12 +21,8 @@ typedef struct
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
#ifdef __AES__
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
@@ -45,12 +37,7 @@ void init_qubit_ctx()
init_luffa(&qubit_ctx.luffa,512);
cubehashInit(&qubit_ctx.cubehash,512,16,32);
sph_shavite512_init(&qubit_ctx.shavite);
#if defined(__aarch64__)
sph_simd512_init( &qubit_ctx.simd );
#else
init_sd( &qubit_ctx.simd, 512 );
#endif
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_echo(&qubit_ctx.echo, 512);
#else
sph_echo512_init(&qubit_ctx.echo);
@@ -81,15 +68,9 @@ void qubit_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
#ifdef __AES__
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, 512 );
#else

View File

@@ -35,20 +35,20 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, mm128_not( y ) ), z )
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, mm128_not( z ) ) )
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
a = _mm_add_epi32( v128_rol32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
_mm_set1_epi64x( k ) ), s ), e ); \
c = mm128_rol_32( c, 10 );\
c = v128_rol32( c, 10 );\
} while (0)
#define ROUND1(a, b, c, d, e, f, s, r, k) \

View File

@@ -319,7 +319,7 @@ int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
v128_t W[16]; memcpy_128( W, data, 16 );
v128_t W[16]; v128_memcpy( W, data, 16 );
A = v128_load( state_in );
B = v128_load( state_in+1 );

View File

@@ -587,8 +587,8 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );

View File

@@ -5,11 +5,11 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA512256D_8WAY 1
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1
#define SHA512256D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA512256D_2WAY 1
#define SHA512256D_2WAY 1
#endif
#if defined(SHA512256D_8WAY)
@@ -110,14 +110,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = v256_64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
casti_m256i( vdata,9 ) = mm256_intrlv_blend_32( _mm256_set_epi32(
n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
do
{
sha512256d_4way_init( &ctx );
@@ -138,7 +137,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, four );
casti_m256i( vdata,9 ) = _mm256_add_epi32( casti_m256i( vdata,9 ), four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
@@ -180,11 +179,10 @@ int scanhash_sha512256d_2x64( struct work *work, uint32_t max_nonce,
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u64_t two = v128_64( 0x0000000200000000 );
const v128_t two = v128_64( 0x0000000200000000 );
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_add32( v128_set32( 1, 0, 0, 0 ), *noncev );
// *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
@@ -279,7 +277,7 @@ int scanhash_sha512256d( struct work *work, uint32_t max_nonce,
bool register_sha512256d_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA512256D_8WAY)
gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)

View File

@@ -34,8 +34,6 @@
#include <string.h>
#include "shabal-hash-4way.h"
//#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define DECL_STATE16 \
@@ -47,8 +45,6 @@
C8, C9, CA, CB, CC, CD, CE, CF; \
__m512i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m512i FIVE = v512_32( 5 ); \
const __m512i THREE = v512_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE16(state) do \
@@ -292,11 +288,21 @@ do { \
mm512_swap1024_512( BF, CF ); \
} while (0)
static inline __m512i v512_mult_x3( const __m512i x )
{
return _mm512_add_epi32( x, _mm512_slli_epi32( x, 1 ) );
}
static inline __m512i v512_mult_x5( const __m512i x )
{
return _mm512_add_epi32( x, _mm512_slli_epi32( x, 2 ) );
}
#define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
_mm512_mullo_epi32( mm512_xor3( xa0, xc, \
_mm512_mullo_epi32( mm512_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
v512_mult_x3( mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
} while (0)
@@ -644,8 +650,6 @@ shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
C8, C9, CA, CB, CC, CD, CE, CF; \
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m256i FIVE = v256_32( 5 ); \
const __m256i THREE = v256_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE8(state) do \
@@ -889,11 +893,21 @@ do { \
mm256_swap512_256( BF, CF ); \
} while (0)
static inline __m256i v256_mult_x3( const __m256i x )
{
return _mm256_add_epi32( x, _mm256_slli_epi32( x, 1 ) );
}
static inline __m256i v256_mult_x5( const __m256i x )
{
return _mm256_add_epi32( x, _mm256_slli_epi32( x, 2 ) );
}
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
v256_mult_x3( mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0)
@@ -1226,15 +1240,13 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // AVX2
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__SSE2__) || defined(__ARM_NEON)
#define DECL_STATE \
v128u32_t A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB; \
v128u32_t B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, BA, BB, BC, BD, BE, BF; \
v128u32_t C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \
v128u32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; \
const v128u32_t FIVE = v128_32( 5 ); \
const v128u32_t THREE = v128_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE( state ) \
@@ -1479,12 +1491,22 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
v128_swap256_128( BF, CF ); \
}
static inline v128_t v128_mult_x3( const v128_t x )
{
return v128_add32( x, v128_sl32( x, 1 ) );
}
static inline v128_t v128_mult_x5( const v128_t x )
{
return v128_add32( x, v128_sl32( x, 2 ) );
}
#define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
{ \
xa0 = v128_xor3( xm, xb1, v128_xorandnot( \
v128_mul32( v128_xor3( xa0, xc, \
v128_mul32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
v128_mult_x3( v128_xor3( xa0, xc, \
v128_mult_x5( v128_rol32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \
}
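PERM_ELT's multiplications by the constants 3 and 5 are strength-reduced to shift-and-add ( 3x = x + 2x, 5x = x + 4x ). Besides being cheap everywhere, this removes the 32-bit vector multiply (SSE4.1's _mm_mullo_epi32 on x86), which is what lets this path drop its requirement from SSE4.1 to plain SSE2 as in the #if change above. The identity in scalar form:
#include <stdint.h>
// Strength reduction used by v128_mult_x3 / v128_mult_x5 (mod 2^32):
//   3*x = x + ( x << 1 ),   5*x = x + ( x << 2 )
static inline uint32_t mult_x3( uint32_t x ) { return x + ( x << 1 ); }
static inline uint32_t mult_x5( uint32_t x ) { return x + ( x << 2 ); }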

View File

@@ -62,7 +62,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
#endif
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct {
v128_t buf[16] __attribute__ ((aligned (64)));

View File

@@ -71,7 +71,7 @@ static const uint32_t IV512[] =
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
{
const __m128i zero = _mm_setzero_si128();
const v128_t zero = v128_zero;
__m256i p0, p1, p2, p3, x;
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
__m256i *m = (__m256i*)msg;
@@ -278,7 +278,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
void shavite512_2way_init( shavite512_2way_context *ctx )
{
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
v128_t *iv = (v128_t*)IV512;
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
@@ -358,7 +358,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
count.u32[3] = ctx->count3;
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -434,7 +434,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
}
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -451,7 +451,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len )
{
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
v128_t *iv = (v128_t*)IV512;
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
@@ -524,7 +524,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
}
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

View File

@@ -303,7 +303,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
count.u32[3] = ctx->count3;
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -379,7 +379,7 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
}
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -470,7 +470,7 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
}
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

View File

@@ -159,4 +159,69 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SKEIN_2WAY)
static __thread skein512_2x64_context skein512_2x64_ctx
__attribute__ ((aligned (64)));
void skeinhash_2x64( void *state, const void *input )
{
uint64_t vhash64[8*2] __attribute__ ((aligned (32)));
uint32_t hash0[16] __attribute__ ((aligned (32)));
uint32_t hash1[16] __attribute__ ((aligned (32)));
skein512_2x64_context ctx_skein;
memcpy( &ctx_skein, &skein512_2x64_ctx, sizeof( ctx_skein ) );
skein512_2x64_final16( &ctx_skein, vhash64, input + (64*2) );
dintrlv_2x64( hash0, hash1, vhash64, 512 );
sha256_full( hash0, hash0, 64 );
sha256_full( hash1, hash1, 64 );
intrlv_2x32( state, hash0, hash1, 256 );
}
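skeinhash_2x64 works on two lanes interleaved at 64-bit granularity: the vectorized Skein stage produces vhash64 with lane 0 and lane 1 words alternating, dintrlv_2x64 splits them apart for the scalar SHA-256 stage, and intrlv_2x32 re-interleaves the 32-bit results for the scanner. A hedged reference sketch of what a 2x64 interleave does; the real intrlv_2x64()/dintrlv_2x64() helpers live elsewhere in the tree and may differ in signature:

#include <stddef.h>
#include <stdint.h>

// Reference 2-way 64-bit interleave: the destination alternates one word from
// each lane, so SIMD code can process both hashes with one 128-bit vector.
static void intrlv_2x64_ref( uint64_t *dst, const uint64_t *lane0,
                             const uint64_t *lane1, size_t bits )
{
   for ( size_t i = 0; i < bits / 64; i++ )
   {
      dst[ 2*i     ] = lane0[i];
      dst[ 2*i + 1 ] = lane1[i];
   }
}

// Reference de-interleave: recover the two independent hashes.
static void dintrlv_2x64_ref( uint64_t *lane0, uint64_t *lane1,
                              const uint64_t *src, size_t bits )
{
   for ( size_t i = 0; i < bits / 64; i++ )
   {
      lane0[i] = src[ 2*i     ];
      lane1[i] = src[ 2*i + 1 ];
   }
}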
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*2] __attribute__ ((aligned (32)));
uint32_t hash[8*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash_d7 = &(hash[7<<1]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128u32_t *noncev = (v128u32_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &skein512_2x64_ctx, vdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
skeinhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( ( hash_d7[ lane ] <= targ_d7 ) && !bench ) )
{
extr_lane_2x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
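In scanhash_skein_2x64 above, the v128_set32/v128_intrlv_blend_32 lines place each lane's nonce in the upper 32-bit half of 64-bit header word 9 of the interleaved data, so adding the constant 0x0000000200000000 to each lane advances both nonces by 2 without touching the adjacent header word. A small scalar illustration of that arithmetic; the values are made up:

#include <assert.h>
#include <stdint.h>

static void demo_nonce_step( void )
{
   uint32_t n = 0x00001000u;                              // current nonce
   uint64_t word9 = ( (uint64_t)n << 32 ) | 0x5f5e100fu;  // nonce | other header data
   word9 += 0x0000000200000000ULL;                        // same constant as the scanner
   assert( (uint32_t)( word9 >> 32 ) == n + 2 );          // nonce advanced by 2
   assert( (uint32_t) word9 == 0x5f5e100fu );             // low half untouched
}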

View File

@@ -3,16 +3,20 @@
bool register_skein_algo( algo_gate_t* gate )
{
#if defined (SKEIN_8WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_skein_8way;
gate->hash = (void*)&skeinhash_8way;
#elif defined (SKEIN_4WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#elif defined(SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#elif defined(SKEIN_2WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_2x64;
gate->hash = (void*)&skeinhash_2x64;
#else
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif
@@ -21,16 +25,15 @@ bool register_skein_algo( algo_gate_t* gate )
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined (SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SKEIN_8WAY)
gate->scanhash = (void*)&scanhash_skein2_8way;
gate->hash = (void*)&skein2hash_8way;
#elif defined (SKEIN_4WAY)
#elif defined(SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
#elif defined(SKEIN_2WAY)
gate->scanhash = (void*)&scanhash_skein2_2x64;
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif
return true;
};

View File

@@ -7,6 +7,8 @@
#define SKEIN_8WAY 1
#elif defined(__AVX2__)
#define SKEIN_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SKEIN_2WAY 1
#endif
#if defined(SKEIN_8WAY)
@@ -29,6 +31,16 @@ void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#elif defined(SKEIN_2WAY)
void skeinhash_2x64( void *output, const void *input );
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void skein2hash_2x64( void *output, const void *input );
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#else
void skeinhash( void *output, const void *input );

View File

@@ -675,11 +675,13 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
// Close
unsigned et;
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
}
memset_zero_512( buf, buf_size >> 3 );
bcount = 0;
@@ -970,11 +972,13 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
// Close
unsigned et;
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
}
memset_zero_256( buf, buf_size >> 3 );
bcount = 0;
@@ -1364,11 +1368,13 @@ skein512_2x64_full( skein512_2x64_context *sc, void *out, const void *data,
// Close
unsigned et;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
if ( ptr )
{
unsigned et;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
}
v128_memset_zero( buf, buf_size >> 3 );
bcount = 0;

View File

@@ -5,19 +5,6 @@
#if defined(SKEIN_8WAY)
static __thread skein512_8way_context skein512_8way_ctx
__attribute__ ((aligned (64)));
void skein2hash_8way( void *output, const void *input )
{
uint64_t hash[16*8] __attribute__ ((aligned (128)));
skein512_8way_context ctx;
memcpy( &ctx, &skein512_8way_ctx, sizeof( ctx ) );
skein512_8way_final16( &ctx, hash, input + (64*8) );
skein512_8way_full( &ctx, output, hash, 64 );
}
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -68,19 +55,6 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
#elif defined(SKEIN_4WAY)
static __thread skein512_4way_context skein512_4way_ctx
__attribute__ ((aligned (64)));
void skein2hash_4way( void *output, const void *input )
{
skein512_4way_context ctx;
memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) );
uint64_t hash[16*4] __attribute__ ((aligned (64)));
skein512_4way_final16( &ctx, hash, input + (64*4) );
skein512_4way_full( &ctx, output, hash, 64 );
}
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -128,4 +102,53 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SKEIN_2WAY)
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
skein512_2x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u64_t two = v128_64( 0x0000000200000000 );
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &ctx, vdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
skein512_2x64_final16( &ctx, hash, vdata + (16*2) );
skein512_2x64_full( &ctx, hash, hash, 64 );
for ( int lane = 0; lane < 2; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, two );
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -1,369 +0,0 @@
#include "Swifftx_sha3.h"
extern "C" {
#include "SWIFFTX.h"
}
#include <math.h>
#include <stdlib.h>
#include <string.h>
// The default salt value.
// This is the expansion of e (Euler's number) - the 19 digits after 2.71:
// 8281828459045235360.
// The above in base 256, from MSB to LSB:
BitSequence SWIF_saltValueChar[SWIF_HAIFA_SALT_SIZE] = {114, 238, 247, 26, 192, 28, 170, 160};
// All the IVs here below were produced from the decimal digits of e's expansion.
// The code can be found in 'ProduceRandomIV.c'.
// The initial value for 224 digest size.
const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{37, 242, 132, 2, 167, 81, 158, 237, 113, 77, 162, 60, 65, 236, 108, 246,
101, 72, 190, 109, 58, 205, 99, 6, 114, 169, 104, 114, 38, 146, 121, 142,
59, 98, 233, 84, 72, 227, 22, 199, 17, 102, 198, 145, 24, 178, 37, 1,
215, 245, 66, 120, 230, 193, 113, 253, 165, 218, 66, 134, 49, 231, 124, 204,
0};
// The initial value for 256 digest size.
const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{250, 50, 42, 40, 14, 233, 53, 48, 227, 42, 237, 187, 211, 120, 209, 234,
27, 144, 4, 61, 243, 244, 29, 247, 37, 162, 70, 11, 231, 196, 53, 6,
193, 240, 94, 126, 204, 132, 104, 46, 114, 29, 3, 104, 118, 184, 201, 3,
57, 77, 91, 101, 31, 155, 84, 199, 228, 39, 198, 42, 248, 198, 201, 178,
8};
// The initial value for 384 digest size.
const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{40, 145, 193, 100, 205, 171, 47, 76, 254, 10, 196, 41, 165, 207, 200, 79,
109, 13, 75, 201, 17, 172, 64, 162, 217, 22, 88, 39, 51, 30, 220, 151,
133, 73, 216, 233, 184, 203, 77, 0, 248, 13, 28, 199, 30, 147, 232, 242,
227, 124, 169, 174, 14, 45, 27, 87, 254, 73, 68, 136, 135, 159, 83, 152,
0};
// The initial value for 512 digest size.
const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{195, 126, 197, 167, 157, 114, 99, 126, 208, 105, 200, 90, 71, 195, 144, 138,
142, 122, 123, 116, 24, 214, 168, 173, 203, 183, 194, 210, 102, 117, 138, 42,
114, 118, 132, 33, 35, 149, 143, 163, 163, 183, 243, 175, 72, 22, 201, 255,
102, 243, 22, 187, 211, 167, 239, 76, 164, 70, 80, 182, 181, 212, 9, 185,
0};
///////////////////////////////////////////////////////////////////////////////////////////////
// NIST API implementation portion.
///////////////////////////////////////////////////////////////////////////////////////////////
int Swifftx::Init(int hashbitlen)
{
switch(hashbitlen)
{
case 224:
swifftxState.hashbitlen = hashbitlen;
// Initializes h_0 in HAIFA:
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_224, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 256:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_256, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 384:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_384, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 512:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_512, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
default:
return BAD_HASHBITLEN;
}
swifftxState.wasUpdated = false;
swifftxState.remainingSize = 0;
memset(swifftxState.remaining, 0, SWIF_HAIFA_INPUT_BLOCK_SIZE);
memset(swifftxState.numOfBitsChar, 0, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// Initialize the salt with the default value.
memcpy(swifftxState.salt, SWIF_saltValueChar, SWIF_HAIFA_SALT_SIZE);
InitializeSWIFFTX();
return SUCCESS;
}
int Swifftx::Update(const BitSequence *data, DataLength databitlen)
{
// The size of input in bytes after putting the remaining data from previous invocation.
int sizeOfInputAfterRemaining = 0;
// The input block to compression function of SWIFFTX:
BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
// Whether we handled a single block.
bool wasSingleBlockHandled = false;
swifftxState.wasUpdated = true;
// Handle an empty message as required by NIST. Since 'Final()' is oblivious to the input
// (but of course uses the output of the compression function from the previous round,
// which is called h_{i-1} in the HAIFA article), we have to do nothing here.
if (databitlen == 0)
return SUCCESS;
// If a previous input had an unaligned length, return an error
if (swifftxState.remainingSize % 8)
{
return INPUT_DATA_NOT_ALIGNED;
}
// Convert remaining size to bytes.
swifftxState.remainingSize /= 8;
// As long as we have enough data combined from (remaining + data) to fill an input block
// ROUND SETTING
while (((databitlen / 8) + swifftxState.remainingSize) >= SWIF_HAIFA_INPUT_BLOCK_SIZE)
{
// Fill the input block with data:
// 1. The output of the previous block:
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2. The input part of the block:
// 2a. The remaining data from the previous 'Update()' call:
if (swifftxState.remainingSize)
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining,
swifftxState.remainingSize);
// 2b. The input data that we have place for after the 'remaining':
sizeOfInputAfterRemaining = SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE
- ((int) swifftxState.remainingSize) - SWIF_HAIFA_NUM_OF_BITS_SIZE
- SWIF_HAIFA_SALT_SIZE;
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize,
data, sizeOfInputAfterRemaining);
// 3. The #bits part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize
+ sizeOfInputAfterRemaining,
swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize
+ sizeOfInputAfterRemaining + SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt, SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, false);
// Update the #bits field with SWIF_HAIFA_INPUT_BLOCK_SIZE.
AddToCurrInBase256(swifftxState.numOfBitsChar, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8);
wasSingleBlockHandled = true;
data += sizeOfInputAfterRemaining;
databitlen -= (sizeOfInputAfterRemaining * 8);
swifftxState.remainingSize = 0;
}
// Update the swifftxState.remaining and swifftxState.remainingSize.
// remainingSize will be in bits after exiting 'Update()'.
if (wasSingleBlockHandled)
{
swifftxState.remainingSize = (unsigned int) databitlen; // now remaining size is in bits.
if (swifftxState.remainingSize)
memcpy(swifftxState.remaining, data, (swifftxState.remainingSize + 7) / 8);
}
else
{
memcpy(swifftxState.remaining + swifftxState.remainingSize, data,
(size_t) (databitlen + 7) / 8);
swifftxState.remainingSize = (swifftxState.remainingSize * 8) + (unsigned short) databitlen;
}
return SUCCESS;
}
int Swifftx::Final(BitSequence *hashval)
{
int i;
// Whether to add one last block. True if the padding appended to the last block overflows
// the block size.
bool toAddFinalBlock = false;
bool toPutOneInFinalBlock = false;
unsigned short oneShift = 0;
// The size of the last input block before the zero padding. We add 1 here because we
// include the final '1' bit in the calculation, and 7 because we round the length up to bytes.
unsigned short sizeOfLastInputBlock = (swifftxState.remainingSize + 1 + 7) / 8;
// The number of bytes of zero in the padding part.
// The padding contains:
// 1. A single 1 bit.
// 2. As many zeroes as needed.
// 3. The message length in bits. Occupies SWIF_HAIFA_NUM_OF_BITS_SIZE bytes.
// 4. The digest size. Maximum is 512, so we need 2 bytes.
// If the total number achieved is negative, add an additional block, as HAIFA specifies.
short numOfZeroBytesInPadding = (short) SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE
- sizeOfLastInputBlock - (2 * SWIF_HAIFA_NUM_OF_BITS_SIZE) - 2
- SWIF_HAIFA_SALT_SIZE;
// The input block to compression function of SWIFFTX:
BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
// The message length in base 256.
BitSequence messageLengthChar[SWIF_HAIFA_NUM_OF_BITS_SIZE] = {0};
// The digest size used for padding:
unsigned char digestSizeLSB = swifftxState.hashbitlen % 256;
unsigned char digestSizeMSB = (swifftxState.hashbitlen - digestSizeLSB) / 256;
if (numOfZeroBytesInPadding < 1)
toAddFinalBlock = true;
// Fill the input block with data:
// 1. The output of the previous block:
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2a. The input part of the block, which is the remaining data from the previous 'Update()'
// call, if it exists, and an extra '1' bit (maybe all we have is this extra 1):
// Add the last 1 in big-endian convention ...
if (swifftxState.remainingSize % 8 == 0)
{
swifftxState.remaining[sizeOfLastInputBlock - 1] = 0x80;
}
else
{
swifftxState.remaining[sizeOfLastInputBlock - 1] |= (1 << (7 - (swifftxState.remainingSize % 8)));
}
if (sizeOfLastInputBlock)
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining,
sizeOfLastInputBlock);
// Compute the message length in base 256:
for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
messageLengthChar[i] = swifftxState.numOfBitsChar[i];
if (sizeOfLastInputBlock)
AddToCurrInBase256(messageLengthChar, sizeOfLastInputBlock * 8);
if (!toAddFinalBlock)
{
// 2b. Put the zeroes:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
0, numOfZeroBytesInPadding);
// 2c. Pad the message length:
for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + i] = messageLengthChar[i];
// 2d. Pad the digest size:
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE] = digestSizeMSB;
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE + 1] = digestSizeLSB;
}
else
{
// 2b. Put the zeroes, if at all:
if ((SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock) > 0)
{
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
0, SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock);
}
}
// 3. The #bits part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE,
swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
+ SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt,
SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, !toAddFinalBlock);
// If we have to add one more block, it is now:
if (toAddFinalBlock)
{
// 1. The previous output block, as usual.
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2a. Instead of the input, zeroes:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE , 0,
SWIF_HAIFA_INPUT_BLOCK_SIZE - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2);
// 2b. Instead of the input, the message length:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
- SWIF_HAIFA_NUM_OF_BITS_SIZE - 2,
messageLengthChar,
SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 2c. Instead of the input, the digest size:
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 2] = digestSizeMSB;
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 1] = digestSizeLSB;
// 3. The #bits part of the block, which is zero in case of additional block:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE,
0,
SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
+ SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt,
SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, true);
}
// Finally, copy the result into 'hashval'. In case the digest size is not 512 bits, copy only the
// first hashbitlen bits:
for (i = 0; i < (swifftxState.hashbitlen / 8); ++i)
hashval[i] = swifftxState.currOutputBlock[i];
return SUCCESS;
}
int Swifftx::Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
BitSequence *hashval)
{
int result;
//hashState state;
// The pointer to the current place in the input we take into the compression function.
DataLength currInputIndex = 0;
result = Swifftx::Init(hashbitlen);
if (result != SUCCESS)
return result;
for ( ; (databitlen / 8) > SWIF_HAIFA_INPUT_BLOCK_SIZE;
currInputIndex += SWIF_HAIFA_INPUT_BLOCK_SIZE, databitlen -= (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8))
{
result = Swifftx::Update(data + currInputIndex, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8);
if (result != SUCCESS)
return result;
}
// The length of the last block may be shorter than (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8)
result = Swifftx::Update(data + currInputIndex, databitlen);
if (result != SUCCESS)
{
return result;
}
return Swifftx::Final(hashval);
}
///////////////////////////////////////////////////////////////////////////////////////////////
// Helper function implementation portion.
///////////////////////////////////////////////////////////////////////////////////////////////
void Swifftx::AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE],
unsigned short toAdd)
{
unsigned char remainder = 0;
short i;
BitSequence currValueInBase256[8] = {0};
unsigned short currIndex = 7;
unsigned short temp = 0;
do
{
remainder = toAdd % 256;
currValueInBase256[currIndex--] = remainder;
toAdd -= remainder;
toAdd /= 256;
}
while(toAdd != 0);
for (i = 7; i >= 0; --i)
{
temp = value[i] + currValueInBase256[i];
if (temp > 255)
{
value[i] = temp % 256;
currValueInBase256[i - 1]++;
}
else
value[i] = (unsigned char) temp;
}
}

View File

@@ -1,79 +0,0 @@
#ifndef SWIFFTX_SHA3_H
#define SWIFFTX_SHA3_H
#include "sha3_interface.h"
#include "stdbool.h"
#include "stdint.h"
class Swifftx : public SHA3 {
#define SWIFFTX_INPUT_BLOCK_SIZE 256
#define SWIFFTX_OUTPUT_BLOCK_SIZE 65
#define SWIF_HAIFA_SALT_SIZE 8
#define SWIF_HAIFA_NUM_OF_BITS_SIZE 8
#define SWIF_HAIFA_INPUT_BLOCK_SIZE (SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE \
- SWIF_HAIFA_NUM_OF_BITS_SIZE - SWIF_HAIFA_SALT_SIZE)
typedef unsigned char BitSequence;
//const DataLength SWIF_SALT_VALUE;
#define SWIF_HAIFA_IV 0
/*const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE];*/
typedef enum
{
SUCCESS = 0,
FAIL = 1,
BAD_HASHBITLEN = 2,
BAD_SALT_SIZE = 3,
SET_SALT_VALUE_FAILED = 4,
INPUT_DATA_NOT_ALIGNED = 5
} HashReturn;
typedef struct hashState {
unsigned short hashbitlen;
// The data remaining after the most recent call to 'Update()'.
BitSequence remaining[SWIF_HAIFA_INPUT_BLOCK_SIZE + 1];
// The size of the remaining data in bits.
// Is 0 in case there is no remaining data at all.
unsigned int remainingSize;
// The current output of the compression function. At the end it will contain the final digest
// (which may need to be truncated, depending on hashbitlen).
BitSequence currOutputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE];
// The value of '#bits hashed so far' field in HAIFA, in base 256.
BitSequence numOfBitsChar[SWIF_HAIFA_NUM_OF_BITS_SIZE];
// The salt value currently in use:
BitSequence salt[SWIF_HAIFA_SALT_SIZE];
// Indicates whether a single 'Update()' occurred.
// After a call to 'Update()' the key and the salt values cannot be changed.
bool wasUpdated;
} hashState;
private:
int swifftxNumRounds;
hashState swifftxState;
public:
int Init(int hashbitlen);
int Update(const BitSequence *data, DataLength databitlen);
int Final(BitSequence *hashval);
int Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
BitSequence *hashval);
private:
static void AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], unsigned short toAdd);
};
#endif

View File

@@ -1,21 +0,0 @@
#pragma once
#include <cstdint>
namespace hash {
using BitSequence = unsigned char;
using DataLength = unsigned long long;
struct hash_interface {
virtual ~hash_interface() = default;
virtual int Init(int hash_bitsize) = 0;
virtual int Update(const BitSequence *data, DataLength data_bitsize) = 0;
virtual int Final(BitSequence *hash) = 0;
virtual int
Hash(int hash_bitsize, const BitSequence *data, DataLength data_bitsize, BitSequence *hash) = 0;
};
} // namespace hash

View File

@@ -1,14 +0,0 @@
#pragma once
#include <cstdint>
//#include <streams/hash/hash_interface.h>
#include "hash_interface.h"
namespace sha3 {
using BitSequence = hash::BitSequence;
using DataLength = hash::DataLength;
struct sha3_interface : hash::hash_interface {};
} // namespace sha3

View File

@@ -191,7 +191,7 @@ static void rotate_indexes( uint32_t *p )
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
#define MULXOR \
casti_v128( hash, 0 ) = v128_mul32( v128_xor( \
@@ -251,7 +251,7 @@ void verthash_hash( const void *blob_bytes, const size_t blob_size,
/ VH_BYTE_ALIGNMENT ) + 1;
#if defined (__AVX2__)
const __m256i k = _mm256_set1_epi32( 0x1000193 );
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
const v128u32_t k = v128_32( 0x1000193 );
#endif

View File

@@ -129,7 +129,7 @@ bool register_verthash_algo( algo_gate_t* gate )
{
opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT | NEON_OPT;
const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file;

View File

@@ -506,4 +506,156 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X11GOST_2WAY)
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
union _x11gost_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
jh512_2x64_context jh;
keccak512_2x64_context keccak;
skein512_2x64_context skein;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_gost512_context gost;
};
typedef union _x11gost_context_overlay x11gost_context_overlay;
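The overlay union above works because the x11gost stages run strictly one after another, so all the per-algorithm contexts can share the same storage; the union occupies only as much space as its largest member rather than the sum of them all. A small standalone illustration of that property; the struct names and sizes are invented:

#include <stdio.h>

struct ctx_small { unsigned char state[200]; };
struct ctx_large { unsigned char state[416]; };

union stage_overlay
{
   struct ctx_small small;
   struct ctx_large large;
};

int main( void )
{
   // Prints 200 416 416: the union is only as big as its largest member.
   printf( "%zu %zu %zu\n", sizeof( struct ctx_small ),
           sizeof( struct ctx_large ), sizeof( union stage_overlay ) );
   return 0;
}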
int x11gost_2x64_hash( void *state, const void *input, int thr_id )
{
uint8_t vhash[80*2] __attribute__((aligned(64)));
uint8_t hash0[64] __attribute__((aligned(64)));
uint8_t hash1[64] __attribute__((aligned(64)));
x11gost_context_overlay ctx;
intrlv_2x64( vhash, input, input+80, 640 );
blake512_2x64_full( &ctx.blake, vhash, vhash, 80 );
bmw512_2x64_init( &ctx.bmw );
bmw512_2x64_update( &ctx.bmw, vhash, 64 );
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, hash0, 512 );
groestl512_full( &ctx.groestl, hash1, hash1, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash0, 64 );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash1, 64 );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
intrlv_2x64( vhash, hash0, hash1, 512 );
skein512_2x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_2x64_ctx( &ctx.jh, vhash, vhash, 64 );
keccak512_2x64_ctx( &ctx.keccak, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
cubehash_full( &ctx.cube, hash0, 512, hash0, 64 );
cubehash_full( &ctx.cube, hash1, 512, hash1, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
echo_full( &ctx.echo, hash1, 512, hash1, 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash0, 64 );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash1, 64 );
sph_echo512_close( &ctx.echo, hash1 );
#endif
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
return 1;
}
int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*2] __attribute__((aligned(64)));
uint32_t edata[20*2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_80( edata, pdata );
memcpy( edata+20, edata, 80 );
do
{
edata[19] = n;
edata[39] = n+1;
if ( likely( x11gost_2x64_hash( hash, edata, thr_id ) ) )
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash, mythr );
}
if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+1 );
submit_solution( work, hash+8, mythr );
}
}
n += 2;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}
#endif

View File

@@ -2,20 +2,24 @@
bool register_x11gost_algo( algo_gate_t* gate )
{
#if defined (X11GOST_8WAY)
#if defined(X11GOST_8WAY)
init_x11gost_8way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_8way;
gate->hash = (void*)&x11gost_8way_hash;
#elif defined (X11GOST_4WAY)
#elif defined(X11GOST_4WAY)
init_x11gost_4way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_4way;
gate->hash = (void*)&x11gost_4way_hash;
#elif defined(X11GOST_2WAY)
gate->scanhash = (void*)&scanhash_x11gost_2x64;
gate->hash = (void*)&x11gost_2x64_hash;
#else
init_x11gost_ctx();
gate->scanhash = (void*)&scanhash_x11gost;
gate->hash = (void*)&x11gost_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -8,6 +8,8 @@
#define X11GOST_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X11GOST_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X11GOST_2WAY 1
#endif
bool register_x11gost_algo( algo_gate_t* gate );
@@ -26,6 +28,12 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x11gost_4way_ctx();
#elif defined(X11GOST_2WAY)
int x11gost_2x64_hash( void *state, const void *input, int thr_id );
int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x11gost_hash( void *state, const void *input );

View File

@@ -1,6 +1,8 @@
#include "x11gost-gate.h"
#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY)
// No longer used; it was not working when last used.
#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY) && !defined(X11GOST_2WAY)
#include <stdlib.h>
#include <stdint.h>

View File

@@ -155,13 +155,13 @@ void skunk_4way_hash( void *output, const void *input )
skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 );
cubehashUpdateDigest( &ctx.cube, hash0, hash0, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
cubehashUpdateDigest( &ctx.cube, hash1, hash1, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
cubehashUpdateDigest( &ctx.cube, hash2, hash2, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
cubehashUpdateDigest( &ctx.cube, hash3, hash3, 64 );
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
fugue512_full( &ctx.fugue, hash1, hash1, 64 );

View File

@@ -23,13 +23,12 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
*sptr = '\0';
}
static __thread x16r_context_overlay hex_ctx;
int hex_hash( void* output, const void* input, int thrid )
{
uint32_t _ALIGN(128) hash[16];
x16r_context_overlay ctx;
memcpy( &ctx, &hex_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
void *in = (void*) input;
int size = 80;
@@ -52,7 +51,7 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
groestl512_full( &ctx.groestl, hash, in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
@@ -87,7 +86,7 @@ int hex_hash( void* output, const void* input, int thrid )
case LUFFA:
if ( i == 0 )
{
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
}
else
{
@@ -97,7 +96,7 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case CUBEHASH:
if ( i == 0 )
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
else
{
cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -108,26 +107,15 @@ int hex_hash( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
simd512_ctx( &ctx.simd, hash, in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,
(const BitSequence *)in, size );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash, 512, in, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
@@ -216,32 +204,32 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
switch ( algo )
{
case JH:
sph_jh512_init( &hex_ctx.jh );
sph_jh512( &hex_ctx.jh, edata, 64 );
sph_jh512_init( &x16r_ref_ctx.jh );
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
break;
case SKEIN:
sph_skein512_init( &hex_ctx.skein );
sph_skein512( &hex_ctx.skein, edata, 64 );
sph_skein512_init( &x16r_ref_ctx.skein );
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
break;
case LUFFA:
init_luffa( &hex_ctx.luffa, 512 );
update_luffa( &hex_ctx.luffa, edata, 64 );
init_luffa( &x16r_ref_ctx.luffa, 512 );
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
break;
case CUBEHASH:
cubehashInit( &hex_ctx.cube, 512, 16, 32 );
cubehashUpdate( &hex_ctx.cube, edata, 64 );
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
break;
case HAMSI:
sph_hamsi512_init( &hex_ctx.hamsi );
sph_hamsi512( &hex_ctx.hamsi, edata, 64 );
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 64 );
break;
case SHABAL:
sph_shabal512_init( &hex_ctx.shabal );
sph_shabal512( &hex_ctx.shabal, edata, 64 );
sph_shabal512_init( &x16r_ref_ctx.shabal );
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &hex_ctx.whirlpool );
sph_whirlpool( &hex_ctx.whirlpool, edata, 64 );
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
break;
}

View File

@@ -11,29 +11,29 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/yespower/yespower.h"
//#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
//#else
#else
#include "algo/echo/sph_echo.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
//#endif
#if defined(__AES__)
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/simd/nist.h"
// Config
#define MINOTAUR_ALGO_COUNT 16
@@ -47,14 +47,17 @@ typedef struct TortureGarden TortureGarden;
// Graph of hash algos plus SPH contexts
struct TortureGarden
{
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_echo512_context echo;
sph_groestl512_context groestl;
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -67,11 +70,7 @@ struct TortureGarden
cubehashParam cube;
shavite512_context shavite;
hashState_luffa luffa;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -93,9 +92,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
switch ( algo )
{
case 0:
blake512_init( &garden->blake );
blake512_update( &garden->blake, input, 64 );
blake512_close( &garden->blake, hash );
blake512_full( &garden->blake, hash, input, 64 );
break;
case 1:
sph_bmw512_init( &garden->bmw );
@@ -107,7 +104,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
cubehashUpdateDigest( &garden->cube, hash, input, 64 );
break;
case 3:
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &garden->echo, hash, 512, input, 64 );
#else
sph_echo512_init( &garden->echo );
@@ -116,14 +113,14 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
#endif
break;
case 4:
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &garden->fugue, hash, input, 64 );
#else
sph_fugue512_full( &garden->fugue, hash, input, 64 );
#endif
break;
case 5:
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &garden->groestl, hash, input, 512 );
#else
sph_groestl512_init( &garden->groestl) ;
@@ -165,13 +162,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
sph_shavite512_close( &garden->shavite, hash );
break;
case 13:
#if defined(__aarch64__)
sph_simd512_init( &garden->simd );
sph_simd512( &garden->simd, input, 64);
sph_simd512_close( &garden->simd, hash );
#else
simd_full( &garden->simd, (BitSequence *)hash, input, 512 );
#endif
simd512_ctx( &garden->simd, hash, input, 64 );
break;
case 14:
sph_skein512_init( &garden->skein );

View File

@@ -19,12 +19,12 @@
// Perform a midstate prehash for hash functions with a block size <= 72 bytes,
// or 76 bytes for hash functions that operate on 32-bit data.
void x16r_8way_prehash( void *vdata, void *pdata )
void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
{
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
const char elem = x16r_hash_order[0];
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -110,7 +110,8 @@ void x16r_8way_prehash( void *vdata, void *pdata )
// Called by wrapper hash function to optionally continue hashing and
// convert to final hash.
int x16r_8way_hash_generic( void* output, const void* input, int thrid )
int x16r_8way_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t vhash[20*8] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (16)));
@@ -136,9 +137,9 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
input, 640 );
for ( int i = 0; i < 16; i++ )
for ( int i = 0; i < func_count; i++ )
{
const char elem = x16r_hash_order[i];
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -474,7 +475,8 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
int x16r_8way_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*8] __attribute__ ((aligned (128)));
if ( !x16r_8way_hash_generic( hash, input, thrid ) )
if ( !x16r_8way_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
@@ -495,7 +497,6 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -508,21 +509,18 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_8way_prehash( vdata, pdata );
x16r_8way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
@@ -546,12 +544,12 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
#elif defined (X16R_4WAY)
void x16r_4way_prehash( void *vdata, void *pdata )
void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
{
uint32_t vdata2[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
const char elem = x16r_hash_order[0];
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -627,7 +625,8 @@ void x16r_4way_prehash( void *vdata, void *pdata )
}
}
int x16r_4way_hash_generic( void* output, const void* input, int thrid )
int x16r_4way_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t vhash[20*4] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
@@ -644,9 +643,9 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
for ( int i = 0; i < 16; i++ )
for ( int i = 0; i < func_count; i++ )
{
const char elem = x16r_hash_order[i];
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -908,7 +907,8 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
int x16r_4way_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*4] __attribute__ ((aligned (64)));
if ( !x16r_4way_hash_generic( hash, input, thrid ) )
if ( !x16r_4way_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
@@ -924,7 +924,6 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -937,20 +936,18 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_4way_prehash( vdata, pdata );
x16r_4way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
@@ -971,4 +968,404 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_2WAY)
void x16r_2x64_prehash( void *vdata, void *pdata, const char *hash_order )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
v128_bswap32_intrlv80_2x64( vdata, pdata );
jh512_2x64_init( &x16r_ctx.jh );
jh512_2x64_update( &x16r_ctx.jh, vdata, 64 );
break;
case KECCAK:
v128_bswap32_intrlv80_2x64( vdata, pdata );
keccak512_2x64_init( &x16r_ctx.keccak );
keccak512_2x64_update( &x16r_ctx.keccak, vdata, 72 );
break;
case SKEIN:
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &x16r_ctx.skein, vdata );
break;
case LUFFA:
{
v128_bswap32_80( edata, pdata );
init_luffa( &x16r_ctx.luffa, 512 );
update_luffa( &x16r_ctx.luffa, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
}
break;
case CUBEHASH:
{
v128_bswap32_80( edata, pdata );
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ctx.cube, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
}
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
v128_bswap32_intrlv80_2x64( vdata, pdata );
hamsi512_2x64_init( &x16r_ctx.hamsi );
hamsi512_2x64_update( &x16r_ctx.hamsi, vdata, 72 );
#else
v128_bswap32_80( edata, pdata );
sph_hamsi512_init( &x16r_ctx.hamsi );
sph_hamsi512( &x16r_ctx.hamsi, edata, 72 );
intrlv_2x64( vdata, edata, edata, 640 );
#endif
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_init( &x16r_ctx.fugue );
fugue512_update( &x16r_ctx.fugue, edata, 76 );
#else
sph_fugue512_init( &x16r_ctx.fugue );
sph_fugue512( &x16r_ctx.fugue, edata, 76 );
#endif
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_80( edata, pdata );
sph_shabal512_init( &x16r_ctx.shabal );
sph_shabal512( &x16r_ctx.shabal, edata, 64);
intrlv_2x64( vdata, edata, edata, 640 );
break;
case WHIRLPOOL:
v128_bswap32_80( edata, pdata );
sph_whirlpool_init( &x16r_ctx.whirlpool );
sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
default:
v128_bswap32_intrlv80_2x64( vdata, pdata );
}
}
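x16r_2x64_prehash above exploits the fact that only the last few bytes of the 80-byte block header change while scanning nonces: for functions whose block size allows it, the constant prefix is absorbed once into x16r_ctx and each nonce attempt only processes the short tail. A hedged sketch of the midstate idea using a toy hash; toy_absorb and its state are invented stand-ins, not a real primitive:

#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t acc; } toy_ctx;

// Invented stand-in for an incremental hash update.
static void toy_absorb( toy_ctx *c, const uint8_t *p, size_t len )
{
   for ( size_t i = 0; i < len; i++ ) c->acc = c->acc * 31 + p[i];
}

// Done once per block template: absorb the 64 constant header bytes.
static toy_ctx make_midstate( const uint8_t header[80] )
{
   toy_ctx c = { 0 };
   toy_absorb( &c, header, 64 );
   return c;
}

// Done per nonce attempt: copy the midstate and hash only the 16-byte tail.
static uint64_t hash_attempt( const toy_ctx *midstate, const uint8_t header[80] )
{
   toy_ctx c = *midstate;
   toy_absorb( &c, header + 64, 16 );
   return c.acc;
}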
int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
x16r_2x64_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
int size = 80;
dintrlv_2x64( hash0, hash1, input, 640 );
for ( int i = 0; i < func_count; i++ )
{
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
if ( i == 0 )
blake512_2x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case BMW:
bmw512_2x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_2x64_update( &ctx.bmw, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
bmw512_2x64_update( &ctx.bmw, vhash, size );
}
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case GROESTL:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in0, size );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in1, size );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
break;
case JH:
if ( i == 0 )
jh512_2x64_update( &ctx.jh, input + (64*2), 16 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
jh512_2x64_init( &ctx.jh );
jh512_2x64_update( &ctx.jh, vhash, size );
}
jh512_2x64_close( &ctx.jh, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case KECCAK:
if ( i == 0 )
keccak512_2x64_update( &ctx.keccak, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
keccak512_2x64_init( &ctx.keccak );
keccak512_2x64_update( &ctx.keccak, vhash, size );
}
keccak512_2x64_close( &ctx.keccak, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case LUFFA:
if ( i == 0 )
{
update_and_final_luffa( &ctx.luffa, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
update_and_final_luffa( &ctx.luffa, hash1, in1 + 64, 16 );
}
else
{
luffa_full( &ctx.luffa, hash0, 512, hash0, size );
luffa_full( &ctx.luffa, hash1, 512, hash1, size );
}
break;
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
}
else
{
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
}
break;
case SHAVITE:
shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size );
break;
case SIMD:
simd512_ctx( &ctx.simd, hash0, in0, size );
simd512_ctx( &ctx.simd, hash1, in1, size );
break;
case ECHO:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, in0, size );
echo_full( &ctx.echo, hash1, 512, in1, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in0, size );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in1, size );
sph_echo512_close( &ctx.echo, hash1 );
#endif
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
if ( i == 0 )
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, hash0, hash1, size<<3 );
hamsi512_2x64_init( &ctx.hamsi );
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_2x64_close( &ctx.hamsi, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
if ( i == 0 )
{
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
else
{
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash0, size );
sph_hamsi512_close( &ctx.hamsi, hash0 );
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash1, size );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
#endif
break;
case FUGUE:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#else
if ( i == 0 )
{
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash1 );
}
else
{
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#endif
break;
case SHABAL:
if ( i == 0 )
{
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash1 );
}
else
{
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash0, size );
sph_shabal512_close( &ctx.shabal, hash0 );
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash1, size );
sph_shabal512_close( &ctx.shabal, hash1 );
}
break;
case WHIRLPOOL:
if ( i == 0 )
{
sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
}
else
{
sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, size );
sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, size );
}
break;
case SHA_512:
sha512_2x64_init( &ctx.sha512 );
if ( i == 0 )
sha512_2x64_update( &ctx.sha512, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
sha512_2x64_init( &ctx.sha512 );
sha512_2x64_update( &ctx.sha512, vhash, size );
}
sha512_2x64_close( &ctx.sha512, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
}
if ( work_restart[thrid].restart ) return 0;
size = 64;
}
memcpy( output, hash0, 64 );
memcpy( output+64, hash1, 64 );
return 1;
}
int x16r_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*2] __attribute__ ((aligned (64)));
if ( !x16r_2x64_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
return 1;
}
int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128_t *noncev = (v128_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -5,18 +5,21 @@ __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };
void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL;
#if defined (X16R_8WAY)
#if defined(X16R_8WAY)
__thread x16r_8way_context_overlay x16r_ctx;
#elif defined (X16R_4WAY)
#elif defined(X16R_4WAY)
__thread x16r_4way_context_overlay x16r_ctx;
#elif defined(X16R_2WAY)
__thread x16r_2x64_context_overlay x16r_ctx;
#endif
__thread x16r_context_overlay x16_ctx;
__thread x16r_context_overlay x16r_ref_ctx;
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
{
@@ -52,17 +55,21 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
bool register_x16r_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16R_2WAY)
gate->scanhash = (void*)&scanhash_x16r_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -70,17 +77,21 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16rv2_algo( algo_gate_t* gate )
{
#if defined (X16RV2_8WAY)
#if defined(X16RV2_8WAY)
gate->scanhash = (void*)&scanhash_x16rv2_8way;
gate->hash = (void*)&x16rv2_8way_hash;
#elif defined (X16RV2_4WAY)
#elif defined(X16RV2_4WAY)
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->hash = (void*)&x16rv2_4way_hash;
#elif defined(X16RV2_2WAY)
gate->scanhash = (void*)&scanhash_x16rv2_2x64;
gate->hash = (void*)&x16rv2_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rv2;
gate->hash = (void*)&x16rv2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -88,17 +99,21 @@ bool register_x16rv2_algo( algo_gate_t* gate )
bool register_x16s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16R_2WAY)
gate->scanhash = (void*)&scanhash_x16r_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -108,7 +123,6 @@ bool register_x16s_algo( algo_gate_t* gate )
//
// X16RT
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
{
int32_t maskedTime = timeStamp & 0xffffff80;
@@ -221,34 +235,42 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16RT_2WAY)
gate->scanhash = (void*)&scanhash_x16rt_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 256.0;
return true;
};
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16RT_2WAY)
gate->scanhash = (void*)&scanhash_x16rt_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
gate->build_extraheader = (void*)&veil_build_extraheader;
opt_target_factor = 256.0;
return true;
@@ -262,7 +284,7 @@ bool register_hex_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_hex;
gate->hash = (void*)&x16r_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 128.0;
return true;
@@ -274,20 +296,25 @@ bool register_hex_algo( algo_gate_t* gate )
bool register_x21s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X21S_8WAY)
gate->scanhash = (void*)&scanhash_x21s_8way;
gate->hash = (void*)&x21s_8way_hash;
gate->miner_thread_init = (void*)&x21s_8way_thread_init;
#elif defined (X16R_4WAY)
#elif defined(X21S_4WAY)
gate->scanhash = (void*)&scanhash_x21s_4way;
gate->hash = (void*)&x21s_4way_hash;
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
#elif defined(X21S_2WAY)
gate->scanhash = (void*)&scanhash_x21s_2x64;
gate->hash = (void*)&x21s_2x64_hash;
gate->miner_thread_init = (void*)&x21s_2x64_thread_init;
#else
gate->scanhash = (void*)&scanhash_x21s;
gate->hash = (void*)&x21s_hash;
gate->miner_thread_init = (void*)&x21s_thread_init;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;

View File

@@ -7,13 +7,15 @@
#include <unistd.h>
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
@@ -21,13 +23,13 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha512-hash.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/fugue/fugue-aesni.h"
#endif
#if defined (__AVX2__)
//#if defined (__AVX2__)
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
@@ -39,7 +41,7 @@
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#endif
//#endif
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
@@ -48,28 +50,41 @@
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
// X16R, X16S
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16R_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#define X16RV2_8WAY 1
#define X16RT_8WAY 1
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RV2_4WAY 1
#define X16RT_4WAY 1
#define X21S_4WAY 1
#define X16R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16RV2_2WAY 1
#endif
// X16RT, VEIL
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16RT_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RT_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16RT_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X21S_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X21S_2WAY 1
#endif
enum x16r_Algo {
BLAKE = 0,
BMW,
@@ -134,18 +149,23 @@ union _x16r_8way_context_overlay
hashState_echo echo;
#endif
} __attribute__ ((aligned (64)));
#define _x16r_8x64_context_overlay _x16r_8way_context_overlay
typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
#define x16r_8x64_context_overlay x16r_8way_context_overlay
extern __thread x16r_8way_context_overlay x16r_ctx;
void x16r_8way_prehash( void *, void * );
int x16r_8way_hash_generic( void *, const void *, int );
void x16r_8way_prehash( void *, void *, const char * );
int x16r_8way_hash_generic( void *, const void *, int, const char*, const int );
int x16r_8way_hash( void *, const void *, int );
int scanhash_x16r_8way( struct work *, uint32_t ,
uint64_t *, struct thr_info * );
extern __thread x16r_8way_context_overlay x16r_ctx;
#define x16r_8x64_prehash x16r_8way_prehash
#define x16r_8x64_hash_generic x16r_8way_hash_generic
#define x16r_8x64_hash x16r_8way_hash
#define scanhash_x16r_8x64 scanhash_x16r_8way
#elif defined(X16R_4WAY)
@@ -167,7 +187,6 @@ union _x16r_4way_context_overlay
keccak512_4way_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
hashState_luffa luffa1;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
@@ -175,46 +194,102 @@ union _x16r_4way_context_overlay
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
} __attribute__ ((aligned (64)));
#define _x16r_4x64_context_overlay _x16r_4way_context_overlay
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
#define x16r_4x64_context_overlay x16r_4way_context_overlay
extern __thread x16r_4way_context_overlay x16r_ctx;
void x16r_4way_prehash( void *, void * );
int x16r_4way_hash_generic( void *, const void *, int );
void x16r_4way_prehash( void *, void *, const char * );
int x16r_4way_hash_generic( void *, const void *, int, const char*, const int );
int x16r_4way_hash( void *, const void *, int );
int scanhash_x16r_4way( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_4way_context_overlay x16r_ctx;
#define x16r_4x64_prehash x16r_4way_prehash
#define x16r_4x64_hash_generic x16r_4way_hash_generic
#define x16r_4x64_hash x16r_4way_hash
#define scanhash_x16r_4x64 scanhash_x16r_4way
#elif defined(X16R_2WAY)
union _x16r_2x64_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
skein512_2x64_context skein;
jh512_2x64_context jh;
keccak512_2x64_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sha512_2x64_context sha512;
} __attribute__ ((aligned (64)));
typedef union _x16r_2x64_context_overlay x16r_2x64_context_overlay;
void x16r_2x64_prehash( void *, void *, const char * );
int x16r_2x64_hash_generic( void *, const void *, int, const char*, const int );
int x16r_2x64_hash( void *, const void *, int );
int scanhash_x16r_2x64( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_2x64_context_overlay x16r_ctx;
#endif
// need a reference, add hooks for SSE2.
// needed for hex
union _x16r_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
hashState_sd simd;
sph_echo512_context echo;
#endif
sph_hamsi512_context hamsi;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;
@@ -222,10 +297,10 @@ union _x16r_context_overlay
typedef union _x16r_context_overlay x16r_context_overlay;
extern __thread x16r_context_overlay x16_ctx;
extern __thread x16r_context_overlay x16r_ref_ctx;
void x16r_prehash( void *, void * );
int x16r_hash_generic( void *, const void *, int );
void x16r_prehash( void *, void *, const char * );
int x16r_hash_generic( void *, const void *, int, const char*, const int );
int x16r_hash( void *, const void *, int );
int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * );
@@ -242,6 +317,12 @@ int x16rv2_4way_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RV2_2WAY)
int x16rv2_2x64_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
int x16rv2_hash( void *state, const void *input, int thr_id );
@@ -251,18 +332,24 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
#endif
// x16rt, veil
#if defined(X16R_8WAY)
#if defined(X16RT_8WAY)
//void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16R_4WAY)
#elif defined(X16RT_4WAY)
//void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RT_2WAY)
//void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
//void x16rt_hash( void *state, const void *input );
@@ -272,20 +359,27 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
#endif
// x21s
#if defined(X16R_8WAY)
#if defined(X21S_8WAY)
int x21s_8way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_8way_thread_init();
#elif defined(X16R_4WAY)
#elif defined(X21S_4WAY)
int x21s_4way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_4way_thread_init();
#elif defined(X21S_2WAY)
int x21s_2x64_hash( void *state, const void *input, int thrid );
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_2x64_thread_init();
#else
int x21s_hash( void *state, const void *input, int thr_id );

View File

@@ -10,55 +10,60 @@
#include <stdlib.h>
#include <string.h>
void x16r_prehash( void *edata, void *pdata )
void x16r_prehash( void *edata, void *pdata, const char *hash_order )
{
const char elem = x16r_hash_order[0];
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
sph_jh512_init( &x16_ctx.jh );
sph_jh512( &x16_ctx.jh, edata, 64 );
sph_jh512_init( &x16r_ref_ctx.jh );
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
break;
case SKEIN:
sph_skein512_init( &x16_ctx.skein );
sph_skein512( &x16_ctx.skein, edata, 64 );
sph_skein512_init( &x16r_ref_ctx.skein );
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
break;
case KECCAK:
sph_keccak512_init( &x16r_ref_ctx.keccak );
sph_keccak512( &x16r_ref_ctx.keccak, edata, 72 );
break;
case LUFFA:
init_luffa( &x16_ctx.luffa, 512 );
update_luffa( &x16_ctx.luffa, edata, 64 );
init_luffa( &x16r_ref_ctx.luffa, 512 );
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
break;
case CUBEHASH:
cubehashInit( &x16_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16_ctx.cube, edata, 64 );
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
break;
case HAMSI:
sph_hamsi512_init( &x16_ctx.hamsi );
sph_hamsi512( &x16_ctx.hamsi, edata, 64 );
break;
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 72 );
break;
case SHABAL:
sph_shabal512_init( &x16_ctx.shabal );
sph_shabal512( &x16_ctx.shabal, edata, 64 );
sph_shabal512_init( &x16r_ref_ctx.shabal );
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &x16_ctx.whirlpool );
sph_whirlpool( &x16_ctx.whirlpool, edata, 64 );
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
break;
}
}
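// Note (added for clarity, not in the original source): within one work unit
// only the nonce (bytes 76..79 of the 80-byte header) changes, so any function
// whose block boundary falls inside the nonce-free prefix can absorb that
// prefix once here. When it is the first function in the order, the per-nonce
// hash loop then only processes the short tail (16, 8 or 4 bytes).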
int x16r_hash_generic( void* output, const void* input, int thrid )
int x16r_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t _ALIGN(128) hash[16];
uint32_t _ALIGN(32) hash[16];
x16r_context_overlay ctx;
memcpy( &ctx, &x16_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
void *in = (void*) input;
int size = 80;
for ( int i = 0; i < 16; i++ )
for ( int i = 0; i < func_count; i++ )
{
const char elem = x16r_hash_order[i];
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -70,36 +75,41 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case BMW:
sph_bmw512_init( &ctx.bmw );
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
sph_bmw512( &ctx.bmw, in, size );
sph_bmw512_close( &ctx.bmw, hash );
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash, in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
sph_groestl512_close( &ctx.groestl, hash );
#endif
break;
case JH:
if ( i == 0 )
sph_jh512(&ctx.jh, in+64, 16 );
sph_jh512( &ctx.jh, in+64, 16 );
else
{
sph_jh512_init( &ctx.jh );
sph_jh512(&ctx.jh, in, size );
sph_jh512( &ctx.jh, in, size );
}
sph_jh512_close(&ctx.jh, hash );
sph_jh512_close( &ctx.jh, hash );
break;
case KECCAK:
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
if ( i == 0 )
sph_keccak512( &ctx.keccak, in+72, 8 );
else
{
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
}
sph_keccak512_close( &ctx.keccak, hash );
break;
case SKEIN:
if ( i == 0 )
sph_skein512(&ctx.skein, in+64, 16 );
sph_skein512( &ctx.skein, in+64, 16 );
else
{
sph_skein512_init( &ctx.skein );
@@ -109,13 +119,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case LUFFA:
if ( i == 0 )
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
else
luffa_full( &ctx.luffa, hash, 512, in, size );
break;
case CUBEHASH:
if ( i == 0 )
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
else
cubehash_full( &ctx.cube, hash, 512, in, size );
break;
@@ -123,19 +133,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
sph_simd512( &ctx.simd, hash, size );
sph_simd512_close( &ctx.simd, hash );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence*)hash, 512,
(const BitSequence*)in, size );
echo_full( &ctx.echo, hash, 512, in, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
@@ -144,7 +148,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
sph_hamsi512( &ctx.hamsi, in+64, 16 );
sph_hamsi512( &ctx.hamsi, in+72, 8 );
else
{
sph_hamsi512_init( &ctx.hamsi );
@@ -153,12 +157,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
#if defined(__AES__)
fugue512_full( &ctx.fugue, hash, in, size );
#else
sph_fugue512_full( &ctx.fugue, hash, in, size );
#endif
break;
sph_fugue512_full( &ctx.fugue, hash, in, size );
break;
case SHABAL:
if ( i == 0 )
sph_shabal512( &ctx.shabal, in+64, 16 );
@@ -197,7 +197,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
int x16r_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64] __attribute__ ((aligned (64)));
if ( !x16r_hash_generic( hash, input, thrid ) )
if ( !x16r_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
@@ -207,8 +208,8 @@ int x16r_hash( void* output, const void* input, int thrid )
int scanhash_x16r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) edata[20];
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -230,7 +231,7 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_prehash( edata, pdata );
x16r_prehash( edata, pdata, x16r_hash_order );
do
{

View File

@@ -3,7 +3,7 @@
#include <stdlib.h>
#include <string.h>
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
@@ -30,12 +30,12 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x",
x16r_hash_order, bswap_32( pdata[17] ) );
}
x16r_8way_prehash( vdata, pdata );
x16r_8way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
@@ -57,7 +57,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
@@ -84,12 +84,12 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x",
x16r_hash_order, bswap_32( pdata[17] ) );
}
x16r_4way_prehash( vdata, pdata );
x16r_4way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
@@ -110,4 +110,55 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RT_2WAY)
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[2*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) timeHash[4*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x",
x16r_hash_order, bswap_32( pdata[17] ) );
}
x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( ( n < last_nonce ) && !(*restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -1,6 +1,6 @@
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X16RT_8WAY) && !defined(X16RT_4WAY)
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -31,7 +31,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
x16r_hash_order, swab32( pdata[17] ), timeHash );
}
x16r_prehash( edata, pdata );
x16r_prehash( edata, pdata, x16r_hash_order );
do
{

View File

@@ -395,7 +395,7 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -409,14 +409,43 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
hash7, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
fugue512_full( &ctx.fugue, hash4, in4, size );
fugue512_full( &ctx.fugue, hash5, in5, size );
fugue512_full( &ctx.fugue, hash6, in6, size );
fugue512_full( &ctx.fugue, hash7, in7, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in4 + 76, 4 );
fugue512_final( &ctx.fugue, hash4 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in5 + 76, 4 );
fugue512_final( &ctx.fugue, hash5 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in6 + 76, 4 );
fugue512_final( &ctx.fugue, hash6 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in7 + 76, 4 );
fugue512_final( &ctx.fugue, hash7 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
fugue512_full( &ctx.fugue, hash2, hash2, size );
fugue512_full( &ctx.fugue, hash3, hash3, size );
fugue512_full( &ctx.fugue, hash4, hash4, size );
fugue512_full( &ctx.fugue, hash5, hash5, size );
fugue512_full( &ctx.fugue, hash6, hash6, size );
fugue512_full( &ctx.fugue, hash7, hash7, size );
}
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -564,7 +593,6 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -577,19 +605,15 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
@@ -626,7 +650,14 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
@@ -824,8 +855,8 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, size );
skein512_4way_close( &ctx.skein, vhash );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case LUFFA:
@@ -945,7 +976,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -956,10 +987,27 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
fugue512_full( &ctx.fugue, hash2, hash2, size );
fugue512_full( &ctx.fugue, hash3, hash3, size );
}
break;
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
@@ -1055,7 +1103,6 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20];
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -1068,17 +1115,15 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0fff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
@@ -1101,7 +1146,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
@@ -1112,7 +1157,13 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_intrlv80_4x32( vdata32, pdata );
@@ -1151,4 +1202,450 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RV2_2WAY)
union _x16rv2_2x64_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
skein512_2x64_context skein;
jh512_2x64_context jh;
keccak512_2x64_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sha512_2x64_context sha512;
sph_tiger_context tiger;
} __attribute__ ((aligned (64)));
typedef union _x16rv2_2x64_context_overlay x16rv2_2x64_context_overlay;
static __thread x16rv2_2x64_context_overlay x16rv2_ctx;
// Pad the 24-byte Tiger hash to 64 bytes
static inline void padtiger512( uint32_t* hash )
{
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
}
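// Note (added for clarity, not in the original source): x16rv2 differs from
// x16r by running Tiger immediately before Keccak, Luffa and SHA-512; the
// 24-byte Tiger digest is zero-padded to 64 bytes, either with the helper
// above or inline in the 2-way cases below, before being fed to those
// three functions.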
int x16rv2_2x64_hash( void* output, const void* input, int thrid )
{
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
x16rv2_2x64_context_overlay ctx;
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
int size = 80;
dintrlv_2x64( hash0, hash1, input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = x16r_hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
if ( i == 0 )
blake512_2x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case BMW:
bmw512_2x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_2x64_update( &ctx.bmw, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
bmw512_2x64_update( &ctx.bmw, vhash, size );
}
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case GROESTL:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in0, size );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in1, size );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
break;
case JH:
if ( i == 0 )
jh512_2x64_update( &ctx.jh, input + (64<<1), 16 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
jh512_2x64_init( &ctx.jh );
jh512_2x64_update( &ctx.jh, vhash, size );
}
jh512_2x64_close( &ctx.jh, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case KECCAK:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
intrlv_2x64( vhash, hash0, hash1, 512 );
keccak512_2x64_init( &ctx.keccak );
keccak512_2x64_update( &ctx.keccak, vhash, 64 );
keccak512_2x64_close( &ctx.keccak, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case LUFFA:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
break;
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
}
else
{
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
}
break;
case SHAVITE:
shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size );
break;
case SIMD:
simd512_ctx( &ctx.simd, hash0, in0, size );
simd512_ctx( &ctx.simd, hash1, in1, size );
break;
case ECHO:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, in0, size );
echo_full( &ctx.echo, hash1, 512, in1, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in0, size );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in1, size );
sph_echo512_close( &ctx.echo, hash1 );
#endif
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
if ( i == 0 )
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, hash0, hash1, size<<3 );
hamsi512_2x64_init( &ctx.hamsi );
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_2x64_close( &ctx.hamsi, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
if ( i == 0 )
{
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
else
{
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash0, size );
sph_hamsi512_close( &ctx.hamsi, hash0 );
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash1, size );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
#endif
break;
case FUGUE:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#else
if ( i == 0 )
{
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash1 );
}
else
{
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#endif
break;
case SHABAL:
if ( i == 0 )
{
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash1 );
}
else
{
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash0, size );
sph_shabal512_close( &ctx.shabal, hash0 );
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash1, size );
sph_shabal512_close( &ctx.shabal, hash1 );
}
break;
case WHIRLPOOL:
sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size );
sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size );
break;
case SHA_512:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
intrlv_2x64( vhash, hash0, hash1, 512 );
sha512_2x64_init( &ctx.sha512 );
sha512_2x64_update( &ctx.sha512, vhash, 64 );
sha512_2x64_close( &ctx.sha512, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
}
if ( work_restart[thrid].restart ) return 0;
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
return 1;
}
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[2*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0fff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
v128_bswap32_intrlv80_2x64( vdata, pdata );
jh512_2x64_init( &x16rv2_ctx.jh );
jh512_2x64_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
case SHA_512:
v128_bswap32_80( edata, pdata );
sph_tiger_init( &x16rv2_ctx.tiger );
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SKEIN:
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16rv2_ctx.cube, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
v128_bswap32_intrlv80_2x64( vdata, pdata );
hamsi512_2x64_init( &x16rv2_ctx.hamsi );
hamsi512_2x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
#else
v128_bswap32_80( edata, pdata );
sph_hamsi512_init( &x16rv2_ctx.hamsi );
sph_hamsi512( &x16rv2_ctx.hamsi, edata, 72 );
intrlv_2x64( vdata, edata, edata, 640 );
#endif
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
#else
sph_fugue512_init( &x16rv2_ctx.fugue );
sph_fugue512( &x16rv2_ctx.fugue, edata, 76 );
#endif
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_80( edata, pdata );
sph_shabal512_init( &x16rv2_ctx.shabal );
sph_shabal512( &x16rv2_ctx.shabal, edata, 64);
intrlv_2x64( vdata, edata, edata, 640 );
break;
default:
v128_bswap32_intrlv80_2x64( vdata, pdata );
}
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16rv2_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -6,21 +6,15 @@
*/
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X16RV2_8WAY) && !defined(X16RV2_4WAY) && !defined(X16RV2_2WAY)
#include "algo/tiger/sph_tiger.h"
union _x16rv2_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
@@ -29,11 +23,7 @@ union _x16rv2_context_overlay
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -72,15 +62,9 @@ int x16rv2_hash( void* output, const void* input, int thrid )
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init( &ctx.skein );
@@ -117,25 +101,14 @@ int x16rv2_hash( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512(&ctx.simd, hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
sph_hamsi512_init( &ctx.hamsi );
@@ -143,11 +116,7 @@ int x16rv2_hash( void* output, const void* input, int thrid )
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
#if defined(__AES__)
fugue512_full( &ctx.fugue, hash, in, size );
#else
sph_fugue512_full( &ctx.fugue, hash, in, size );
#endif
break;
case SHABAL:
sph_shabal512_init( &ctx.shabal );

362
algo/x16/x20r.c Normal file
View File

@@ -0,0 +1,362 @@
#include "miner.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "x16r-gate.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X20R_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X20R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X20R_2WAY 1
#endif
// X20R is not what it seems. It does not permute 20 functions over 20 rounds;
// it only permutes 16 of them. The last 4 functions are victims of trying to
// fit 20 elements into the space for only 16: arithmetic overflow recycles
// the first 4 functions. Otherwise it's identical to X16R.
// Welcome to the real X20R.
#define X20R_HASH_FUNC_COUNT 20
/*
enum x20r_algo
{
BLAKE = 0,
BMW,
GROESTL,
JH,
KECCAK,
SKEIN,
LUFFA,
CUBEHASH,
SHAVITE,
SIMD,
ECHO,
HAMSI,
FUGUE,
SHABAL,
WHIRLPOOL,
SHA512,
HAVAL, // Last 4 names are meaningless and not used
GOST,
RADIOGATUN,
PANAMA,
X20R_HASH_FUNC_COUNT
};
*/
static __thread char x20r_hash_order[ X20R_HASH_FUNC_COUNT + 1 ] = {0};
static void x20r_getAlgoString(const uint8_t* prevblock, char *output)
{
char *sptr = output;
for (int j = 0; j < X20R_HASH_FUNC_COUNT; j++) {
uint8_t b = (19 - j) >> 1; // 20 ascii hex chars from 10 bytes, reversed
uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
}
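// Illustrative sketch, not part of the original source: each algo digit above
// is a single hex nibble (0..15), so only the first 16 entries of the
// (commented-out) enum can ever be selected; the last 4 names never appear in
// an order string. The hypothetical, unbuilt example below shows the mapping
// for a prevblock filled with 0x9F bytes, which alternates SIMD ('9') and
// SHA512 ('F').
#if 0   // hypothetical example only, never compiled
static void x20r_order_example( void )
{
   uint8_t prevblock[10];
   char order[ X20R_HASH_FUNC_COUNT + 1 ];
   memset( prevblock, 0x9F, sizeof(prevblock) );
   x20r_getAlgoString( prevblock, order );
   // order now holds "9F9F9F9F9F9F9F9F9F9F": 20 rounds drawn only from the
   // first 16 hash functions.
}
#endif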
#if defined(X20R_8WAY)
int x20r_8x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*8] __attribute__ ((aligned (128)));
if ( !x16r_8x64_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
memcpy( output+64, hash+128, 32 );
memcpy( output+96, hash+192, 32 );
memcpy( output+128, hash+256, 32 );
memcpy( output+160, hash+320, 32 );
memcpy( output+192, hash+384, 32 );
memcpy( output+224, hash+448, 32 );
return 1;
}
int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
vdata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_8x64_prehash( vdata, pdata, x20r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_8x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X20R_4WAY)
int x20r_4x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*4] __attribute__ ((aligned (64)));
if ( !x16r_4x64_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
memcpy( output+64, hash+128, 32 );
memcpy( output+96, hash+192, 32 );
return 1;
}
int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
vdata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_4x64_prehash( vdata, pdata, x20r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_4x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X20R_2WAY)
int x20r_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*2] __attribute__ ((aligned (64)));
if ( !x16r_2x64_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
return 1;
}
int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128_t *noncev = (v128_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
vdata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_2x64_prehash( vdata, pdata, x20r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#else
int x20r_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64] __attribute__ ((aligned (64)));
if ( !x16r_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
return 1;
}
int scanhash_x20r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int thr_id = mythr->id;
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &( work_restart[thr_id].restart );
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
edata[1] = bswap_32( pdata[1] );
edata[2] = bswap_32( pdata[2] );
edata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&edata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_prehash( edata, pdata, x20r_hash_order );
do
{
edata[19] = nonce;
if ( x20r_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( nonce );
submit_solution( work, hash32, mythr );
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
#endif
bool register_x20r_algo( algo_gate_t* gate )
{
#if defined (X20R_8WAY)
gate->scanhash = (void*)&scanhash_x20r_8x64;
#elif defined (X20R_4WAY)
gate->scanhash = (void*)&scanhash_x20r_4x64;
#elif defined (X20R_2WAY)
gate->scanhash = (void*)&scanhash_x20r_2x64;
#else
gate->scanhash = (void*)&scanhash_x20r;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 256.0;
return true;
};
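// Usage note (illustrative): once built, the algo is selected like any other,
// e.g.  ./cpuminer -a x20r -o stratum+tcp://pool.example.com:3032 -u <wallet>
// (pool address is a placeholder). The gate above registers the widest
// scanhash variant available at compile time: 8-way with AVX512, 4-way with
// AVX2+AES, 2-way with SSE2 or NEON, otherwise the scalar reference path.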

View File

@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include "algo/haval/haval-hash-4way.h"
#include "algo/haval/sph-haval.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
@@ -42,7 +43,8 @@ int x21s_8way_hash( void* output, const void* input, int thrid )
uint32_t *hash7 = (uint32_t*)( shash+448 );
x21s_8way_context_overlay ctx;
if ( !x16r_8way_hash_generic( shash, input, thrid ) )
if ( !x16r_8way_hash_generic( shash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
@@ -134,7 +136,6 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &hash[7<<3];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -148,20 +149,18 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_8way_prehash( vdata, pdata );
x16r_8way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
@@ -223,7 +222,8 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
uint32_t *hash2 = (uint32_t*)( shash+128 );
uint32_t *hash3 = (uint32_t*)( shash+192 );
if ( !x16r_4way_hash_generic( shash, input, thrid ) )
if ( !x16r_4way_hash_generic( shash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
@@ -294,7 +294,6 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -307,20 +306,18 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_4way_prehash( vdata, pdata );
x16r_4way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
@@ -351,4 +348,117 @@ bool x21s_4way_thread_init()
return x21s_4way_matrix;
}
#elif defined (X21S_2WAY)
static __thread uint64_t* x21s_2x64_matrix;
union _x21s_2x64_context_overlay
{
sph_haval256_5_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
} __attribute__ ((aligned (64)));
typedef union _x21s_2x64_context_overlay x21s_2x64_context_overlay;
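// After the shared x16r chain, x21s finishes each of the two lanes with five
// fixed stages: Haval-256/5, Tiger, Lyra2REv2, GOST-512 and finally SHA-256.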
int x21s_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t shash[64*2] __attribute__ ((aligned (64)));
x21s_2x64_context_overlay ctx;
uint32_t *hash0 = (uint32_t*) shash;
uint32_t *hash1 = (uint32_t*)( shash+64 );
if ( !x16r_2x64_hash_generic( shash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hash0, 64 );
sph_haval256_5_close( &ctx.haval, hash0 );
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hash1, 64 );
sph_haval256_5_close( &ctx.haval, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash1 );
LYRA2REV2( x21s_2x64_matrix, (void*) hash0, 32, (const void*) hash0, 32,
(const void*) hash0, 32, 1, 4, 4 );
LYRA2REV2( x21s_2x64_matrix, (void*) hash1, 32, (const void*) hash1, 32,
(const void*) hash1, 32, 1, 4, 4 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
sph_gost512_close( &ctx.gost, (void*) hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
sph_gost512_close( &ctx.gost, (void*) hash1 );
sha256_full( output, hash0, 64 );
sha256_full( output+32, hash1, 64 );
return 1;
}
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x21s_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
bool x21s_2x64_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
x21s_2x64_matrix = mm_malloc( size, 64 );
return x21s_2x64_matrix;
}
#endif

View File

@@ -15,7 +15,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X21S_8WAY) && !defined(X21S_4WAY)
static __thread uint64_t* x21s_matrix;
@@ -33,7 +33,8 @@ int x21s_hash( void* output, const void* input, int thrid )
uint32_t _ALIGN(128) hash[16];
x21s_context_overlay ctx;
if ( !x16r_hash_generic( hash, input, thrid ) )
if ( !x16r_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
sph_haval256_5_init( &ctx.haval );
@@ -84,7 +85,7 @@ int scanhash_x21s( struct work *work, uint32_t max_nonce,
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_prehash( edata, pdata );
x16r_prehash( edata, pdata, x16r_hash_order );
do
{

View File

@@ -928,25 +928,24 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
#elif defined(X17_2X64)
// Need sph in some cases
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/hamsi/sph_hamsi.h"
#if !( defined(__SSE4_2__) || defined(__ARM_NEON) )
#include "algo/hamsi/sph_hamsi.h"
#endif
#include "algo/shabal/sph_shabal.h"
#include "algo/haval/sph-haval.h"
//#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
//#endif
#include "algo/fugue/sph_fugue.h"
#include "algo/fugue/sph_fugue.h"
#endif
union _x17_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
@@ -956,7 +955,7 @@ union _x17_context_overlay
#else
sph_echo512_context echo;
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -967,12 +966,8 @@ union _x17_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__x86_64__)
simd512_context simd;
#else
sph_simd512_context simd;
#endif
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
@@ -1000,7 +995,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
dintrlv_2x64( hash0, hash1, vhash, 512 );
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, hash0, 512 );
groestl512_full( &ctx.groestl, hash1, hash1, 512 );
#else
@@ -1033,17 +1028,8 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
#if defined(__x86_64__)
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
#else
sph_simd512_init( &ctx.simd );
sph_simd512( &ctx.simd, hash0, 64 );
sph_simd512_close( &ctx.simd, hash0 );
sph_simd512_init( &ctx.simd );
sph_simd512( &ctx.simd, hash1, 64 );
sph_simd512_close( &ctx.simd, hash1 );
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
@@ -1057,7 +1043,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_echo512_close( &ctx.echo, hash1 );
#endif
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
#if defined(__SSE4_2__) || defined(__ARM_NEON)
intrlv_2x64( vhash, hash0, hash1, 512 );
hamsi512_2x64_ctx( &ctx.hamsi, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
@@ -1070,7 +1056,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_hamsi512_close( &ctx.hamsi, hash1 );
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
fugue512_full( &ctx.fugue, hash1, hash1, 64 );
#else
@@ -1142,14 +1128,12 @@ int scanhash_x17_2x64( struct work *work, uint32_t max_nonce,
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,0);
pdata[19] = bswap_32( n );
// pdata[19] = n;
submit_solution( work, hash, mythr );
}
if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
{
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,1);
pdata[19] = bswap_32( n+1 );
submit_solution( work, hash+8, mythr );
}

View File

@@ -4,25 +4,24 @@
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -39,14 +38,17 @@ union _x22i_context_overlay
{
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
@@ -54,11 +56,7 @@ union _x22i_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -83,10 +81,8 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_bmw512(&ctx.bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash, hash, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash, 64 );
@@ -109,26 +105,16 @@ int x22i_hash( void *output, const void *input, int thrid )
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init(&ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash, 512, hash, 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash, 64 );
@@ -141,7 +127,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_hamsi512(&ctx.hamsi, (const void*) hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hash, hash, 64 );
#else
sph_fugue512_init(&ctx.fugue);
@@ -161,7 +147,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_sha512( &ctx.sha512, &hash[128], 64 );
sph_sha512_close( &ctx.sha512, &hash[192] );
ComputeSingleSWIFFTX((unsigned char*)hash, (unsigned char*)hash2);
ComputeSingleSWIFFTX( (unsigned char*)hash, (unsigned char*)hash2 );
if ( work_restart[thrid].restart ) return 0;
@@ -176,7 +162,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_tiger_close(&ctx.tiger, (void*) hash2);
memset(hash, 0, 64);
LYRA2RE((void*) hash, 32, (const void*) hash2, 32, (const void*) hash2, 32, 1, 4, 4);
LYRA2RE( (void*)hash, 32, (const void*)hash2, 32, (const void*)hash2, 32, 1, 4, 4 );
sph_gost512_init(&ctx.gost);
sph_gost512 (&ctx.gost, (const void*) hash, 64);
@@ -192,8 +178,8 @@ int x22i_hash( void *output, const void *input, int thrid )
int scanhash_x22i( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];

View File

@@ -4,25 +4,24 @@
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -42,14 +41,17 @@ union _x25x_context_overlay
{
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
@@ -57,11 +59,7 @@ union _x25x_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -88,10 +86,8 @@ int x25x_hash( void *output, const void *input, int thrid )
sph_bmw512(&ctx.bmw, (const void*) &hash[0], 64);
sph_bmw512_close(&ctx.bmw, &hash[1]);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)&hash[2],
(const char*)&hash[1], 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, (void*)&hash[2], (const void*)&hash[1], 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, &hash[1], 64 );
@@ -112,28 +108,18 @@ int x25x_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return 0;
init_luffa( &ctx.luffa, 512 );
luffa_full( &ctx.luffa, &hash[6], 512, &hash[5], 64 );
luffa_full( &ctx.luffa, (void*)&hash[6], 512, (const void*)&hash[5], 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, &hash[7], &hash[6], 64 );
cubehash_full( &ctx.cube, (void*)&hash[7], 512, (const void*)&hash[6], 64 );
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) &hash[7], 64);
sph_shavite512_close(&ctx.shavite, &hash[8]);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) &hash[8], 64);
sph_simd512_close(&ctx.simd, &hash[9] );
#else
update_final_sd( &ctx.simd, (BitSequence *)&hash[9],
(const BitSequence *)&hash[8], 512 );
#endif
simd512_ctx( &ctx.simd, (void*)&hash[9], (const void*)&hash[8], 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)&hash[10],
(const BitSequence*)&hash[9], 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, &hash[9], 64 );
@@ -146,7 +132,7 @@ int x25x_hash( void *output, const void *input, int thrid )
sph_hamsi512(&ctx.hamsi, (const void*) &hash[10], 64);
sph_hamsi512_close(&ctx.hamsi, &hash[11]);
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, &hash[12], &hash[11], 64 );
#else
sph_fugue512_init(&ctx.fugue);
@@ -227,8 +213,8 @@ int x25x_hash( void *output, const void *input, int thrid )
int scanhash_x25x( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -245,7 +231,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce,
do
{
edata[19] = n;
if ( x25x_hash( hash64, edata, thr_id ) )
if ( x25x_hash( hash64, edata, thr_id ) );
if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.8.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.15.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='23.8'
PACKAGE_STRING='cpuminer-opt 23.8'
PACKAGE_VERSION='23.15'
PACKAGE_STRING='cpuminer-opt 23.15'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 23.8 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 23.15 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 23.8:";;
short | recursive ) echo "Configuration of cpuminer-opt 23.15:";;
esac
cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 23.8
cpuminer-opt configure 23.15
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 23.8, which was
It was created by cpuminer-opt $as_me 23.15, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='23.8'
VERSION='23.15'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 23.8, which was
This file was extended by cpuminer-opt $as_me 23.15, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 23.8
cpuminer-opt config.status 23.15
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [23.8])
AC_INIT([cpuminer-opt], [23.15])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.8.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.14.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='23.8'
PACKAGE_STRING='cpuminer-opt 23.8'
PACKAGE_VERSION='23.14'
PACKAGE_STRING='cpuminer-opt 23.14'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -657,8 +657,6 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
HAVE_MACOS_FALSE
HAVE_MACOS_TRUE
MINGW_FALSE
MINGW_TRUE
ARCH_ARM_FALSE
@@ -1362,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 23.8 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 23.14 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1434,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 23.8:";;
short | recursive ) echo "Configuration of cpuminer-opt 23.14:";;
esac
cat <<\_ACEOF
@@ -1540,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 23.8
cpuminer-opt configure 23.14
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1987,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 23.8, which was
It was created by cpuminer-opt $as_me 23.14, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3595,7 +3593,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='23.8'
VERSION='23.14'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6881,14 +6879,6 @@ else
MINGW_FALSE=
fi
if test "x$OS" = "xAPPLE"; then
HAVE_MACOS_TRUE=
HAVE_MACOS_FALSE='#'
else
HAVE_MACOS_TRUE='#'
HAVE_MACOS_FALSE=
fi
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
@@ -7128,10 +7118,6 @@ if test -z "${MINGW_TRUE}" && test -z "${MINGW_FALSE}"; then
as_fn_error $? "conditional \"MINGW\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${HAVE_MACOS_TRUE}" && test -z "${HAVE_MACOS_FALSE}"; then
as_fn_error $? "conditional \"HAVE_MACOS\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@@ -7522,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 23.8, which was
This file was extended by cpuminer-opt $as_me 23.14, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7590,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 23.8
cpuminer-opt config.status 23.14
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -2837,15 +2837,6 @@ static void show_credits()
#define check_cpu_capability() cpu_capability( false )
#define display_cpu_capability() cpu_capability( true )
#if defined(__aarch64__)
#define XSTR(x) STR(x)
#define STR(x) #x
//#pragma message "Building for armv" XSTR(__ARM_ARCH)
#endif
static bool cpu_capability( bool display_only )
{
char cpu_brand[0x40];
@@ -2968,8 +2959,12 @@ static bool cpu_capability( bool display_only )
printf(" Linux\n");
#elif defined(WIN32)
printf(" Windows\n");
#elif defined(__APPLE__)
printf(" MacOS\n");
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
printf(" Unix\n");
#else
printf("\n");
printf("\n");
#endif
printf("CPU features: ");
@@ -3671,11 +3666,6 @@ static int thread_create(struct thr_info *thr, void* func)
void get_defconfig_path(char *out, size_t bufsize, char *argv0);
#include "simd-utils.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "compat/aes_helper.c"
int main(int argc, char *argv[])
{
struct thr_info *thr;

View File

@@ -3,12 +3,16 @@
#include <cpuminer-config.h>
#if !( defined(__SSE2__) || ( defined(__aarch64__) && defined(__ARM_NEON) ) )
#warning "Unknown or unsupported CPU, requires x86_64 with SSE2 or AArch64 with NEON."
#endif
#if defined(__x86_64__)
#define USER_AGENT_ARCH "x64" // Intel, AMD x86_64
#elif defined(__aarch64__)
#define USER_AGENT_ARCH "arm" // AArch64
//#elif
// #define USER_AGENT_ARCH "R5" // RISC-V
// #define USER_AGENT_ARCH "r5" // RISC-V
#else
#define USER_AGENT_ARCH
#endif
@@ -668,6 +672,7 @@ enum algos {
ALGO_X16RT_VEIL,
ALGO_X16S,
ALGO_X17,
ALGO_X20R,
ALGO_X21S,
ALGO_X22I,
ALGO_X25X,
@@ -763,6 +768,7 @@ static const char* const algo_names[] = {
"x16rt-veil",
"x16s",
"x17",
"x20r",
"x21s",
"x22i",
"x25x",
@@ -926,6 +932,7 @@ Options:\n\
x16rt-veil Veil (VEIL)\n\
x16s\n\
x17\n\
x20r\n\
x21s\n\
x22i\n\
x25x\n\

View File

@@ -381,7 +381,7 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
d0[15] = s[ 60]; d1[15] = s[ 61]; d2[15] = s[ 62]; d3[15] = s[ 63];
}
#endif // SSE4_1 else SSE2 or NEON
#endif // SSE4_1 or NEON else SSE2
static inline void extr_lane_4x32( void *d, const void *s,
const int lane, const int bit_len )
@@ -411,11 +411,11 @@ static inline void v128_bswap32_80( void *d, void *s )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf );
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), bswap_shuf );
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), bswap_shuf );
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), bswap_shuf );
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf );
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), bswap_shuf );
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), bswap_shuf );
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), bswap_shuf );
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), bswap_shuf );
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), bswap_shuf );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
@@ -461,11 +461,11 @@ static inline void v128_bswap32_80( void *d, void *s )
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
#if defined(__SSSE3__)
@@ -480,38 +480,38 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
#else
s0 = mm128_bswap_32( s0 );
s1 = mm128_bswap_32( s1 );
s2 = mm128_bswap_32( s2 );
s3 = mm128_bswap_32( s3 );
s4 = mm128_bswap_32( s4 );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
#endif
casti_m128i( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
casti_m128i( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
casti_m128i( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
casti_m128i( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
casti_m128i( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
casti_m128i( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
casti_m128i( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
casti_v128( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
casti_m128i( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
casti_m128i( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
casti_m128i( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
casti_m128i( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
casti_m128i( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
casti_m128i( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
casti_m128i( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
@@ -797,11 +797,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
const __m256i c1 = v256_32( 0x04050607 );
const __m256i c2 = v256_32( 0x08090a0b );
const __m256i c3 = v256_32( 0x0c0d0e0f );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m256i( d, 0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
@@ -855,11 +855,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
const __m256i c2 = _mm256_add_epi32( c1, c1 );
const __m256i c3 = _mm256_add_epi32( c2, c1 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -1303,11 +1303,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
const __m512i c1 = v512_32( 0x04050607 );
const __m512i c2 = v512_32( 0x08090a0b );
const __m512i c3 = v512_32( 0x0c0d0e0f );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d, 0 ) = _mm512_permutexvar_epi8( c0,
_mm512_castsi128_si512( s0 ) );
@@ -1360,11 +1360,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
const __m512i c1 = v512_32( 1 );
const __m512i c2 = _mm512_add_epi32( c1, c1 );
const __m512i c3 = _mm512_add_epi32( c2, c1 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -1492,20 +1492,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
#if defined(__SSE2__)
casti_m128i( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_m128i( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
casti_m128i( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_m128i( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
casti_m128i( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_m128i( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
casti_m128i( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_m128i( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
casti_m128i( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_m128i( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
#elif defined(__ARM_NEON)
@@ -1719,7 +1719,7 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
{
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
v128_t s4 = casti_m128i( src,4 );
v128_t s4 = casti_v128( src,4 );
casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 );
casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 );
@@ -1747,11 +1747,11 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i c0 = v256_64( 0x0405060700010203 );
const __m256i c1 = v256_64( 0x0c0d0e0f08090a0b );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m256i( d,0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
@@ -1783,7 +1783,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
v128_t s4 = casti_m128i( src,4 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm256_shuffle_epi8( s0, bswap_shuf );
s1 = _mm256_shuffle_epi8( s1, bswap_shuf );
@@ -2162,11 +2162,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const __m512i c0 = v512_64( 0x0405060700010203 );
const __m512i c1 = v512_64( 0x0c0d0e0f08090a0b );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( c0,
_mm512_castsi128_si512( s0 ) );
@@ -2197,11 +2197,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = v512_64( 1 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -2391,11 +2391,11 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m512i bswap_shuf = mm512_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s0 ),
bswap_shuf );
@@ -2415,11 +2415,11 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -2489,44 +2489,44 @@ static inline void rintrlv_4x64_4x32( void *dst, const void *src,
const v128_t *s = (const v128_t*)src;
v128_t *d = (v128_t*)dst;
d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 2] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 6] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd );
d[ 0] = v128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = v128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 2] = v128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 3] = v128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = v128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = v128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 6] = v128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 7] = v128_shuffle2_32( s[ 6], s[ 7], 0xdd );
if ( bit_len <= 256 ) return;
d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[10] = mm128_shuffle2_32( s[10], s[11], 0x88 );
d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = mm128_shuffle2_32( s[12], s[13], 0xdd );
d[14] = mm128_shuffle2_32( s[14], s[15], 0x88 );
d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd );
d[ 8] = v128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = v128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[10] = v128_shuffle2_32( s[10], s[11], 0x88 );
d[11] = v128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = v128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = v128_shuffle2_32( s[12], s[13], 0xdd );
d[14] = v128_shuffle2_32( s[14], s[15], 0x88 );
d[15] = v128_shuffle2_32( s[14], s[15], 0xdd );
if ( bit_len <= 512 ) return;
d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = mm128_shuffle2_32( s[16], s[17], 0xdd );
d[18] = mm128_shuffle2_32( s[18], s[19], 0x88 );
d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = mm128_shuffle2_32( s[20], s[21], 0xdd );
d[22] = mm128_shuffle2_32( s[22], s[23], 0x88 );
d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = mm128_shuffle2_32( s[24], s[25], 0xdd );
d[26] = mm128_shuffle2_32( s[26], s[27], 0x88 );
d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = mm128_shuffle2_32( s[28], s[29], 0xdd );
d[30] = mm128_shuffle2_32( s[30], s[31], 0x88 );
d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd );
d[16] = v128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = v128_shuffle2_32( s[16], s[17], 0xdd );
d[18] = v128_shuffle2_32( s[18], s[19], 0x88 );
d[19] = v128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = v128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = v128_shuffle2_32( s[20], s[21], 0xdd );
d[22] = v128_shuffle2_32( s[22], s[23], 0x88 );
d[23] = v128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = v128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = v128_shuffle2_32( s[24], s[25], 0xdd );
d[26] = v128_shuffle2_32( s[26], s[27], 0x88 );
d[27] = v128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = v128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = v128_shuffle2_32( s[28], s[29], 0xdd );
d[30] = v128_shuffle2_32( s[30], s[31], 0x88 );
d[31] = v128_shuffle2_32( s[30], s[31], 0xdd );
// if ( bit_len <= 1024 ) return;
}
@@ -2537,77 +2537,77 @@ static inline void rintrlv_8x64_8x32( void *dst, const void *src,
const v128_t *s = (const v128_t*)src;
v128_t *d = (v128_t*)dst;
d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 2] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 6] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd );
d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = mm128_shuffle2_32( s[10], s[11], 0x88 );
d[10] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = mm128_shuffle2_32( s[14], s[15], 0x88 );
d[14] = mm128_shuffle2_32( s[12], s[13], 0xdd );
d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd );
d[ 0] = v128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = v128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 2] = v128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 3] = v128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = v128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = v128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 6] = v128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 7] = v128_shuffle2_32( s[ 6], s[ 7], 0xdd );
d[ 8] = v128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = v128_shuffle2_32( s[10], s[11], 0x88 );
d[10] = v128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[11] = v128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = v128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = v128_shuffle2_32( s[14], s[15], 0x88 );
d[14] = v128_shuffle2_32( s[12], s[13], 0xdd );
d[15] = v128_shuffle2_32( s[14], s[15], 0xdd );
if ( bit_len <= 256 ) return;
d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = mm128_shuffle2_32( s[18], s[19], 0x88 );
d[18] = mm128_shuffle2_32( s[16], s[17], 0xdd );
d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = mm128_shuffle2_32( s[22], s[23], 0x88 );
d[22] = mm128_shuffle2_32( s[20], s[21], 0xdd );
d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = mm128_shuffle2_32( s[26], s[27], 0x88 );
d[26] = mm128_shuffle2_32( s[24], s[25], 0xdd );
d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = mm128_shuffle2_32( s[30], s[31], 0x88 );
d[30] = mm128_shuffle2_32( s[28], s[29], 0xdd );
d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd );
d[16] = v128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = v128_shuffle2_32( s[18], s[19], 0x88 );
d[18] = v128_shuffle2_32( s[16], s[17], 0xdd );
d[19] = v128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = v128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = v128_shuffle2_32( s[22], s[23], 0x88 );
d[22] = v128_shuffle2_32( s[20], s[21], 0xdd );
d[23] = v128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = v128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = v128_shuffle2_32( s[26], s[27], 0x88 );
d[26] = v128_shuffle2_32( s[24], s[25], 0xdd );
d[27] = v128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = v128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = v128_shuffle2_32( s[30], s[31], 0x88 );
d[30] = v128_shuffle2_32( s[28], s[29], 0xdd );
d[31] = v128_shuffle2_32( s[30], s[31], 0xdd );
if ( bit_len <= 512 ) return;
d[32] = mm128_shuffle2_32( s[32], s[33], 0x88 );
d[33] = mm128_shuffle2_32( s[34], s[35], 0x88 );
d[34] = mm128_shuffle2_32( s[32], s[33], 0xdd );
d[35] = mm128_shuffle2_32( s[34], s[35], 0xdd );
d[36] = mm128_shuffle2_32( s[36], s[37], 0x88 );
d[37] = mm128_shuffle2_32( s[38], s[39], 0x88 );
d[38] = mm128_shuffle2_32( s[36], s[37], 0xdd );
d[39] = mm128_shuffle2_32( s[38], s[39], 0xdd );
d[40] = mm128_shuffle2_32( s[40], s[41], 0x88 );
d[41] = mm128_shuffle2_32( s[42], s[43], 0x88 );
d[42] = mm128_shuffle2_32( s[40], s[41], 0xdd );
d[43] = mm128_shuffle2_32( s[42], s[43], 0xdd );
d[44] = mm128_shuffle2_32( s[44], s[45], 0x88 );
d[45] = mm128_shuffle2_32( s[46], s[47], 0x88 );
d[46] = mm128_shuffle2_32( s[44], s[45], 0xdd );
d[47] = mm128_shuffle2_32( s[46], s[47], 0xdd );
d[32] = v128_shuffle2_32( s[32], s[33], 0x88 );
d[33] = v128_shuffle2_32( s[34], s[35], 0x88 );
d[34] = v128_shuffle2_32( s[32], s[33], 0xdd );
d[35] = v128_shuffle2_32( s[34], s[35], 0xdd );
d[36] = v128_shuffle2_32( s[36], s[37], 0x88 );
d[37] = v128_shuffle2_32( s[38], s[39], 0x88 );
d[38] = v128_shuffle2_32( s[36], s[37], 0xdd );
d[39] = v128_shuffle2_32( s[38], s[39], 0xdd );
d[40] = v128_shuffle2_32( s[40], s[41], 0x88 );
d[41] = v128_shuffle2_32( s[42], s[43], 0x88 );
d[42] = v128_shuffle2_32( s[40], s[41], 0xdd );
d[43] = v128_shuffle2_32( s[42], s[43], 0xdd );
d[44] = v128_shuffle2_32( s[44], s[45], 0x88 );
d[45] = v128_shuffle2_32( s[46], s[47], 0x88 );
d[46] = v128_shuffle2_32( s[44], s[45], 0xdd );
d[47] = v128_shuffle2_32( s[46], s[47], 0xdd );
d[48] = mm128_shuffle2_32( s[48], s[49], 0x88 );
d[49] = mm128_shuffle2_32( s[50], s[51], 0x88 );
d[50] = mm128_shuffle2_32( s[48], s[49], 0xdd );
d[51] = mm128_shuffle2_32( s[50], s[51], 0xdd );
d[52] = mm128_shuffle2_32( s[52], s[53], 0x88 );
d[53] = mm128_shuffle2_32( s[54], s[55], 0x88 );
d[54] = mm128_shuffle2_32( s[52], s[53], 0xdd );
d[55] = mm128_shuffle2_32( s[54], s[55], 0xdd );
d[56] = mm128_shuffle2_32( s[56], s[57], 0x88 );
d[57] = mm128_shuffle2_32( s[58], s[59], 0x88 );
d[58] = mm128_shuffle2_32( s[56], s[57], 0xdd );
d[59] = mm128_shuffle2_32( s[58], s[59], 0xdd );
d[60] = mm128_shuffle2_32( s[60], s[61], 0x88 );
d[61] = mm128_shuffle2_32( s[62], s[63], 0x88 );
d[62] = mm128_shuffle2_32( s[60], s[61], 0xdd );
d[63] = mm128_shuffle2_32( s[62], s[63], 0xdd );
d[48] = v128_shuffle2_32( s[48], s[49], 0x88 );
d[49] = v128_shuffle2_32( s[50], s[51], 0x88 );
d[50] = v128_shuffle2_32( s[48], s[49], 0xdd );
d[51] = v128_shuffle2_32( s[50], s[51], 0xdd );
d[52] = v128_shuffle2_32( s[52], s[53], 0x88 );
d[53] = v128_shuffle2_32( s[54], s[55], 0x88 );
d[54] = v128_shuffle2_32( s[52], s[53], 0xdd );
d[55] = v128_shuffle2_32( s[54], s[55], 0xdd );
d[56] = v128_shuffle2_32( s[56], s[57], 0x88 );
d[57] = v128_shuffle2_32( s[58], s[59], 0x88 );
d[58] = v128_shuffle2_32( s[56], s[57], 0xdd );
d[59] = v128_shuffle2_32( s[58], s[59], 0xdd );
d[60] = v128_shuffle2_32( s[60], s[61], 0x88 );
d[61] = v128_shuffle2_32( s[62], s[63], 0x88 );
d[62] = v128_shuffle2_32( s[60], s[61], 0xdd );
d[63] = v128_shuffle2_32( s[62], s[63], 0xdd );
// if ( bit_len <= 1024 ) return;
}
@@ -3248,12 +3248,21 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
// blend 2 vectors while interleaving: { hi[n], lo[n-1], ... hi[1], lo[0] }
#if defined(__SSE4_1__)
// No SSE2 implementation.
//#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f )
//#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 )
#define v128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f )
#define v128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 )
#endif // SSE4_1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define v128_intrlv_blend_64( hi, lo ) \
v128_blendv( hi, lo, v128_set64( 0ull, 0xffffffffffffffffull ) )
#define v128_intrlv_blend_32( hi, lo ) \
v128_blendv( hi, lo, v128_set64( 0xffffffffull, 0xffffffffull ) )
#else
// unknown, unsupported architecture
#endif
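For reference, a minimal sketch of what the interleaving blend produces on x86_64, assuming SSE4.1 and the standard intrinsic headers; lane values are illustrative only.
#include <stdint.h>
#include <smmintrin.h>
int main(void)
{
   __m128i lo = _mm_set_epi64x( 11, 10 );   // 64 bit lanes { lo[1], lo[0] }
   __m128i hi = _mm_set_epi64x( 21, 20 );   // 64 bit lanes { hi[1], hi[0] }
   // Control 0x0f selects the low four 16 bit elements from the second arg,
   // i.e. the low 64 bit lane comes from lo and the high lane from hi.
   __m128i r  = _mm_blend_epi16( hi, lo, 0x0f );
   uint64_t out[2];
   _mm_storeu_si128( (__m128i*)out, r );
   return !( out[0] == 10 && out[1] == 21 );   // result is { hi[1], lo[0] }
}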
#if defined(__AVX2__)

View File

@@ -35,17 +35,17 @@
///////////////////////////////////////////////////////////////////////////////
// New architecturally agnostic syntax:
// All users of 128 bit SIMD should use new syntax or protect SSE2 only
// code segments.
// Other vector sizes continue with old syntax for now.
// Definitions here will gradually be converted to new syntax.
// For consistency the larger vector utilities should do the same.
//
// __m128i -> v128_t
// _mm_ -> v128_
// mm128_ -> v128_
//
// There is also new syntax to accommodate ARM's stricter type checking of
// vector element sizes. It has no effect on x86_64.
// direct translation of native intrinsics
#define v128_t __m128i
// Needed for ARM
#define v128u64_t v128_t
#define v128u32_t v128_t
#define v128u16_t v128_t
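As an illustration of the intent, a short sketch of application code written only with the architecture-agnostic names; it assumes the project's simd-utils header is included and that v128_load, v128_store, v128_add32 and v128_xor are available on both x86_64 and AArch64. The function name is illustrative.
static inline void mix_block( void *dst, const void *a, const void *b )
{
   v128_t va = v128_load( a );   // _mm_load_si128 on x86_64, vld1q on NEON
   v128_t vb = v128_load( b );
   v128_t r  = v128_xor( v128_add32( va, vb ), vb );
   v128_store( dst, r );
}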
@@ -56,17 +56,15 @@
// Needed for ARM; doesn't do anything special on x86_64
#define v128_load1_64(p) _mm_set1_epi64x(*(uint64_t*)(p) )
#define v128_load1_32(p) _mm_set_epi32( *(uint32_t*)(p) )
#define v128_load1_16(p) _mm_set_epi16( *(uint16_t*)(p) )
#define v128_load1_8( p) _mm_set_epi8( *(uint8_t*) (p) )
#define v128_load1_32(p) _mm_set1_epi32( *(uint32_t*)(p) )
#define v128_load1_16(p) _mm_set1_epi16( *(uint16_t*)(p) )
#define v128_load1_8( p) _mm_set1_epi8( *(uint8_t*) (p) )
// arithmetic
#define v128_add64 _mm_add_epi64
#define v128_add32 _mm_add_epi32
#define v128_add16 _mm_add_epi16
#define v128_add8 _mm_add_epi8
#define v128_add4_64 mm128_add4_64
#define v128_add4_32 mm128_add4_32
#define v128_sub64 _mm_sub_epi64
#define v128_sub32 _mm_sub_epi32
@@ -82,7 +80,7 @@
#define v128_mulw32 _mm_mul_epu32
#define v128_mulw16 _mm_mul_epu16
// compare
// signed compare
#define v128_cmpeq64 _mm_cmpeq_epi64
#define v128_cmpeq32 _mm_cmpeq_epi32
#define v128_cmpeq16 _mm_cmpeq_epi16
@@ -120,27 +118,6 @@
#define v128_xor _mm_xor_si128
#define v128_xorq _mm_xor_si128
#define v128_andnot _mm_andnot_si128
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_ornot( a, b ) mm128_or( a, mm128_not( b ) )
// ternary
#define v128_xorandnot( v2, v1, v0 ) \
_mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) \
_mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
// shift 2 concatenated vectors right
#define v128_alignr64 mm128_alignr_64
#define v128_alignr32 mm128_alignr_32
#if defined(__SSSE3__)
#define v128_alignr8 _mm_alignr_epi8
#endif
// unpack
#define v128_unpacklo64 _mm_unpacklo_epi64
@@ -230,7 +207,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
#endif
// broadcast lane l to all lanes
// broadcast (replicate) lane l to all lanes
#define v128_replane64( v, l ) \
( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
: _mm_shuffle_epi32( v, 0xee )
@@ -243,24 +220,22 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
// Pseudo constants
#define v128_zero _mm_setzero_si128()
#define m128_zero _mm_setzero_si128()
#if defined(__SSE4_1__)
// Bitwise AND, return 1 if result is all bits clear.
#define v128_and_eq0 _mm_testz_si128
#define v128_and_eq0(v1, v0) _mm_testz_si128(v1, v0)
// v128_is_zero?
static inline int v128_cmpeq0( v128_t v )
{ return v128_and_eq0( v, v ); }
#endif
// Bitwise compare, return 1 if all bits are set.
#define v128_cmpeq1 _mm_test_all_ones
#define v128_cmpeq1(v) _mm_test_all_ones(v)
#define v128_one mm128_mov64_128( 1 )
#define m128_one_128 v128_one
#define v128_one mm128_mov64_128(1)
// ASM avoids the need to initialize return variable to avoid compiler warning.
// Macro abstracts function parentheses to look like an identifier.
@@ -274,17 +249,14 @@ static inline __m128i v128_neg1_fn()
#endif
return a;
}
#define m128_neg1_fn v128_neg1_fn
#define v128_neg1 v128_neg1_fn()
#define m128_neg1 v128_neg1
//
// Vector pointer cast
// p = any aligned pointer
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))
#define castp_v128 castp_m128i
#define castp_v128(p) ((__m128i*)(p))
#define castp_v128u64 castp_v128
#define castp_v128u32 castp_v128
#define castp_v128u16 castp_v128
@@ -292,8 +264,7 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))
#define cast_v128 cast_m128i
#define cast_v128(p) (*((__m128i*)(p)))
#define cast_v128u64 cast_v128
#define cast_v128u32 cast_v128
#define cast_v128u16 cast_v128
@@ -301,8 +272,8 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
#define casti_v128 casti_m128i
#define casti_v128(p,i) (((__m128i*)(p))[(i)])
#define casti_m128i casti_v128 // deprecated
#define casti_v128u64 casti_v128
#define casti_v128u32 casti_v128
#define casti_v128u16 casti_v128
@@ -310,7 +281,7 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer, o = scaled offset
// returns pointer p+o
#define casto_m128i(p,o) (((__m128i*)(p))+(o))
#define casto_v128(p,o) (((__m128i*)(p))+(o))
#if defined(__SSE4_1__)
#define v128_get64( v, l ) _mm_extract_epi64( v, l )
@@ -325,7 +296,7 @@ static inline __m128i v128_neg1_fn()
/////////////////////////////////////////////////////////////
//
// _mm_insert_ps( _mm128i v1, __m128i v2, imm8 c )
// _mm_insert_ps( __m128i v1, __m128i v2, imm8 c )
//
// Fast and powerful but very limited in its application.
// It requires SSE4.1 but only works with 128 bit vectors with 32 bit
@@ -348,7 +319,7 @@ static inline __m128i v128_neg1_fn()
// c[7:6] source element selector
// Convert type and abbreviate name: eXtract Insert Mask = XIM
#define mm128_xim_32( v1, v0, c ) \
#define v128_xim32( v1, v0, c ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v0 ), c ) )
@@ -356,20 +327,19 @@ static inline __m128i v128_neg1_fn()
/*
// Copy i32 to element c of dest and copy remaining elements from v.
#define v128_put32( v, i32, c ) \
mm128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
v128_xim32( v, mm128_mov32_128( i32 ), (c)<<4 )
*/
#define mm128_mask_32( v, m ) mm128_xim_32( v, v, m )
#define v128_mask32( v, m ) v128_xim32( v, v, m & 0xf )
// Zero 32 bit elements when corresponding bit in 4 bit mask is set.
//static inline __m128i mm128_mask_32( const __m128i v, const int m )
//{ return mm128_xim_32( v, v, m ); }
#define v128_mask32 mm128_mask_32
//static inline __m128i v128_mask32( const __m128i v, const int m )
//{ return v128_xim32( v, v, m ); }
// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
// Copy element l0 of v0 to element l1 of dest and copy remaining elements from v1.
#define v128_movlane32( v1, l1, v0, l0 ) \
mm128_xim_32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )
v128_xim32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )
#endif // SSE4_1
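A small usage sketch of the insert-based lane move, assuming SSE4.1; the expansion is the same _mm_insert_ps pattern used by v128_xim32 above, with illustrative values.
#include <stdint.h>
#include <smmintrin.h>
int main(void)
{
   __m128i v1 = _mm_set_epi32( 13, 12, 11, 10 );   // lanes 3..0
   __m128i v0 = _mm_set_epi32( 23, 22, 21, 20 );
   // v128_movlane32( v1, 2, v0, 0 ): copy lane 0 of v0 into lane 2 of v1.
   __m128i r = _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ),
                                 _mm_castsi128_ps( v0 ), (2<<4) | (0<<6) ) );
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   return !( out[0]==10 && out[1]==11 && out[2]==20 && out[3]==13 );
}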
@@ -380,115 +350,112 @@ static inline __m128i v128_neg1_fn()
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
static inline __m128i mm128_not( const __m128i v )
static inline __m128i v128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
#else
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
#define v128_not( v ) _mm_xor_si128( v, v128_neg1 )
#endif
#define v128_not mm128_not
static inline __m128i mm128_negate_64( __m128i v )
static inline v128u64_t v128_negate_64( v128u64_t v )
{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); }
#define v128_negate64 mm128_negate_64
static inline __m128i mm128_negate_32( __m128i v )
static inline v128u32_t v128_negate_32( v128u32_t v )
{ return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); }
#define v128_negate32 mm128_negate_32
static inline __m128i mm128_negate_16( __m128i v )
static inline v128u16_t v128_negate_16( v128u16_t v )
{ return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); }
#define v128_negate16 mm128_negate_16
// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
#define v128_add4_64( a, b, c, d ) \
_mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )
#define mm128_add4_32( a, b, c, d ) \
#define v128_add4_32( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )
#define v128_add4_32 mm128_add4_32
#define mm128_add4_16( a, b, c, d ) \
#define v128_add4_16( a, b, c, d ) \
_mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )
#define mm128_add4_8( a, b, c, d ) \
#define v128_add4_8( a, b, c, d ) \
_mm_add_epi8( _mm_add_epi8( a, b ), _mm_add_epi8( c, d ) )
#define mm128_xor4( a, b, c, d ) \
#define v128_xor4( a, b, c, d ) \
_mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) )
// Memory functions
// Mostly for convenience, avoids calculating bytes.
// Assumes data is aligned and integral.
// n = number of __m128i, bytes/16
static inline void memset_zero_128( __m128i *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; }
#define v128_memset_zero memset_zero_128
static inline void v128_memset_zero( v128_t *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = v128_zero; }
#define memset_zero_128 v128_memset_zero
static inline void memset_128( __m128i *dst, const __m128i a, const int n )
static inline void v128_memset( v128_t *dst, const v128_t a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
#define v128_memset memset_128
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v128_memcpy memcpy_128
#define memcpy_128 v128_memcpy
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
// a ^ b ^ c
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
#define v128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
// a & b & c
#define mm128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
#define v128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
#define v128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
#define v128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
#define v128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
#define v128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c )
#define mm128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
#define v128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#else
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define mm128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define mm128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define mm128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
#define v128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
#define mm128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define mm128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#endif
#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) )
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
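On plain SSE2 such masks are commonly built from the floating point move-mask instructions; a sketch with hypothetical helper names, not necessarily the project's exact definitions.
#include <emmintrin.h>
static inline int v128_movmask64_sketch( __m128i v )
{  // 2 bit mask from the sign bit of each 64 bit element
   return _mm_movemask_pd( _mm_castsi128_pd( v ) );
}
static inline int v128_movmask32_sketch( __m128i v )
{  // 4 bit mask from the sign bit of each 32 bit element
   return _mm_movemask_ps( _mm_castsi128_ps( v ) );
}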
@@ -514,7 +481,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )
// These should never be called from application code, use rol/ror.
// Internal use only, should never be called from application code.
#define v128_ror64_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
@@ -529,14 +496,14 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#if defined(__AVX512VL__)
// AVX512 fastest all rotations.
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
// AVX512 fastest for all rotations.
#define v128_ror64 _mm_ror_epi64
#define v128_rol64 _mm_rol_epi64
#define v128_ror32 _mm_ror_epi32
#define v128_rol32 _mm_rol_epi32
// ror/rol will alway find the fastest but these names may fit better with
// application code performing shuffles rather than bit rotations.
// ror/rol will always find the fastest but these names may fit better with
// application code performing byte operations rather than bit rotations.
#define v128_shuflr64_8( v) _mm_ror_epi64( v, 8 )
#define v128_shufll64_8( v) _mm_rol_epi64( v, 8 )
#define v128_shuflr64_16(v) _mm_ror_epi64( v, 16 )
@@ -549,7 +516,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_shufll32_16(v) _mm_rol_epi32( v, 16 )
#elif defined(__SSSE3__)
// SSE2: fastest 32 bit, very fast 16, fast 8
// SSSE3: fastest 32 bit, very fast 16, fast 8
#define v128_shuflr64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -575,7 +542,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0f0a09080b, 0x0605040702010003 ) )
#define mm128_ror_64( v, c ) \
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
@@ -585,7 +552,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: v128_ror64_sse2( v, c )
#define mm128_rol_64( v, c ) \
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
@@ -595,57 +562,54 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: v128_rol64_sse2( v, c )
#define mm128_ror_32( v, c ) \
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? v128_lrev16( v ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: v128_ror32_sse2( v, c )
#define mm128_rol_32( v, c ) \
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? v128_lrev16( v ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: v128_rol32_sse2( v, c )
#elif defined(__SSE2__)
// SSE2: fastest 32 bit, very fast 16
// SSE2: fastest 32 bit, very fast 16, all else slow
#define mm128_ror_64( v, c ) \
#define v128_ror64( v, c ) \
( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
: v128_ror64_sse2( v, c )
#define mm128_rol_64( v, c ) \
#define v128_rol64( v, c ) \
( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
: v128_rol64_sse2( v, c )
#define mm128_ror_32( v, c ) \
#define v128_ror32( v, c ) \
( (c) == 16 ) ? v128_lrev16( v ) \
: v128_ror32_sse2( v, c )
#define mm128_rol_32( v, c ) \
#define v128_rol32( v, c ) \
( (c) == 16 ) ? v128_lrev16( v ) \
: v128_rol32_sse2( v, c )
#else
#define mm128_ror_64 v128_ror64_sse2
#define mm128_rol_64 v128_rol64_sse2
#define mm128_ror_32 v128_ror32_sse2
#define mm128_rol_32 v128_rol32_sse2
#define v128_ror64 v128_ror64_sse2
#define v128_rol64 v128_rol64_sse2
#define v128_ror32 v128_ror32_sse2
#define v128_rol32 v128_rol32_sse2
#endif
// Generic names for portable code
#define v128_ror64 mm128_ror_64
#define v128_rol64 mm128_rol_64
#define v128_ror32 mm128_ror_32
#define v128_rol32 mm128_rol_32
// deprecated
#define mm128_rol_32 v128_rol32
/* not used
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
@@ -653,25 +617,25 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm128_rorx2_64( v1, v0, c ) \
#define v128_2ror64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
_mm_ror_epi64( v1, c )
#define mm128_rolx2_64( v1, v0, c ) \
#define v128_2rol64( v1, v0, c ) \
_mm_rol_epi64( v0, c ); \
_mm_rol_epi64( v1, c )
#define mm128_rorx2_32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
_mm_ror_epi32( v0, c ); \
_mm_ror_epi32( v1, c )
#define mm128_rolx2_32( v1, v0, c ) \
#define v128_2rol32( v1, v0, c ) \
_mm_rol_epi32( v0, c ); \
_mm_rol_epi32( v1, c )
#else // SSE2
#define mm128_rorx2_64( v1, v0, c ) \
#define v128_2ror64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
__m128i t1 = _mm_srli_epi64( v1, c ); \
@@ -681,7 +645,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
v1 = _mm_or_si256( v1, t1 ); \
}
#define mm128_rolx2_64( v1, v0, c ) \
#define v128_2rol64( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi64( v0, c ); \
__m128i t1 = _mm_slli_epi64( v1, c ); \
@@ -691,7 +655,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
v1 = _mm_or_si256( v1, t1 ); \
}
#define mm128_rorx2_32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi32( v0, c ); \
__m128i t1 = _mm_srli_epi32( v1, c ); \
@@ -701,7 +665,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
v1 = _mm_or_si256( v1, t1 ); \
}
#define mm128_rolx2_32( v1, v0, c ) \
#define v128_2rol32( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi32( v0, c ); \
__m128i t1 = _mm_slli_epi32( v1, c ); \
@@ -712,20 +676,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
}
#endif // AVX512 else SSE2
#define v128_2ror64 mm128_rorx2_64
#define v128_2rol64 mm128_rolx2_64
#define v128_2ror32 mm128_rorx2_32
#define v128_2rol32 mm128_rolx2_32
*/
// Cross lane shuffles
// No NEON version
#define v128_shuffle32 _mm_shuffle_epi32
// shuffle using vector mask, for compatibility with NEON
/* Not used, exists only for compatibility with NEON if ever needed.
#define v128_shufflev32( v, vmask ) \
v128_shuffle32( v, mm128_movmask_32( vmask ) )
*/
#define v128_shuffle8 _mm_shuffle_epi8
@@ -734,12 +695,10 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_shuffle2_64( v1, v2, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
_mm_castsi128_pd( v2 ), c ) );
#define mm128_shuffle2_64 v128_shuffle2_64
#define v128_shuffle2_32( v1, v2, c ) \
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );
#define mm128_shuffle2_32 v128_shuffle2_32
// Rotate vector elements across all lanes
@@ -756,95 +715,77 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )
//TODO fix this
// alias bswap
//#define v128_qrev8(v) _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) )
//#define v128_lrev8(v) _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) )
//#define v128_wrev8(v) _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 1,0 ) )
// reverse bits, can it be done?
//#define v128_bitrev8( v ) vrbitq_u8
/* Not used
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
#endif
*/
// Endian byte swap.
#if defined(__SSSE3__)
#define mm128_bswap_128( v ) \
#define v128_bswap128( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \
0x08090a0b0c0d0e0f ) )
#define mm128_bswap_64( v ) \
#define v128_bswap64( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
#define mm128_bswap_32( v ) \
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
// deprecated
#define mm128_bswap_32 v128_bswap32
#define mm128_bswap_16( v ) \
#define v128_bswap16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_1024( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define mm128_block_bswap_32( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
@@ -852,129 +793,127 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_block_bswap32_128( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
}
#define v128_block_bswap32_512( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}
#else // SSE2
static inline __m128i mm128_bswap_64( __m128i v )
static inline v128_t v128_bswap64( __m128i v )
{
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
}
static inline __m128i mm128_bswap_32( __m128i v )
static inline v128_t v128_bswap32( __m128i v )
{
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}
#define mm128_bswap_32 v128_bswap32
static inline __m128i mm128_bswap_16( __m128i v )
static inline v128_t v128_bswap16( __m128i v )
{
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
#define mm128_bswap_128( v ) v128_qrev32( v128_bswap64( v ) )
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
{
d[0] = mm128_bswap_64( s[0] );
d[1] = mm128_bswap_64( s[1] );
d[2] = mm128_bswap_64( s[2] );
d[3] = mm128_bswap_64( s[3] );
d[4] = mm128_bswap_64( s[4] );
d[5] = mm128_bswap_64( s[5] );
d[6] = mm128_bswap_64( s[6] );
d[7] = mm128_bswap_64( s[7] );
d[0] = v128_bswap64( s[0] );
d[1] = v128_bswap64( s[1] );
d[2] = v128_bswap64( s[2] );
d[3] = v128_bswap64( s[3] );
d[4] = v128_bswap64( s[4] );
d[5] = v128_bswap64( s[5] );
d[6] = v128_bswap64( s[6] );
d[7] = v128_bswap64( s[7] );
}
#define v128_block_bswap64_512 mm128_block_bswap_64
static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
d[ 0] = mm128_bswap_64( s[ 0] );
d[ 1] = mm128_bswap_64( s[ 1] );
d[ 2] = mm128_bswap_64( s[ 2] );
d[ 3] = mm128_bswap_64( s[ 3] );
d[ 4] = mm128_bswap_64( s[ 4] );
d[ 5] = mm128_bswap_64( s[ 5] );
d[ 6] = mm128_bswap_64( s[ 6] );
d[ 7] = mm128_bswap_64( s[ 7] );
d[ 8] = mm128_bswap_64( s[ 8] );
d[ 9] = mm128_bswap_64( s[ 9] );
d[10] = mm128_bswap_64( s[10] );
d[11] = mm128_bswap_64( s[11] );
d[14] = mm128_bswap_64( s[12] );
d[13] = mm128_bswap_64( s[13] );
d[14] = mm128_bswap_64( s[14] );
d[15] = mm128_bswap_64( s[15] );
d[ 0] = v128_bswap64( s[ 0] );
d[ 1] = v128_bswap64( s[ 1] );
d[ 2] = v128_bswap64( s[ 2] );
d[ 3] = v128_bswap64( s[ 3] );
d[ 4] = v128_bswap64( s[ 4] );
d[ 5] = v128_bswap64( s[ 5] );
d[ 6] = v128_bswap64( s[ 6] );
d[ 7] = v128_bswap64( s[ 7] );
d[ 8] = v128_bswap64( s[ 8] );
d[ 9] = v128_bswap64( s[ 9] );
d[10] = v128_bswap64( s[10] );
d[11] = v128_bswap64( s[11] );
d[12] = v128_bswap64( s[12] );
d[13] = v128_bswap64( s[13] );
d[14] = v128_bswap64( s[14] );
d[15] = v128_bswap64( s[15] );
}
static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
{
d[0] = mm128_bswap_32( s[0] );
d[1] = mm128_bswap_32( s[1] );
d[2] = mm128_bswap_32( s[2] );
d[3] = mm128_bswap_32( s[3] );
d[4] = mm128_bswap_32( s[4] );
d[5] = mm128_bswap_32( s[5] );
d[6] = mm128_bswap_32( s[6] );
d[7] = mm128_bswap_32( s[7] );
d[0] = v128_bswap32( s[0] );
d[1] = v128_bswap32( s[1] );
d[2] = v128_bswap32( s[2] );
d[3] = v128_bswap32( s[3] );
d[4] = v128_bswap32( s[4] );
d[5] = v128_bswap32( s[5] );
d[6] = v128_bswap32( s[6] );
d[7] = v128_bswap32( s[7] );
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
{
d[ 0] = mm128_bswap_32( s[ 0] );
d[ 1] = mm128_bswap_32( s[ 1] );
d[ 2] = mm128_bswap_32( s[ 2] );
d[ 3] = mm128_bswap_32( s[ 3] );
d[ 4] = mm128_bswap_32( s[ 4] );
d[ 5] = mm128_bswap_32( s[ 5] );
d[ 6] = mm128_bswap_32( s[ 6] );
d[ 7] = mm128_bswap_32( s[ 7] );
d[ 8] = mm128_bswap_32( s[ 8] );
d[ 9] = mm128_bswap_32( s[ 9] );
d[10] = mm128_bswap_32( s[10] );
d[11] = mm128_bswap_32( s[11] );
d[12] = mm128_bswap_32( s[12] );
d[13] = mm128_bswap_32( s[13] );
d[14] = mm128_bswap_32( s[14] );
d[15] = mm128_bswap_32( s[15] );
d[ 0] = v128_bswap32( s[ 0] );
d[ 1] = v128_bswap32( s[ 1] );
d[ 2] = v128_bswap32( s[ 2] );
d[ 3] = v128_bswap32( s[ 3] );
d[ 4] = v128_bswap32( s[ 4] );
d[ 5] = v128_bswap32( s[ 5] );
d[ 6] = v128_bswap32( s[ 6] );
d[ 7] = v128_bswap32( s[ 7] );
d[ 8] = v128_bswap32( s[ 8] );
d[ 9] = v128_bswap32( s[ 9] );
d[10] = v128_bswap32( s[10] );
d[11] = v128_bswap32( s[11] );
d[12] = v128_bswap32( s[12] );
d[13] = v128_bswap32( s[13] );
d[14] = v128_bswap32( s[14] );
d[15] = v128_bswap32( s[15] );
}
#endif // SSSE3 else SSE2
#define v128_bswap32 mm128_bswap_32
#define v128_bswap64 mm128_bswap_64
#define v128_bswap128 mm128_bswap_128
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64
@@ -984,24 +923,20 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#if defined(__SSSE3__)
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#define v128_alignr8 _mm_alignr_epi8
#define v128_alignr64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define v128_alignr32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#else
#define mm128_alignr_64( hi, lo, c ) \
#define v128_alignr64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
#define mm128_alignr_32( hi, lo, c ) \
#define v128_alignr32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
#endif
// NEON uses only a vector mask. x86 blend selects the second arg when the
// control bit is set; blendv selects the second arg when the sign bit is
// set. AND masking is the opposite: elements are selected from the first
// arg when the mask bits are set. Arm blend is a bit-by-bit blend while
// x86 is an element blend. The logic is reversed here so mask usage is
// consistent with both formats.
#if defined(__SSE4_1__)
#define v128_blendv _mm_blendv_epi8
@@ -1009,7 +944,7 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#else
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v0 ), v128_and( mask, v1 ) )
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
#endif
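A quick sanity-check sketch of the fallback convention, assuming only SSE2: set mask bits select from v0, clear bits select from v1; values are illustrative.
#include <stdint.h>
#include <emmintrin.h>
int main(void)
{
   __m128i v1   = _mm_set1_epi32( 0x11111111 );
   __m128i v0   = _mm_set1_epi32( 0x22222222 );
   __m128i mask = _mm_set_epi32( -1, 0, -1, 0 );   // select v0 in lanes 1 & 3
   __m128i r = _mm_or_si128( _mm_andnot_si128( mask, v1 ),
                             _mm_and_si128( mask, v0 ) );
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   return !( out[0]==0x11111111 && out[1]==0x22222222 &&
             out[2]==0x11111111 && out[3]==0x22222222 );
}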

View File

@@ -90,7 +90,7 @@ typedef union
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
#define m256_one_128 mm256_bcast_m128( m128_one_128 )
#define m256_one_128 mm256_bcast_m128( v128_one )
static inline __m256i mm256_neg1_fn()
{

View File

@@ -21,36 +21,36 @@
//
// vornq( v1, v0 ) or( v1, not( v0 ) )
#define v128_t uint32x4_t // default,
#define v128u64_t uint64x2_t
#define v128u32_t uint32x4_t
#define v128u16_t uint16x8_t
#define v128u8_t uint8x16_t
#define v128_t uint32x4_t // default,
#define v128u64_t uint64x2_t
#define v128u32_t uint32x4_t
#define v128u16_t uint16x8_t
#define v128u8_t uint8x16_t
// load & store
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128u64_load( p ) vld1q_u64( (uint64_t*)(p) )
#define v128u64_store( p, v ) vst1q_u64( (uint64_t*)(p), v )
#define v128u32_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128u32_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128u16_load( p ) vld1q_u16( (uint16_t*)(p) )
#define v128u16_store( p, v ) vst1q_u16( (uint16_t*)(p), v )
#define v128u8_load( p ) vld1q_u16( (uint8_t*)(p) )
#define v128u8_store( p, v ) vst1q_u16( (uint8_t*)(p), v )
#define v128u64_load( p ) vld1q_u64( (uint64_t*)(p) )
#define v128u64_store( p, v ) vst1q_u64( (uint64_t*)(p), v )
#define v128u32_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128u32_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128u16_load( p ) vld1q_u16( (uint16_t*)(p) )
#define v128u16_store( p, v ) vst1q_u16( (uint16_t*)(p), v )
#define v128u8_load( p ) vld1q_u16( (uint8_t*)(p) )
#define v128u8_store( p, v ) vst1q_u16( (uint8_t*)(p), v )
// load & set1 combined
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
#define v128_load1_8( p) vld1q_dup_u8( (uint8_t*) (p) )
// load & set1 combined, doesn't work
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
#define v128_load1_8( p) vld1q_dup_u8( (uint8_t*) (p) )
// arithmetic
#define v128_add64 vaddq_u64
#define v128_add32 vaddq_u32
#define v128_add16 vaddq_u16
#define v128_add8 vaddq_u8
#define v128_add64 vaddq_u64
#define v128_add32 vaddq_u32
#define v128_add16 vaddq_u16
#define v128_add8 vaddq_u8
#define v128_add4_64( v3, v2, v1, v0 ) \
vaddq_u64( vaddq_u64( v3, v2 ), vaddq_u64( v1, v0 ) )
@@ -58,17 +58,17 @@
#define v128_add4_32( v3, v2, v1, v0 ) \
vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )
#define v128_sub64 vsubq_u64
#define v128_sub32 vsubq_u32
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
#define v128_sub64 vsubq_u64
#define v128_sub32 vsubq_u32
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
// returns low half, u64 undocumented, may not exist.
#define v128_mul64 vmulq_u64
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
#define v128_mul64 vmulq_u64
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
// slow, tested with argon2d
// Widening multiply, align source elements with Intel
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
{
return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
@@ -76,101 +76,102 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
}
// compare
#define v128_cmpeq64 vceqq_u64
#define v128_cmpeq32 vceqq_u32
#define v128_cmpeq16 vceqq_u16
#define v128_cmpeq8 vceqq_u8
#define v128_cmpeq64 vceqq_u64
#define v128_cmpeq32 vceqq_u32
#define v128_cmpeq16 vceqq_u16
#define v128_cmpeq8 vceqq_u8
#define v128_iszero vceqzq_u64
// v128_cmp0, v128_cmpz, v128_testz
#define v128_iszero vceqzq_u64
// Not yet needed
//#define v128_cmpeq1
// Signed
#define v128_cmpgt64( v1, v0 ) vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 )
#define v128_cmpgt32( v1, v0 ) vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 )
#define v128_cmpgt16( v1, v0 ) vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 )
#define v128_cmpgt8( v1, v0 ) vcgtq_s8( (int8x16_t)v1, (int8x16_t)v0 )
#define v128_cmpgt64 vcgtq_u64
#define v128_cmpgt32 vcgtq_u32
#define v128_cmpgt16 vcgtq_u16
#define v128_cmpgt8 vcgtq_u8
#define v128_cmplt64( v1, v0 ) vcltq_s64( (int64x2_t)v1, (int64x2_t)v0 )
#define v128_cmplt32( v1, v0 ) vcltq_s32( (int32x4_t)v1, (int32x4_t)v0 )
#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 )
#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)v0 )
#define v128_cmplt64 vcltq_u64
#define v128_cmplt32 vcltq_u32
#define v128_cmplt16 vcltq_u16
#define v128_cmplt8 vcltq_u8
// Logical bit shift
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16
#define v128_sl8 vshlq_n_u8
// bit shift
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16
#define v128_sl8 vshlq_n_u8
#define v128_sr64 vshrq_n_u64
#define v128_sr32 vshrq_n_u32
#define v128_sr16 vshrq_n_u16
#define v128_sr8 vshrq_n_u8
#define v128_sr64 vshrq_n_u64
#define v128_sr32 vshrq_n_u32
#define v128_sr16 vshrq_n_u16
#define v128_sr8 vshrq_n_u8
// Unit tested, working.
#define v128_sra64 vshrq_n_s64
#define v128_sra32 vshrq_n_s32
#define v128_sra16 vshrq_n_s16
// Arithmetic shift.
#define v128_sra64( v, c ) vshrq_n_s64( (int64x2_t)v, c )
#define v128_sra32( v, c ) vshrq_n_s32( (int32x4_t)v, c )
#define v128_sra16( v, c ) vshrq_n_s16( (int16x8_t)v, c )
// unary logic
#define v128_not vmvnq_u32
#define v128_not vmvnq_u32
// binary logic
#define v128_or vorrq_u32
#define v128_and vandq_u32
#define v128_xor veorq_u32
#define v128_or vorrq_u32
#define v128_and vandq_u32
#define v128_xor veorq_u32
// ~v1 & v0
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32( v1 ), v0 )
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32( v1 ), v0 )
// ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, x86_64 convention, first arg is not'ed
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
// ternary logic
// v2 ^ v1 ^ v0
// veorq_u32 not defined
//#define v128_xor3 veor3q_u32
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
// v2 & v1 & v0
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
// v2 | v1 | v0
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
// a ^ ( ~b & c )
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
// a ^ ( b & c )
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
// a & ( b ^ c )
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
// a ^ ( b | c )
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
// v2 | ( v1 & v0 )
#define v128_orand( v2, v1, v0 ) v128_or( v2, v128_and( v1, v0 ) )
#define v128_orand( v2, v1, v0 ) v128_or( v2, v128_and( v1, v0 ) )
// shift 2 concatenated vectors right.
#define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c )
#define v128_alignr32( v1, v0, c ) vextq_u32( v0, v1, c )
#define v128_alignr8( v1, v0, c ) vextq_u8( v0, v1, c )
#define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c )
#define v128_alignr32( v1, v0, c ) vextq_u32( v0, v1, c )
#define v128_alignr8( v1, v0, c ) vextq_u8( v0, v1, c )
// Interleave the high or low halves of 2 vectors.
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v1, v0 )
#define v128_unpackhi64( v1, v0 ) vzip2q_u64( v1, v0 )
#define v128_unpacklo32( v1, v0 ) vzip1q_u32( v1, v0 )
#define v128_unpackhi32( v1, v0 ) vzip2q_u32( v1, v0 )
#define v128_unpacklo16( v1, v0 ) vzip1q_u16( v1, v0 )
#define v128_unpackhi16( v1, v0 ) vzip2q_u16( v1, v0 )
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v1, v0 )
#define v128_unpackhi64( v1, v0 ) vzip2q_u64( v1, v0 )
#define v128_unpacklo32( v1, v0 ) vzip1q_u32( v1, v0 )
#define v128_unpackhi32( v1, v0 ) vzip2q_u32( v1, v0 )
#define v128_unpacklo16( v1, v0 ) vzip1q_u16( v1, v0 )
#define v128_unpackhi16( v1, v0 ) vzip2q_u16( v1, v0 )
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
// AES
@@ -184,19 +185,19 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_aesenclast( v, k ) \
v128_xor( k, vaeseq_u8( v, v128_zero ) )
#define v128_aesenclast_nokey( v, k ) \
#define v128_aesenclast_nokey( v ) \
vaeseq_u8( v, v128_zero )
#define v128_aesdec( v, k ) \
v128_xor( k, vaesimcq_u8( vaesdq_u8( v, v128_zero ) ) )
#define v128_aesdec_nokey( v, k ) \
#define v128_aesdec_nokey( v ) \
vaesimcq_u8( vaesdq_u8( v, v128_zero ) )
#define v128_aesdeclast( v, k ) \
v128_xor( k, vaesdq_u8( v, v128_zero ) )
#define v128_aesdeclast_nokey( v, k ) \
#define v128_aesdeclast_nokey( v ) \
vaesdq_u8( v, v128_zero )
@@ -254,24 +255,24 @@ typedef union
#define v128_8 vmovq_n_u8
#define v64_set32( u32_1, u32_0 ) \
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
#define v64_set16( u16_3, u16_2, u16_1, u16_0 ) \
vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16 ) \
| (uint32_t)(u16_2) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(u16_1) << 16 ) \
| (uint32_t)(u16_0) ) ) )
vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16) \
| (uint32_t)(u16_2) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(u16_1) << 16) \
| (uint32_t)(u16_0) ) ) )
#define v64_set8( u8_7, u8_6, u8_5, u8_4, u8_3, u8_2, u8_1, u8_0 ) \
vcreate_u8( \
( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_7) << 8 ) \
| (uint16_t)(u8_6) ) << 16 ) \
| ( (uint32_t)(((uint16_t)(u8_5) << 8 ) \
| (uint16_t)(u8_4) ) )) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_3) << 8 ) \
| (uint16_t)(u8_2) ) << 16 ) \
| ( (uint32_t)(((uint16_t)(u8_1) << 8 ) \
| (uint16_t)(u8_0) ) )) ))
vcreate_u8( \
( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_7) << 8) \
| (uint16_t)(u8_6) ) << 16 ) \
| ( (uint32_t)( ((uint16_t)(u8_5) << 8) \
| (uint16_t)(u8_4) ) ) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_3) << 8) \
| (uint16_t)(u8_2) ) << 16 ) \
| ( (uint32_t)( ((uint16_t)(u8_1) << 8) \
| (uint16_t)(u8_0) ) ) ) ) )
#define v128_set64( u64_1, u64_0 ) \
vcombine_u64( vcreate_u64( u64_0 ), vcreate_u64( u64_1 ) )
@@ -336,27 +337,27 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
// Bit rotation
#define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
#define v128_rol64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
#define v128_ror32( v, c ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
#define v128_rol32( v, c ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
#define v128_ror16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
: vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
#define v128_rol16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
: vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
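Each case above expands to a shift-insert pair (vsri/vsli) when the count is not a half-element reverse; as a scalar reference, the 64 bit lane rotation being emulated is simply the usual form, assuming 0 < c < 64.
#include <stdint.h>
// scalar reference for the 64 bit lane rotation above
static inline uint64_t ror64_ref( uint64_t x, unsigned c )
{
   return ( x >> c ) | ( x << ( 64 - c ) );
}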
#define v128_ror8( v, c ) \
@@ -405,33 +406,17 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
v1 = vorrq_u32( v1, t1 ); \
}
// Cross lane shuffles, no programmable shuffle in NEON
// vector mask, use as last resort. prefer rev, alignr, etc
/* not used anywhere and hopefully never will
// vector mask, use as last resort. prefer tbl, rev, alignr, etc
#define v128_shufflev32( v, vmask ) \
v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \
*/
// compatible with x86_64, but very slow, avoid
#define v128_shuffle8( v, vmask ) \
v128_set8( ((uint8_t*)&v)[ ((uint8_t*)(&vmask))[15] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[14] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[13] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[12] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[11] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[10] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 9] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 8] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 7] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 6] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 5] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 4] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 3] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 2] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] )
vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask )
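For example, a byte-swap-within-32-bit-lanes control vector works here exactly as it would with _mm_shuffle_epi8 on x86_64; a minimal NEON sketch (in practice vrev32q_u8 is the faster native route for this particular pattern).
#include <arm_neon.h>
static inline uint32x4_t bswap32_lanes( uint32x4_t v )
{
   const uint8x16_t ctl = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };
   return (uint32x4_t)vqtbl1q_u8( (uint8x16_t)v, ctl );
}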
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
@@ -549,20 +534,6 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}
// Programmable shuffles
// No shuffles compatible with x86_64; these will require targeted user code.
#define v128_extractmask8( df, de, dd, dc, db, da, d9, d8, \
d7, d6, d5, d4, d3, d2, d1, d0, vmask ) \
d0 = ((uint8_t*)(&vmask))[0]; d1 = ((uint8_t*)(&vmask))[1]; \
d2 = ((uint8_t*)(&vmask))[2]; d3 = ((uint8_t*)(&vmask))[3]; \
d4 = ((uint8_t*)(&vmask))[0]; d5 = ((uint8_t*)(&vmask))[1]; \
d6 = ((uint8_t*)(&vmask))[2]; d7 = ((uint8_t*)(&vmask))[3]; \
d8 = ((uint8_t*)(&vmask))[0]; d9 = ((uint8_t*)(&vmask))[1]; \
da = ((uint8_t*)(&vmask))[2]; db = ((uint8_t*)(&vmask))[3]; \
dc = ((uint8_t*)(&vmask))[0]; dd = ((uint8_t*)(&vmask))[1]; \
de = ((uint8_t*)(&vmask))[2]; df = ((uint8_t*)(&vmask))[3];
// Blendv
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )

View File

@@ -930,7 +930,9 @@ static inline void cpu_brand_string( char* s )
#elif defined(__arm__) || defined(__aarch64__)
sprintf( s, "ARM 64 bit CPU" );
unsigned int cpu_info[4] = { 0 };
cpuid( 0, 0, cpu_info );
sprintf( s, "ARM 64 bit CPU, HWCAP %08x", cpu_info[0] );
#else