mirror of https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00

Compare commits (3 commits):

- fc696dbbe5
- f3fde95f27
- 0a78013cbe
@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions

Requirements
------------

Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
- A x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
- Arm CPU supporting AArch64 and NEON.

64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.
32 bit CPUs are not supported.

ARM requirements (Beta):
Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.

CPU: Armv8 and NEON, SHA2 & AES are optional
OS: Linux distribution built for AArch64.
Packages: source code only.
Mining on mobile devices that meet the requirements is not recommended due to the risk of
overheating and damaging the battery. Mining has unlimited demand; it will push any device
to or beyond its limits. There is also a fire risk with overheated lithium batteries.

Beware of apps claiming "mobile only mining". There is no such thing; they aren't miners.
If a mobile CPU can mine it, any CPU can.

See wiki for details.

@@ -73,6 +75,29 @@ If not what makes it happen or not happen?

Change Log
----------

v23.12

Several bug fixes and speed improvements for the x16r family for all CPU architectures.

v23.11

This is a release candidate for full AArch64 support, marking the end of the Beta phase.
Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.

v23.10

x86_64: Fixed scrypt, scryptn2 algos SSE2.
Fixed sha512256d algo AVX2, SSE2, NEON.
Fixed a bug in Skein N-way that reduced performance.
ARM: Skein optimized for NEON, SHA2 & SSE2.
Skein2 algo 2-way optimized for NEON & SSE2.

v23.9

x86_64: fixed minotaurx crash, broken in 23.7.
ARM: #407 fix compile error due to incorrect type casting for vrev instruction argument.

v23.8

Cpuminer-opt is no longer dependent on OpenSSL.
@@ -99,7 +99,7 @@ typedef uint32_t set_t;
#define AES_OPT      1 << 7   // Intel Westmere, AArch64
#define VAES_OPT     1 << 8   // Icelake, Zen3
#define SHA_OPT      1 << 9   // Zen1, Icelake, AArch64
#define SHA512_OPT   1 << 10  // AArch64
#define SHA512_OPT   1 << 10  // Intel Arrow Lake, AArch64
#define NEON_OPT     1 << 11  // AArch64

// AVX10 does not have explicit algo features:
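These option bits are OR-ed together into a per-algorithm feature mask (for example, the bmw512 gate later in this compare sets gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT), which the miner can then compare against the features detected on the running CPU. The following is a minimal illustrative sketch of that pattern, not code from this commit; the helper set_contains and the cpu_features variable are assumptions made for the example.

```c
#include <stdint.h>
#include <stdbool.h>

typedef uint32_t set_t;

#define AES_OPT      (1 << 7)   // Intel Westmere, AArch64
#define SHA_OPT      (1 << 9)   // Zen1, Icelake, AArch64
#define SHA512_OPT   (1 << 10)  // Intel Arrow Lake, AArch64

// Hypothetical helper: true when every feature the algo needs is available.
static inline bool set_contains( set_t available, set_t required )
{
   return ( available & required ) == required;
}

// Example: only take an AES+SHA code path when the CPU reports both features.
// set_t cpu_features = ...;   // filled in by CPU detection elsewhere
// if ( set_contains( cpu_features, AES_OPT | SHA_OPT ) ) { /* fast path */ }
```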
@@ -429,7 +429,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
   v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
                                     0x0405060700010203 ); \
                                     0x0405060700010203 ); \
   M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
   M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
   M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
@@ -931,14 +931,14 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
   const v128_t shuf_bswap32 =
           v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );

   H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
   H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
   H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
   H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
   H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
   H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
   H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
   H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
   H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
   H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
   H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
   H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
   H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
   H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
   H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
   H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );

#else
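The two 64-bit constants passed to v128_set64 above form a byte-index pattern for _mm_shuffle_epi8 that reverses byte order within each 32-bit lane, i.e. a per-lane bswap32. A small standalone check of that behaviour, written directly against SSSE3 intrinsics rather than the project's v128 wrappers (illustrative only, compile with -mssse3):

```c
#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>   // _mm_shuffle_epi8 (SSSE3)

int main(void)
{
   // Same index pattern as shuf_bswap32: bytes 3,2,1,0 of each 32-bit word.
   const __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                                0x0405060700010203 );
   const __m128i in  = _mm_set_epi32( 0x44434241, 0x34333231,
                                      0x24232221, 0x14131211 );
   const __m128i out = _mm_shuffle_epi8( in, shuf_bswap32 );

   uint32_t lanes[4];
   _mm_storeu_si128( (__m128i*)lanes, out );
   // Expect each word byte-swapped: 0x11121314, 0x21222324, 0x31323334, 0x41424344
   for ( int i = 0; i < 4; i++ )
      printf( "%08x\n", lanes[i] );
   return 0;
}
```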
@@ -475,11 +475,12 @@ void blake512_update(blake512_context *sc, const void *data, size_t len)

void blake512_close( blake512_context *sc, void *dst )
{
   unsigned char buf[128] __attribute__((aligned(32)));
   size_t ptr;
   size_t ptr, k;
   unsigned bit_len;
   uint64_t th, tl;

   ptr = sc->ptr;
   memcpy( buf, sc->buf, ptr );
   bit_len = ((unsigned)ptr << 3);
   buf[ptr] = 0x80;
   tl = sc->T0 + bit_len;
@@ -519,7 +520,8 @@ void blake512_close( blake512_context *sc, void *dst )
      blake512_update( sc, buf, 128 );
   }

   v128_block_bswap64_512( dst, sc->H );
   for ( k = 0; k < 8; k ++ )
      ((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
}

void blake512_full( blake512_context *sc, void *dst, const void *data,
@@ -131,47 +131,7 @@
   V[7] = v128_alignr64( V6, V7, 1 ); \
}

/*
#elif defined(__SSE2__)
// always true

#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
                _mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
   Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
   Vc = _mm_add_epi64( Vc, Vd ); \
   Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
\
   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
                _mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
   Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
   Vc = _mm_add_epi64( Vc, Vd ); \
   Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
}

#define BLAKE2B_ROUND( R ) \
{ \
   v128_t *V = (v128_t*)v; \
   v128_t V2, V3, V6, V7; \
   const uint8_t *sigmaR = sigma[R]; \
   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
   V2 = mm128_alignr_64( V[3], V[2], 1 ); \
   V3 = mm128_alignr_64( V[2], V[3], 1 ); \
   V6 = mm128_alignr_64( V[6], V[7], 1 ); \
   V7 = mm128_alignr_64( V[7], V[6], 1 ); \
   BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
   BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
   V[2] = mm128_alignr_64( V2, V3, 1 ); \
   V[3] = mm128_alignr_64( V3, V2, 1 ); \
   V[6] = mm128_alignr_64( V7, V6, 1 ); \
   V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
*/

#else
// never used, SSE2 is always available

#ifndef ROTR64
#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
@@ -62,78 +62,78 @@ static const uint32_t IV256[] = {
|
||||
*/
|
||||
|
||||
#define ss0(x) \
|
||||
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
|
||||
_mm_slli_epi32( (x), 3) ), \
|
||||
_mm_xor_si128( mm128_rol_32( (x), 4), \
|
||||
mm128_rol_32( (x), 19) ) )
|
||||
v128_xor( v128_xor( v128_sr32( (x), 1), \
|
||||
v128_sl32( (x), 3) ), \
|
||||
v128_xor( v128_rol32( (x), 4), \
|
||||
v128_rol32( (x), 19) ) )
|
||||
|
||||
#define ss1(x) \
|
||||
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
|
||||
_mm_slli_epi32( (x), 2) ), \
|
||||
_mm_xor_si128( mm128_rol_32( (x), 8), \
|
||||
mm128_rol_32( (x), 23) ) )
|
||||
v128_xor( v128_xor( v128_sr32( (x), 1), \
|
||||
v128_sl32( (x), 2) ), \
|
||||
v128_xor( v128_rol32( (x), 8), \
|
||||
v128_rol32( (x), 23) ) )
|
||||
|
||||
#define ss2(x) \
|
||||
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
|
||||
_mm_slli_epi32( (x), 1) ), \
|
||||
_mm_xor_si128( mm128_rol_32( (x), 12), \
|
||||
mm128_rol_32( (x), 25) ) )
|
||||
v128_xor( v128_xor( v128_sr32( (x), 2), \
|
||||
v128_sl32( (x), 1) ), \
|
||||
v128_xor( v128_rol32( (x), 12), \
|
||||
v128_rol32( (x), 25) ) )
|
||||
|
||||
#define ss3(x) \
|
||||
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
|
||||
_mm_slli_epi32( (x), 2) ), \
|
||||
_mm_xor_si128( mm128_rol_32( (x), 15), \
|
||||
mm128_rol_32( (x), 29) ) )
|
||||
v128_xor( v128_xor( v128_sr32( (x), 2), \
|
||||
v128_sl32( (x), 2) ), \
|
||||
v128_xor( v128_rol32( (x), 15), \
|
||||
v128_rol32( (x), 29) ) )
|
||||
|
||||
#define ss4(x) \
|
||||
_mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
|
||||
v128_xor( (x), v128_sr32( (x), 1 ) )
|
||||
|
||||
#define ss5(x) \
|
||||
_mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
|
||||
v128_xor( (x), v128_sr32( (x), 2 ) )
|
||||
|
||||
#define rs1(x) mm128_rol_32( x, 3 )
|
||||
#define rs2(x) mm128_rol_32( x, 7 )
|
||||
#define rs3(x) mm128_rol_32( x, 13 )
|
||||
#define rs4(x) mm128_rol_32( x, 16 )
|
||||
#define rs5(x) mm128_rol_32( x, 19 )
|
||||
#define rs6(x) mm128_rol_32( x, 23 )
|
||||
#define rs7(x) mm128_rol_32( x, 27 )
|
||||
#define rs1(x) v128_rol32( x, 3 )
|
||||
#define rs2(x) v128_rol32( x, 7 )
|
||||
#define rs3(x) v128_rol32( x, 13 )
|
||||
#define rs4(x) v128_rol32( x, 16 )
|
||||
#define rs5(x) v128_rol32( x, 19 )
|
||||
#define rs6(x) v128_rol32( x, 23 )
|
||||
#define rs7(x) v128_rol32( x, 27 )
|
||||
|
||||
#define rol_off_32( M, j, off ) \
|
||||
mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
v128_rol32( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
|
||||
#define add_elt_s( M, H, j ) \
|
||||
_mm_xor_si128( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
|
||||
v128_xor( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_add32( rol_off_32( M, j, 0 ), \
|
||||
rol_off_32( M, j, 3 ) ), \
|
||||
rol_off_32( M, j, 10 ) ), \
|
||||
_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
|
||||
v128_32( ( (j)+16 ) * 0x05555555UL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
|
||||
|
||||
#define expand1s( qt, M, H, i ) \
|
||||
_mm_add_epi32( mm128_add4_32( \
|
||||
mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
|
||||
v128_add32( v128_add4_32( \
|
||||
v128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
|
||||
ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
|
||||
mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
|
||||
v128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
|
||||
ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
|
||||
mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
|
||||
v128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
|
||||
ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
|
||||
mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
|
||||
v128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
|
||||
ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_s( M, H, (i)-16 ) )
|
||||
|
||||
#define expand2s( qt, M, H, i) \
|
||||
_mm_add_epi32( mm128_add4_32( \
|
||||
mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
|
||||
v128_add32( v128_add4_32( \
|
||||
v128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
|
||||
mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
|
||||
v128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
|
||||
qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
|
||||
mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
|
||||
v128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
|
||||
mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
|
||||
v128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
|
||||
ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_s( M, H, (i)-16 ) )
|
||||
|
||||
@@ -141,169 +141,169 @@ static const uint32_t IV256[] = {
|
||||
// resulting in some sign changes compared to the reference code.
|
||||
|
||||
#define Ws0 \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
v128_add32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
|
||||
v128_xor( M[ 7], H[ 7] ) ), \
|
||||
v128_xor( M[10], H[10] ) ), \
|
||||
v128_add32( v128_xor( M[13], H[13] ), \
|
||||
v128_xor( M[14], H[14] ) ) )
|
||||
|
||||
#define Ws1 \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
v128_add32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 6], H[ 6] ), \
|
||||
v128_xor( M[ 8], H[ 8] ) ), \
|
||||
v128_xor( M[11], H[11] ) ), \
|
||||
v128_sub32( v128_xor( M[14], H[14] ), \
|
||||
v128_xor( M[15], H[15] ) ) )
|
||||
|
||||
#define Ws2 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_add32( v128_xor( M[ 0], H[ 0] ), \
|
||||
v128_xor( M[ 7], H[ 7] ) ), \
|
||||
v128_xor( M[ 9], H[ 9] ) ), \
|
||||
v128_sub32( v128_xor( M[12], H[12] ), \
|
||||
v128_xor( M[15], H[15] ) ) )
|
||||
|
||||
#define Ws3 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
|
||||
v128_xor( M[ 1], H[ 1] ) ), \
|
||||
v128_xor( M[ 8], H[ 8] ) ), \
|
||||
v128_sub32( v128_xor( M[10], H[10] ), \
|
||||
v128_xor( M[13], H[13] ) ) )
|
||||
|
||||
#define Ws4 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_add32( v128_xor( M[ 1], H[ 1] ), \
|
||||
v128_xor( M[ 2], H[ 2] ) ), \
|
||||
v128_xor( M[ 9], H[ 9] ) ), \
|
||||
v128_add32( v128_xor( M[11], H[11] ), \
|
||||
v128_xor( M[14], H[14] ) ) )
|
||||
|
||||
#define Ws5 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
|
||||
v128_xor( M[ 2], H[ 2] ) ), \
|
||||
v128_xor( M[10], H[10] ) ), \
|
||||
v128_sub32( v128_xor( M[12], H[12] ), \
|
||||
v128_xor( M[15], H[15] ) ) )
|
||||
|
||||
#define Ws6 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[ 4], H[ 4] ), \
|
||||
v128_xor( M[ 0], H[ 0] ) ), \
|
||||
v128_xor( M[ 3], H[ 3] ) ), \
|
||||
v128_sub32( v128_xor( M[11], H[11] ), \
|
||||
v128_xor( M[13], H[13] ) ) )
|
||||
|
||||
#define Ws7 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[ 1], H[ 1] ), \
|
||||
v128_xor( M[ 4], H[ 4] ) ), \
|
||||
v128_xor( M[ 5], H[ 5] ) ), \
|
||||
v128_add32( v128_xor( M[12], H[12] ), \
|
||||
v128_xor( M[14], H[14] ) ) )
|
||||
|
||||
#define Ws8 \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
v128_add32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[ 2], H[ 2] ), \
|
||||
v128_xor( M[ 5], H[ 5] ) ), \
|
||||
v128_xor( M[ 6], H[ 6] ) ), \
|
||||
v128_sub32( v128_xor( M[13], H[13] ), \
|
||||
v128_xor( M[15], H[15] ) ) )
|
||||
#define Ws9 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
|
||||
v128_xor( M[ 3], H[ 3] ) ), \
|
||||
v128_xor( M[ 6], H[ 6] ) ), \
|
||||
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
|
||||
v128_xor( M[14], H[14] ) ) )
|
||||
|
||||
#define Ws10 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
|
||||
v128_xor( M[ 1], H[ 1] ) ), \
|
||||
v128_xor( M[ 4], H[ 4] ) ), \
|
||||
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
|
||||
v128_xor( M[15], H[15] ) ) )
|
||||
|
||||
#define Ws11 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
|
||||
v128_xor( M[ 0], H[ 0] ) ), \
|
||||
v128_xor( M[ 2], H[ 2] ) ), \
|
||||
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
|
||||
v128_xor( M[ 9], H[ 9] ) ) )
|
||||
|
||||
#define Ws12 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_add32( v128_xor( M[ 1], H[ 1] ), \
|
||||
v128_xor( M[ 3], H[ 3] ) ), \
|
||||
v128_xor( M[ 6], H[ 6] ) ), \
|
||||
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
|
||||
v128_xor( M[10], H[10] ) ) )
|
||||
|
||||
#define Ws13 \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ) )
|
||||
v128_add32( \
|
||||
v128_add32( \
|
||||
v128_add32( v128_xor( M[ 2], H[ 2] ), \
|
||||
v128_xor( M[ 4], H[ 4] ) ), \
|
||||
v128_xor( M[ 7], H[ 7] ) ), \
|
||||
v128_add32( v128_xor( M[10], H[10] ), \
|
||||
v128_xor( M[11], H[11] ) ) )
|
||||
|
||||
#define Ws14 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
|
||||
v128_xor( M[ 5], H[ 5] ) ), \
|
||||
v128_xor( M[ 8], H[ 8] ) ), \
|
||||
v128_add32( v128_xor( M[11], H[11] ), \
|
||||
v128_xor( M[12], H[12] ) ) )
|
||||
|
||||
#define Ws15 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[ 4], H[4] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[12], H[12] ), \
|
||||
v128_xor( M[ 4], H[4] ) ), \
|
||||
v128_xor( M[ 6], H[ 6] ) ), \
|
||||
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
|
||||
v128_xor( M[13], H[13] ) ) )
|
||||
|
||||
|
||||
void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
|
||||
void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
|
||||
{
|
||||
__m128i qt[32], xl, xh; \
|
||||
v128u64_t qt[32], xl, xh; \
|
||||
|
||||
qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
|
||||
qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
|
||||
qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
|
||||
qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
|
||||
qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
|
||||
qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
|
||||
qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
|
||||
qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
|
||||
qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
|
||||
qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
|
||||
qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
|
||||
qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
|
||||
qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
|
||||
qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
|
||||
qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
|
||||
qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
|
||||
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
|
||||
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
|
||||
qt[ 2] = v128_add32( ss2( Ws2 ), H[ 3] );
|
||||
qt[ 3] = v128_add32( ss3( Ws3 ), H[ 4] );
|
||||
qt[ 4] = v128_add32( ss4( Ws4 ), H[ 5] );
|
||||
qt[ 5] = v128_add32( ss0( Ws5 ), H[ 6] );
|
||||
qt[ 6] = v128_add32( ss1( Ws6 ), H[ 7] );
|
||||
qt[ 7] = v128_add32( ss2( Ws7 ), H[ 8] );
|
||||
qt[ 8] = v128_add32( ss3( Ws8 ), H[ 9] );
|
||||
qt[ 9] = v128_add32( ss4( Ws9 ), H[10] );
|
||||
qt[10] = v128_add32( ss0( Ws10), H[11] );
|
||||
qt[11] = v128_add32( ss1( Ws11), H[12] );
|
||||
qt[12] = v128_add32( ss2( Ws12), H[13] );
|
||||
qt[13] = v128_add32( ss3( Ws13), H[14] );
|
||||
qt[14] = v128_add32( ss4( Ws14), H[15] );
|
||||
qt[15] = v128_add32( ss0( Ws15), H[ 0] );
|
||||
qt[16] = expand1s( qt, M, H, 16 );
|
||||
qt[17] = expand1s( qt, M, H, 17 );
|
||||
qt[18] = expand2s( qt, M, H, 18 );
|
||||
@@ -321,92 +321,92 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
|
||||
qt[30] = expand2s( qt, M, H, 30 );
|
||||
qt[31] = expand2s( qt, M, H, 31 );
|
||||
|
||||
xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm_xor_si128( xl, _mm_xor_si128(
|
||||
mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
xl = v128_xor( v128_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
v128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = v128_xor( xl, v128_xor(
|
||||
v128_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
v128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
|
||||
dH[ 0] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[0],
|
||||
_mm_xor_si128( _mm_slli_epi32( xh, 5 ),
|
||||
_mm_srli_epi32( qt[16], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
|
||||
dH[ 1] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[1],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 7 ),
|
||||
_mm_slli_epi32( qt[17], 8 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
|
||||
dH[ 2] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[2],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 5 ),
|
||||
_mm_slli_epi32( qt[18], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
|
||||
dH[ 3] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[3],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 1 ),
|
||||
_mm_slli_epi32( qt[19], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
|
||||
dH[ 4] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[4],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 3 ),
|
||||
_mm_slli_epi32( qt[20], 0 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
|
||||
dH[ 5] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[5],
|
||||
_mm_xor_si128( _mm_slli_epi32( xh, 6 ),
|
||||
_mm_srli_epi32( qt[21], 6 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
|
||||
dH[ 6] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[6],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 4 ),
|
||||
_mm_slli_epi32( qt[22], 6 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
|
||||
dH[ 7] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[7],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 11 ),
|
||||
_mm_slli_epi32( qt[23], 2 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
|
||||
dH[ 8] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[4], 9 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
|
||||
_mm_xor_si128( _mm_slli_epi32( xl, 8 ),
|
||||
_mm_xor_si128( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[5], 10 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
|
||||
_mm_xor_si128( _mm_srli_epi32( xl, 6 ),
|
||||
_mm_xor_si128( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[6], 11 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
|
||||
_mm_xor_si128( _mm_slli_epi32( xl, 6 ),
|
||||
_mm_xor_si128( qt[17], qt[10] ) ) );
|
||||
dH[11] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[7], 12 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
|
||||
_mm_xor_si128( _mm_slli_epi32( xl, 4 ),
|
||||
_mm_xor_si128( qt[18], qt[11] ) ) );
|
||||
dH[12] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[0], 13 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
|
||||
_mm_xor_si128( _mm_srli_epi32( xl, 3 ),
|
||||
_mm_xor_si128( qt[19], qt[12] ) ) );
|
||||
dH[13] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[1], 14 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
|
||||
_mm_xor_si128( _mm_srli_epi32( xl, 4 ),
|
||||
_mm_xor_si128( qt[20], qt[13] ) ) );
|
||||
dH[14] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[2], 15 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
|
||||
_mm_xor_si128( _mm_srli_epi32( xl, 7 ),
|
||||
_mm_xor_si128( qt[21], qt[14] ) ) );
|
||||
dH[15] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[3], 16 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
|
||||
_mm_xor_si128( _mm_srli_epi32( xl, 2 ),
|
||||
_mm_xor_si128( qt[22], qt[15] ) ) );
|
||||
dH[ 0] = v128_add32(
|
||||
v128_xor( M[0],
|
||||
v128_xor( v128_sl32( xh, 5 ),
|
||||
v128_sr32( qt[16], 5 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[24] ), qt[ 0] ));
|
||||
dH[ 1] = v128_add32(
|
||||
v128_xor( M[1],
|
||||
v128_xor( v128_sr32( xh, 7 ),
|
||||
v128_sl32( qt[17], 8 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[25] ), qt[ 1] ));
|
||||
dH[ 2] = v128_add32(
|
||||
v128_xor( M[2],
|
||||
v128_xor( v128_sr32( xh, 5 ),
|
||||
v128_sl32( qt[18], 5 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[26] ), qt[ 2] ));
|
||||
dH[ 3] = v128_add32(
|
||||
v128_xor( M[3],
|
||||
v128_xor( v128_sr32( xh, 1 ),
|
||||
v128_sl32( qt[19], 5 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[27] ), qt[ 3] ));
|
||||
dH[ 4] = v128_add32(
|
||||
v128_xor( M[4],
|
||||
v128_xor( v128_sr32( xh, 3 ),
|
||||
v128_sl32( qt[20], 0 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[28] ), qt[ 4] ));
|
||||
dH[ 5] = v128_add32(
|
||||
v128_xor( M[5],
|
||||
v128_xor( v128_sl32( xh, 6 ),
|
||||
v128_sr32( qt[21], 6 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[29] ), qt[ 5] ));
|
||||
dH[ 6] = v128_add32(
|
||||
v128_xor( M[6],
|
||||
v128_xor( v128_sr32( xh, 4 ),
|
||||
v128_sl32( qt[22], 6 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[30] ), qt[ 6] ));
|
||||
dH[ 7] = v128_add32(
|
||||
v128_xor( M[7],
|
||||
v128_xor( v128_sr32( xh, 11 ),
|
||||
v128_sl32( qt[23], 2 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[31] ), qt[ 7] ));
|
||||
dH[ 8] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[4], 9 ),
|
||||
v128_xor( v128_xor( xh, qt[24] ), M[ 8] )),
|
||||
v128_xor( v128_sl32( xl, 8 ),
|
||||
v128_xor( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[5], 10 ),
|
||||
v128_xor( v128_xor( xh, qt[25] ), M[ 9] )),
|
||||
v128_xor( v128_sr32( xl, 6 ),
|
||||
v128_xor( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[6], 11 ),
|
||||
v128_xor( v128_xor( xh, qt[26] ), M[10] )),
|
||||
v128_xor( v128_sl32( xl, 6 ),
|
||||
v128_xor( qt[17], qt[10] ) ) );
|
||||
dH[11] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[7], 12 ),
|
||||
v128_xor( v128_xor( xh, qt[27] ), M[11] )),
|
||||
v128_xor( v128_sl32( xl, 4 ),
|
||||
v128_xor( qt[18], qt[11] ) ) );
|
||||
dH[12] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[0], 13 ),
|
||||
v128_xor( v128_xor( xh, qt[28] ), M[12] )),
|
||||
v128_xor( v128_sr32( xl, 3 ),
|
||||
v128_xor( qt[19], qt[12] ) ) );
|
||||
dH[13] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[1], 14 ),
|
||||
v128_xor( v128_xor( xh, qt[29] ), M[13] )),
|
||||
v128_xor( v128_sr32( xl, 4 ),
|
||||
v128_xor( qt[20], qt[13] ) ) );
|
||||
dH[14] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[2], 15 ),
|
||||
v128_xor( v128_xor( xh, qt[30] ), M[14] )),
|
||||
v128_xor( v128_sr32( xl, 7 ),
|
||||
v128_xor( qt[21], qt[14] ) ) );
|
||||
dH[15] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[3], 16 ),
|
||||
v128_xor( v128_xor( xh, qt[31] ), M[15] )),
|
||||
v128_xor( v128_sr32( xl, 2 ),
|
||||
v128_xor( qt[22], qt[15] ) ) );
|
||||
}
|
||||
|
||||
static const uint32_t final_s[16][4] =
|
||||
@@ -429,7 +429,7 @@ static const uint32_t final_s[16][4] =
|
||||
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
|
||||
};
|
||||
/*
|
||||
static const __m128i final_s[16] =
|
||||
static const v128u64_t final_s[16] =
|
||||
{
|
||||
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
|
||||
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
|
||||
@@ -451,26 +451,26 @@ static const __m128i final_s[16] =
|
||||
*/
|
||||
void bmw256_4way_init( bmw256_4way_context *ctx )
|
||||
{
|
||||
ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
|
||||
ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
|
||||
ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
|
||||
ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
|
||||
ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
|
||||
ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
|
||||
ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
|
||||
ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
|
||||
ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
|
||||
ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
|
||||
ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
|
||||
ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
|
||||
ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
|
||||
ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
|
||||
ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
|
||||
ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
|
||||
ctx->H[ 0] = v128_64( 0x4041424340414243 );
|
||||
ctx->H[ 1] = v128_64( 0x4445464744454647 );
|
||||
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
|
||||
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
|
||||
ctx->H[ 4] = v128_64( 0x5051525350515253 );
|
||||
ctx->H[ 5] = v128_64( 0x5455565754555657 );
|
||||
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
|
||||
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
|
||||
ctx->H[ 8] = v128_64( 0x6061626360616263 );
|
||||
ctx->H[ 9] = v128_64( 0x6465666764656667 );
|
||||
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
|
||||
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
|
||||
ctx->H[12] = v128_64( 0x7071727370717273 );
|
||||
ctx->H[13] = v128_64( 0x7475767774757677 );
|
||||
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
|
||||
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
|
||||
|
||||
|
||||
// for ( int i = 0; i < 16; i++ )
|
||||
// sc->H[i] = _mm_set1_epi32( iv[i] );
|
||||
// sc->H[i] = v128_32( iv[i] );
|
||||
ctx->ptr = 0;
|
||||
ctx->bit_count = 0;
|
||||
}
|
||||
@@ -478,10 +478,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
|
||||
static void
|
||||
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
||||
{
|
||||
__m128i *vdata = (__m128i*)data;
|
||||
__m128i *buf;
|
||||
__m128i htmp[16];
|
||||
__m128i *h1, *h2;
|
||||
v128u64_t *vdata = (v128u64_t*)data;
|
||||
v128u64_t *buf;
|
||||
v128u64_t htmp[16];
|
||||
v128u64_t *h1, *h2;
|
||||
size_t ptr;
|
||||
const int buf_size = 64; // bytes of one lane, compatible with len
|
||||
|
||||
@@ -497,13 +497,13 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
|
||||
v128_memcpy( buf + (ptr>>2), vdata, clen >> 2 );
|
||||
vdata += ( clen >> 2 );
|
||||
len -= clen;
|
||||
ptr += clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
__m128i *ht;
|
||||
v128u64_t *ht;
|
||||
compress_small( buf, h1, h2 );
|
||||
ht = h1;
|
||||
h1 = h2;
|
||||
@@ -513,46 +513,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
||||
}
|
||||
sc->ptr = ptr;
|
||||
|
||||
|
||||
if ( h1 != sc->H )
|
||||
memcpy_128( sc->H, h1, 16 );
|
||||
v128_memcpy( sc->H, h1, 16 );
|
||||
}
|
||||
|
||||
static void
|
||||
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32)
|
||||
{
|
||||
__m128i *buf;
|
||||
__m128i h1[16], h2[16], *h;
|
||||
v128u64_t *buf;
|
||||
v128u64_t h1[16], h2[16], *h;
|
||||
size_t ptr, u, v;
|
||||
const int buf_size = 64; // bytes of one lane, compatible with len
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
|
||||
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
h = sc->H;
|
||||
|
||||
// assume bit_count fits in 32 bits
|
||||
if ( ptr > buf_size - 4 )
|
||||
{
|
||||
memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
|
||||
v128_memset_zero( buf + (ptr>>2), (buf_size - ptr) >> 2 );
|
||||
compress_small( buf, h, h1 );
|
||||
ptr = 0;
|
||||
h = h1;
|
||||
}
|
||||
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
|
||||
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
|
||||
buf[ (buf_size - 4) >> 2 ] = m128_zero;
|
||||
v128_memset_zero( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
|
||||
buf[ (buf_size - 8) >> 2 ] = v128_32( sc->bit_count + n );
|
||||
buf[ (buf_size - 4) >> 2 ] = v128_zero;
|
||||
compress_small( buf, h, h2 );
|
||||
|
||||
for ( u = 0; u < 16; u ++ )
|
||||
buf[u] = h2[u];
|
||||
|
||||
compress_small( buf, (__m128i*)final_s, h1 );
|
||||
compress_small( buf, (v128u64_t*)final_s, h1 );
|
||||
|
||||
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
|
||||
casti_m128i( dst, u ) = h1[v];
|
||||
casti_v128( dst, u ) = h1[v];
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -2,12 +2,11 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
//#include "sph_keccak.h"
|
||||
#include "bmw-hash-4way.h"
|
||||
|
||||
#if defined(BMW512_8WAY)
|
||||
|
||||
void bmw512hash_8way(void *state, const void *input)
|
||||
void bmw512hash_8way( void *state, const void *input )
|
||||
{
|
||||
bmw512_8way_context ctx;
|
||||
bmw512_8way_init( &ctx );
|
||||
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
__m512i *noncev = (__m512i*)vdata + 9;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id;
|
||||
const int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
do {
|
||||
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined(BMW512_4WAY)
|
||||
|
||||
//#ifdef BMW512_4WAY
|
||||
|
||||
void bmw512hash_4way(void *state, const void *input)
|
||||
void bmw512hash_4way( void *state, const void *input )
|
||||
{
|
||||
bmw512_4way_context ctx;
|
||||
bmw512_4way_init( &ctx );
|
||||
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
const int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
do {
|
||||
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
   return 0;
}

#elif defined(BMW512_2WAY)

void bmw512hash_2x64( void *state, const void *input )
{
   bmw512_2x64_context ctx;
   bmw512_2x64_init( &ctx );
   bmw512_2x64_update( &ctx, input, 80 );
   bmw512_2x64_close( &ctx, state );
}

int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
   uint32_t hash[16*2] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[13]); // 3*4+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 2;
   v128_t *noncev = (v128_t*)vdata + 9;
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id;

   v128_bswap32_intrlv80_2x64( vdata, pdata );
   do {
      *noncev = v128_intrlv_blend_32( v128_bswap32(
                                v128_set32( n+1, 0, n, 0 ) ), *noncev );

      bmw512hash_2x64( hash, vdata );

      for ( int lane = 0; lane < 2; lane++ )
      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
         extr_lane_2x64( lane_hash, hash, lane, 256 );
         if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
         {
            pdata[19] = n + lane;
            submit_solution( work, lane_hash, mythr );
         }
      }
      n += 2;

   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

   *hashes_done = n - first_nonce;
   return 0;
}

#endif
@@ -2,7 +2,7 @@

bool register_bmw512_algo( algo_gate_t* gate )
{
   gate->optimizations = AVX2_OPT | AVX512_OPT;
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
   opt_target_factor = 256.0;
#if defined (BMW512_8WAY)
   gate->scanhash = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
#elif defined (BMW512_4WAY)
   gate->scanhash = (void*)&scanhash_bmw512_4way;
   gate->hash = (void*)&bmw512hash_4way;
#elif defined (BMW512_2WAY)
   gate->scanhash = (void*)&scanhash_bmw512_2x64;
   gate->hash = (void*)&bmw512hash_2x64;
#else
   gate->scanhash = (void*)&scanhash_bmw512;
   gate->hash = (void*)&bmw512hash;
@@ -8,19 +8,27 @@
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define BMW512_2WAY 1
#endif

#if defined(BMW512_8WAY)

void bmw512hash_8way( void *state, const void *input );
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
                          uint64_t *hashes_done, struct thr_info *mythr );

#elif defined(BMW512_4WAY)

void bmw512hash_4way( void *state, const void *input );
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
                          uint64_t *hashes_done, struct thr_info *mythr );

#elif defined(BMW512_2WAY)

void bmw512hash_2x64( void *state, const void *input );
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );

#else

@@ -236,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
HashReturn init_echo(hashState_echo *ctx, int nHashSize)
|
||||
HashReturn init_echo( hashState_echo *ctx, int nHashSize )
|
||||
{
|
||||
int i, j;
|
||||
|
||||
@@ -280,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
|
||||
HashReturn update_echo( hashState_echo *state, const void *data,
|
||||
uint32_t databitlen )
|
||||
{
|
||||
unsigned int uByteLength, uBlockCount, uRemainingBytes;
|
||||
|
||||
@@ -330,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
HashReturn final_echo( hashState_echo *state, void *hashval)
|
||||
{
|
||||
v128_t remainingbits;
|
||||
|
||||
@@ -407,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
const BitSequence *data, DataLength databitlen )
|
||||
HashReturn update_final_echo( hashState_echo *state, void *hashval,
|
||||
const void *data, uint32_t databitlen )
|
||||
{
|
||||
unsigned int uByteLength, uBlockCount, uRemainingBytes;
|
||||
|
||||
@@ -530,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
int nHashSize, const BitSequence *data, DataLength datalen )
|
||||
HashReturn echo_full( hashState_echo *state, void *hashval,
|
||||
int nHashSize, const void *data, uint32_t datalen )
|
||||
{
|
||||
int i, j;
|
||||
|
||||
@@ -578,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
{
|
||||
// Fill the buffer
|
||||
memcpy( state->buffer + state->uBufferBytes,
|
||||
(void*)data, state->uBlockLength - state->uBufferBytes );
|
||||
data, state->uBlockLength - state->uBufferBytes );
|
||||
|
||||
// Process buffer
|
||||
Compress( state, state->buffer, 1 );
|
||||
@@ -601,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
}
|
||||
|
||||
if( uRemainingBytes > 0 )
|
||||
memcpy(state->buffer, (void*)data, uRemainingBytes);
|
||||
memcpy(state->buffer, data, uRemainingBytes);
|
||||
|
||||
state->uBufferBytes = uRemainingBytes;
|
||||
}
|
||||
@@ -689,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
}
|
||||
|
||||
|
||||
|
||||
#if 0
|
||||
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
|
||||
{
|
||||
HashReturn hRet;
|
||||
@@ -746,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);
|
||||
|
||||
HashReturn reinit_echo(hashState_echo *state);
|
||||
|
||||
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
|
||||
HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);
|
||||
|
||||
HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
|
||||
HashReturn final_echo(hashState_echo *state, void *hashval);
|
||||
|
||||
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
|
||||
HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);
|
||||
|
||||
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
const BitSequence *data, DataLength databitlen );
|
||||
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
int nHashSize, const BitSequence *data, DataLength databitlen );
|
||||
HashReturn update_final_echo( hashState_echo *state, void *hashval,
|
||||
const void *data, uint32_t databitlen );
|
||||
HashReturn echo_full( hashState_echo *state, void *hashval,
|
||||
int nHashSize, const void *data, uint32_t databitlen );
|
||||
|
||||
#endif // HASH_API_H
|
||||
|
||||
|
@@ -36,7 +36,6 @@

#include "sph_echo.h"

#if !defined(__AES__)

#ifdef __cplusplus
extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif
#endif // !AES
@@ -36,8 +36,6 @@
#ifndef SPH_ECHO_H__
#define SPH_ECHO_H__

#if !defined(__AES__)

#ifdef __cplusplus
extern "C"{
#endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
#ifdef __cplusplus
}
#endif
#endif // !AES
#endif
@@ -146,7 +146,7 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
|
||||
#define SUBSTITUTE(r0, _t2 )\
|
||||
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
|
||||
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
|
||||
_t2 = _mm_aesenclast_si128( _t2, v128_zero )
|
||||
|
||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t2 = t0;\
|
||||
@@ -162,16 +162,16 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||
t2 = mm128_xor3(t2, t3, t0 );\
|
||||
t2 = v128_xor3(t2, t3, t0 );\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||
t4 = mm128_xor3( t4, t1, t2 ); \
|
||||
t4 = v128_xor3( t4, t1, t2 ); \
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
||||
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
||||
t4 = mm128_xor3( t4, t2, t1 ); \
|
||||
t4 = v128_xor3( t4, t2, t1 ); \
|
||||
t0 = _mm_xor_si128(t0, t3);\
|
||||
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
|
||||
t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
|
||||
|
||||
/*
|
||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||
@@ -188,7 +188,7 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t2 = mm128_xor3(t2, t3, t0 );\
|
||||
t2 = v128_xor3(t2, t3, t0 );\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||
t4 = _mm_xor_si128(t4, t2);\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||
@@ -485,7 +485,7 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
|
||||
ctx->uBlockLength = 4;
|
||||
|
||||
for(i = 0; i < 6; i++)
|
||||
ctx->state[i] = m128_zero;
|
||||
ctx->state[i] = v128_zero;
|
||||
|
||||
ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
|
||||
ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
|
||||
|
@@ -66,7 +66,40 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
|
||||
|
||||
#define gr_shuffle32( v ) v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
|
||||
|
||||
//#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
|
||||
/*
|
||||
#define TRANSP_MASK \
|
||||
0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2
|
||||
#define SUBSH_MASK0 \
|
||||
0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8
|
||||
#define SUBSH_MASK1 \
|
||||
0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9
|
||||
#define SUBSH_MASK2 \
|
||||
0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa
|
||||
#define SUBSH_MASK3 \
|
||||
0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb
|
||||
#define SUBSH_MASK4 \
|
||||
0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc
|
||||
#define SUBSH_MASK5 \
|
||||
0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd
|
||||
#define SUBSH_MASK6 \
|
||||
0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe
|
||||
#define SUBSH_MASK7 \
|
||||
0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3
|
||||
|
||||
//#define gr_shuffle8( v, c ) v128_shullfev8( v, c )
|
||||
|
||||
|
||||
#define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \
|
||||
c07, c06, c05, c04, c03, c02, c01, c00 ) \
|
||||
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
|
||||
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
|
||||
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
|
||||
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
|
||||
v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \
|
||||
11, v, c11 ), 10, v, c10 ), 9, v, c09 ), 8, v, c08 ), \
|
||||
7, v, c07 ), 6, v, c06 ), 5, v, c05 ), 4, v, c04 ), \
|
||||
3, v, c03 ), 2, v, c02 ), 1, v, c01 ), 0, v, c00 )
|
||||
*/
|
||||
|
||||
#else
|
||||
|
||||
|
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =

#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
   /* AddRoundConstant */\
   b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
   b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \
   a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
   a1 = _mm256_xor_si256( a1, b1 );\
   a2 = _mm256_xor_si256( a2, b1 );\
@@ -35,8 +35,6 @@

#include "sph_groestl.h"

#if !defined(__AES__)

#ifdef __cplusplus
extern "C"{
#endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}

#endif // !AES
#endif
@@ -42,7 +42,6 @@ extern "C"{
#include <stddef.h>
#include "compat/sph_types.h"

#if !defined(__AES__)
/**
 * Output size (in bits) for Groestl-224.
 */
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
}
#endif

#endif // !AES
#endif
@@ -38,7 +38,7 @@
#include <stddef.h>
#include "simd-utils.h"

// SSE2 or NEON Hamsi-512 2x64
#if defined(__SSE4_2__) || defined(__ARM_NEON)

typedef struct
{
@@ -57,6 +57,8 @@ void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
                        size_t len );
void hamsi512_2x64( void *dst, const void *data, size_t len );

#endif

#if defined (__AVX2__)

// Hamsi-512 4x64
@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
   return 0;
}

#elif defined(KECCAK_2WAY)

void keccakhash_2x64(void *state, const void *input)
{
   keccak256_2x64_context ctx;
   keccak256_2x64_init( &ctx );
   keccak256_2x64_update( &ctx, input, 80 );
   keccak256_2x64_close( &ctx, state );
}

int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
   uint32_t hash[16*2] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[13]); // 3*4+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   v128_t *noncev = (v128_t*)vdata + 9;
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;

   v128_bswap32_intrlv80_2x64( vdata, pdata );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do {
      keccakhash_2x64( hash, vdata );

      for ( int lane = 0; lane < 2; lane++ )
      if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
      {
         extr_lane_2x64( lane_hash, hash, lane, 256 );
         if ( valid_hash( lane_hash, ptarget ))
         {
            pdata[19] = bswap_32( n + lane );
            submit_solution( work, lane_hash, mythr );
         }
      }
      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
      n += 2;
   } while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
   pdata[19] = n;
   *hashes_done = n - first_nonce + 1;
   return 0;
}

#endif
@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
bool register_sha3d_algo( algo_gate_t* gate )
{
hard_coded_eb = 6;
// opt_extranonce = false;
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
#if defined (SHA3D_8WAY)
gate->scanhash = (void*)&scanhash_sha3d_8way;
gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
#elif defined (SHA3D_4WAY)
gate->scanhash = (void*)&scanhash_sha3d_4way;
gate->hash = (void*)&sha3d_hash_4way;
#elif defined (SHA3D_2WAY)
gate->scanhash = (void*)&scanhash_sha3d_2x64;
gate->hash = (void*)&sha3d_hash_2x64;
#else
gate->scanhash = (void*)&scanhash_sha3d;
gate->hash = (void*)&sha3d_hash;

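For reference, a registered gate is exercised through the same function-pointer slots the code above fills in. A minimal sketch of the call site, assuming the pointer has already been cast back to the scanhash signature used throughout this file:

// Hypothetical dispatch: the miner thread calls whatever implementation
// register_keccak_algo() / register_sha3d_algo() selected at compile time.
uint64_t hashes_done = 0;
int found = gate->scanhash( work, max_nonce, &hashes_done, mythr );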
@@ -8,6 +8,16 @@
|
||||
#define KECCAK_8WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define KECCAK_4WAY 1
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define KECCAK_2WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA3D_8WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SHA3D_4WAY 1
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define SHA3D_2WAY 1
|
||||
#endif
|
||||
|
||||
extern int hard_coded_eb;
|
||||
@@ -16,27 +26,47 @@ extern int hard_coded_eb;
|
||||
|
||||
void keccakhash_8way( void *state, const void *input );
|
||||
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void sha3d_hash_8way( void *state, const void *input );
|
||||
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(KECCAK_4WAY)
|
||||
|
||||
void keccakhash_4way( void *state, const void *input );
|
||||
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void sha3d_hash_4way( void *state, const void *input );
|
||||
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#elif defined(KECCAK_2WAY)
|
||||
|
||||
void keccakhash_2x64( void *state, const void *input );
|
||||
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#else
|
||||
|
||||
void keccakhash( void *state, const void *input );
|
||||
int scanhash_keccak( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA3D_8WAY)
|
||||
|
||||
void sha3d_hash_8way( void *state, const void *input );
|
||||
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(SHA3D_4WAY)
|
||||
|
||||
void sha3d_hash_4way( void *state, const void *input );
|
||||
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(SHA3D_2WAY)
|
||||
|
||||
void sha3d_hash_2x64( void *state, const void *input );
|
||||
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#else
|
||||
|
||||
void sha3d_hash( void *state, const void *input );
|
||||
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
|
||||
|
@@ -4,7 +4,7 @@
|
||||
#include <stdint.h>
|
||||
#include "keccak-hash-4way.h"
|
||||
|
||||
#if defined(KECCAK_8WAY)
|
||||
#if defined(SHA3D_8WAY)
|
||||
|
||||
void sha3d_hash_8way(void *state, const void *input)
|
||||
{
|
||||
@@ -64,7 +64,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(KECCAK_4WAY)
|
||||
#elif defined(SHA3D_4WAY)
|
||||
|
||||
void sha3d_hash_4way(void *state, const void *input)
|
||||
{
|
||||
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SHA3D_2WAY)
|
||||
|
||||
void sha3d_hash_2x64(void *state, const void *input)
|
||||
{
|
||||
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
|
||||
keccak256_2x64_context ctx;
|
||||
|
||||
keccak256_2x64_init( &ctx );
|
||||
keccak256_2x64_update( &ctx, input, 80 );
|
||||
keccak256_2x64_close( &ctx, buffer );
|
||||
|
||||
keccak256_2x64_init( &ctx );
|
||||
keccak256_2x64_update( &ctx, buffer, 32 );
|
||||
keccak256_2x64_close( &ctx, state );
|
||||
}
|
||||
|
||||
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[16*2] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[13]); // 3*4+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
v128_t *noncev = (v128_t*)vdata + 9;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||
do {
|
||||
sha3d_hash_2x64( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 2; lane++ )
|
||||
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
|
||||
{
|
||||
extr_lane_2x64( lane_hash, hash, lane, 256 );
|
||||
if ( valid_hash( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
|
||||
n += 2;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -75,16 +75,16 @@
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
v128_t t = a0; \
a0 = mm128_xoror( a3, a0, a1 ); \
a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm128_xorand( a2, a3, t ); \
a2 = mm128_xorand( a1, a2, a0 ); \
a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = mm128_xnor( a1, a0 ); \
a1 = v128_xnor( a1, a0 ); \
a0 = t; \
}

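The SUBCRUMB rewrite above only swaps the mm128_* helpers for the portable v128_* forms. As a reading aid, plain SSE2 equivalents of the three-input helpers, assuming xoror, xorand and xnor keep their usual cpuminer-opt meaning of a ^ (b | c), a ^ (b & c) and ~(a ^ b):

// Assumed semantics only; the real definitions live in simd-utils.
#include <emmintrin.h>

static inline __m128i sse2_xoror( __m128i a, __m128i b, __m128i c )
{  return _mm_xor_si128( a, _mm_or_si128( b, c ) );  }            // a ^ (b | c)

static inline __m128i sse2_xorand( __m128i a, __m128i b, __m128i c )
{  return _mm_xor_si128( a, _mm_and_si128( b, c ) );  }           // a ^ (b & c)

static inline __m128i sse2_xnor( __m128i a, __m128i b )
{  return _mm_xor_si128( _mm_xor_si128( a, b ),
                         _mm_set1_epi32( -1 ) );  }               // ~(a ^ b)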
@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
init_hmq1725_ctx();
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 65536.0;
return true;
};

@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
void hmq1725hash( void *state, const void *input );
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_hmq1725_ctx();

#endif

@@ -4,346 +4,267 @@
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/fugue/fugue-aesni.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#endif
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#else
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/fugue/fugue-aesni.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#endif
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#if defined(__aarch64__)
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#else
|
||||
#include "algo/simd/nist.h"
|
||||
#endif
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
|
||||
typedef struct {
|
||||
sph_blake512_context blake1, blake2;
|
||||
sph_bmw512_context bmw1, bmw2, bmw3;
|
||||
sph_skein512_context skein1, skein2;
|
||||
sph_jh512_context jh1, jh2;
|
||||
sph_keccak512_context keccak1, keccak2;
|
||||
hashState_luffa luffa1, luffa2;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite1, shavite2;
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_context simd1, simd2;
|
||||
#else
|
||||
hashState_sd simd1, simd2;
|
||||
#endif
|
||||
sph_hamsi512_context hamsi1;
|
||||
sph_shabal512_context shabal1;
|
||||
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
|
||||
sph_sha512_context sha1, sha2;
|
||||
sph_haval256_5_context haval1, haval2;
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo1, echo2;
|
||||
hashState_groestl groestl1, groestl2;
|
||||
hashState_fugue fugue1, fugue2;
|
||||
#else
|
||||
sph_groestl512_context groestl1, groestl2;
|
||||
sph_echo512_context echo1, echo2;
|
||||
sph_fugue512_context fugue1, fugue2;
|
||||
#endif
|
||||
} hmq1725_ctx_holder;
|
||||
|
||||
static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
|
||||
static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
|
||||
|
||||
void init_hmq1725_ctx()
|
||||
union _hmq1725_ctx_holder
|
||||
{
|
||||
sph_blake512_init(&hmq1725_ctx.blake1);
|
||||
sph_blake512_init(&hmq1725_ctx.blake2);
|
||||
|
||||
sph_bmw512_init(&hmq1725_ctx.bmw1);
|
||||
sph_bmw512_init(&hmq1725_ctx.bmw2);
|
||||
sph_bmw512_init(&hmq1725_ctx.bmw3);
|
||||
|
||||
sph_skein512_init(&hmq1725_ctx.skein1);
|
||||
sph_skein512_init(&hmq1725_ctx.skein2);
|
||||
|
||||
sph_jh512_init(&hmq1725_ctx.jh1);
|
||||
sph_jh512_init(&hmq1725_ctx.jh2);
|
||||
|
||||
sph_keccak512_init(&hmq1725_ctx.keccak1);
|
||||
sph_keccak512_init(&hmq1725_ctx.keccak2);
|
||||
|
||||
init_luffa( &hmq1725_ctx.luffa1, 512 );
|
||||
init_luffa( &hmq1725_ctx.luffa2, 512 );
|
||||
|
||||
cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
|
||||
|
||||
sph_shavite512_init(&hmq1725_ctx.shavite1);
|
||||
sph_shavite512_init(&hmq1725_ctx.shavite2);
|
||||
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_init(&hmq1725_ctx.simd1);
|
||||
sph_simd512_init(&hmq1725_ctx.simd2);
|
||||
#else
|
||||
init_sd( &hmq1725_ctx.simd1, 512 );
|
||||
init_sd( &hmq1725_ctx.simd2, 512 );
|
||||
#endif
|
||||
|
||||
sph_hamsi512_init(&hmq1725_ctx.hamsi1);
|
||||
|
||||
blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
fugue512_Init( &hmq1725_ctx.fugue1, 512 );
|
||||
fugue512_Init( &hmq1725_ctx.fugue2, 512 );
|
||||
hashState_groestl groestl;
|
||||
hashState_fugue fugue;
|
||||
#else
|
||||
sph_fugue512_init(&hmq1725_ctx.fugue1);
|
||||
sph_fugue512_init(&hmq1725_ctx.fugue2);
|
||||
sph_groestl512_context groestl;
|
||||
sph_fugue512_context fugue;
|
||||
#endif
|
||||
|
||||
sph_shabal512_init(&hmq1725_ctx.shabal1);
|
||||
|
||||
sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
|
||||
sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
|
||||
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
|
||||
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
|
||||
|
||||
sph_sha512_init( &hmq1725_ctx.sha1 );
|
||||
sph_sha512_init( &hmq1725_ctx.sha2 );
|
||||
|
||||
sph_haval256_5_init(&hmq1725_ctx.haval1);
|
||||
sph_haval256_5_init(&hmq1725_ctx.haval2);
|
||||
|
||||
#if defined(__AES__)
|
||||
init_echo( &hmq1725_ctx.echo1, 512 );
|
||||
init_echo( &hmq1725_ctx.echo2, 512 );
|
||||
init_groestl( &hmq1725_ctx.groestl1, 64 );
|
||||
init_groestl( &hmq1725_ctx.groestl2, 64 );
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
hashState_echo echo;
|
||||
#else
|
||||
sph_groestl512_init( &hmq1725_ctx.groestl1 );
|
||||
sph_groestl512_init( &hmq1725_ctx.groestl2 );
|
||||
sph_echo512_init( &hmq1725_ctx.echo1 );
|
||||
sph_echo512_init( &hmq1725_ctx.echo2 );
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
}
|
||||
sph_skein512_context skein;
|
||||
sph_jh512_context jh;
|
||||
sph_keccak512_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
simd512_context simd;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sph_sha512_context sha;
|
||||
sph_haval256_5_context haval;
|
||||
};
|
||||
typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
|
||||
|
||||
void hmq_bmw512_midstate( const void* input )
|
||||
{
|
||||
memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
|
||||
sph_bmw512( &hmq_bmw_mid, input, 64 );
|
||||
}
|
||||
|
||||
__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
|
||||
//static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
|
||||
//static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
|
||||
|
||||
extern void hmq1725hash(void *state, const void *input)
|
||||
{
|
||||
const uint32_t mask = 24;
|
||||
uint32_t hashA[32] __attribute__((aligned(64)));
|
||||
uint32_t hashB[32] __attribute__((aligned(64)));
|
||||
const int midlen = 64; // bytes
|
||||
const int tail = 80 - midlen; // 16
|
||||
uint32_t hashA[32] __attribute__((aligned(32)));
|
||||
uint32_t hashB[32] __attribute__((aligned(32)));
|
||||
hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
|
||||
sph_bmw512_init( &ctx.bmw );
|
||||
sph_bmw512( &ctx.bmw, input, 80 );
|
||||
sph_bmw512_close( &ctx.bmw, hashA ); //1
|
||||
|
||||
memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
|
||||
sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
|
||||
sph_bmw512_close(&h_ctx.bmw1, hashA); //1
|
||||
|
||||
sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0
|
||||
sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //0
|
||||
sph_whirlpool_close( &ctx.whirlpool, hashB ); //1
|
||||
|
||||
if ( hashB[0] & mask ) //1
|
||||
{
|
||||
#if defined(__AES__)
|
||||
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
|
||||
(const char*)hashB, 512 );
|
||||
groestl512_full( &ctx.groestl, hashA, hashB, 512 );
|
||||
#else
|
||||
sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
|
||||
sph_groestl512_close(&h_ctx.groestl1, hashA); //2
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, hashB, 64 ); //1
|
||||
sph_groestl512_close( &ctx.groestl, hashA ); //2
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_skein512 (&h_ctx.skein1, hashB, 64); //1
|
||||
sph_skein512_close(&h_ctx.skein1, hashA); //2
|
||||
sph_skein512_init( &ctx.skein );
|
||||
sph_skein512( &ctx.skein, hashB, 64 ); //1
|
||||
sph_skein512_close( &ctx.skein, hashA ); //2
|
||||
}
|
||||
|
||||
sph_jh512 (&h_ctx.jh1, hashA, 64); //3
|
||||
sph_jh512_close(&h_ctx.jh1, hashB); //4
|
||||
sph_jh512_init( &ctx.jh );
|
||||
sph_jh512( &ctx.jh, hashA, 64 ); //3
|
||||
sph_jh512_close( &ctx.jh, hashB ); //4
|
||||
|
||||
sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2
|
||||
sph_keccak512_close(&h_ctx.keccak1, hashA); //3
|
||||
sph_keccak512_init( &ctx.keccak );
|
||||
sph_keccak512( &ctx.keccak, hashB, 64 ); //2
|
||||
sph_keccak512_close( &ctx.keccak, hashA ); //3
|
||||
|
||||
if ( hashA[0] & mask ) //4
|
||||
{
|
||||
sph_blake512 (&h_ctx.blake1, hashA, 64); //
|
||||
sph_blake512_close(&h_ctx.blake1, hashB); //5
|
||||
blake512_init( &ctx.blake );
|
||||
blake512_update( &ctx.blake, hashA, 64 );
|
||||
blake512_close( &ctx.blake, hashB );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4
|
||||
sph_bmw512_close(&h_ctx.bmw2, hashB); //5
|
||||
sph_bmw512_init( &ctx.bmw );
|
||||
sph_bmw512( &ctx.bmw, hashA, 64 ); //4
|
||||
sph_bmw512_close( &ctx.bmw, hashB ); //5
|
||||
}
|
||||
|
||||
update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
|
||||
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
|
||||
|
||||
cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
|
||||
cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );
|
||||
|
||||
if ( hashB[0] & mask ) //7
|
||||
{
|
||||
sph_keccak512 (&h_ctx.keccak2, hashB, 64); //
|
||||
sph_keccak512_close(&h_ctx.keccak2, hashA); //8
|
||||
sph_keccak512_init( &ctx.keccak );
|
||||
sph_keccak512( &ctx.keccak, hashB, 64 ); //
|
||||
sph_keccak512_close( &ctx.keccak, hashA ); //8
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_jh512 (&h_ctx.jh2, hashB, 64); //7
|
||||
sph_jh512_close(&h_ctx.jh2, hashA); //8
|
||||
sph_jh512_init( &ctx.jh );
|
||||
sph_jh512( &ctx.jh, hashB, 64 ); //7
|
||||
sph_jh512_close( &ctx.jh, hashA ); //8
|
||||
}
|
||||
|
||||
sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3
|
||||
sph_shavite512_close(&h_ctx.shavite1, hashB); //4
|
||||
sph_shavite512_init( &ctx.shavite );
|
||||
sph_shavite512( &ctx.shavite, hashA, 64 ); //3
|
||||
sph_shavite512_close( &ctx.shavite, hashB ); //4
|
||||
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512 (&h_ctx.simd1, hashB, 64); //3
|
||||
sph_simd512_close(&h_ctx.simd1, hashA); //4
|
||||
#else
|
||||
update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
|
||||
(const BitSequence *)hashB, 512 );
|
||||
#endif
|
||||
simd512_ctx( &ctx.simd, hashA, hashB, 64 );
|
||||
|
||||
if ( hashA[0] & mask ) //4
|
||||
{
|
||||
sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); //
|
||||
sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //
|
||||
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4
|
||||
sph_haval256_5_close(&h_ctx.haval1, hashB); //5
|
||||
sph_haval256_5_init( &ctx.haval );
|
||||
sph_haval256_5( &ctx.haval, hashA, 64 ); //4
|
||||
sph_haval256_5_close( &ctx.haval, hashB ); //5
|
||||
memset(&hashB[8], 0, 32);
|
||||
}
|
||||
|
||||
#if defined(__AES__)
|
||||
update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
|
||||
(const BitSequence *)hashB, 512 );
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
echo_full( &ctx.echo, hashA, 512, hashB, 64 );
|
||||
#else
|
||||
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
|
||||
sph_echo512_close(&h_ctx.echo1, hashA); //6
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, hashB, 64 ); //5
|
||||
sph_echo512_close( &ctx.echo, hashA ); //6
|
||||
#endif
|
||||
|
||||
sph_blake512 (&h_ctx.blake2, hashA, 64); //6
|
||||
sph_blake512_close(&h_ctx.blake2, hashB); //7
|
||||
blake512_init( &ctx.blake );
|
||||
blake512_update( &ctx.blake, hashA, 64 );
|
||||
blake512_close( &ctx.blake, hashB );
|
||||
|
||||
if ( hashB[0] & mask ) //7
|
||||
{
|
||||
sph_shavite512 (&h_ctx.shavite2, hashB, 64); //
|
||||
sph_shavite512_close(&h_ctx.shavite2, hashA); //8
|
||||
sph_shavite512_init( &ctx.shavite );
|
||||
sph_shavite512( &ctx.shavite, hashB, 64 ); //
|
||||
sph_shavite512_close( &ctx.shavite, hashA ); //8
|
||||
}
|
||||
else
|
||||
{
|
||||
update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
|
||||
}
|
||||
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
|
||||
|
||||
sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3
|
||||
sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4
|
||||
sph_hamsi512_init( &ctx.hamsi );
|
||||
sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
|
||||
sph_hamsi512_close( &ctx.hamsi, hashB ); //4
|
||||
|
||||
#if defined(__AES__)
|
||||
fugue512_Update( &h_ctx.fugue1, hashB, 512 ); //2 ////
|
||||
fugue512_Final( &h_ctx.fugue1, hashA ); //3
|
||||
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
|
||||
#else
|
||||
sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 ////
|
||||
sph_fugue512_close(&h_ctx.fugue1, hashA); //3
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hashB, 64 ); //2 ////
|
||||
sph_fugue512_close( &ctx.fugue, hashA ); //3
|
||||
#endif
|
||||
|
||||
if ( hashA[0] & mask ) //4
|
||||
{
|
||||
#if defined(__AES__)
|
||||
update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
|
||||
(const BitSequence *)hashA, 512 );
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
echo_full( &ctx.echo, hashB, 512, hashA, 64 );
|
||||
#else
|
||||
sph_echo512 (&h_ctx.echo2, hashA, 64); //
|
||||
sph_echo512_close(&h_ctx.echo2, hashB); //5
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, hashA, 64 ); //
|
||||
sph_echo512_close( &ctx.echo, hashB ); //5
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512(&h_ctx.simd2, hashA, 64); //6
|
||||
sph_simd512_close(&h_ctx.simd2, hashB); //7
|
||||
#else
|
||||
update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
|
||||
(const BitSequence *)hashA, 512 );
|
||||
#endif
|
||||
}
|
||||
simd512_ctx( &ctx.simd, hashB, hashA, 64 );
|
||||
|
||||
sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5
|
||||
sph_shabal512_close(&h_ctx.shabal1, hashA); //6
|
||||
sph_shabal512_init( &ctx.shabal );
|
||||
sph_shabal512( &ctx.shabal, hashB, 64 ); //5
|
||||
sph_shabal512_close( &ctx.shabal, hashA ); //6
|
||||
|
||||
sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6
|
||||
sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //6
|
||||
sph_whirlpool_close( &ctx.whirlpool, hashB ); //7
|
||||
|
||||
if ( hashB[0] & mask ) //7
|
||||
{
|
||||
#if defined(__AES__)
|
||||
fugue512_Update( &h_ctx.fugue2, hashB, 512 ); //
|
||||
fugue512_Final( &h_ctx.fugue2, hashA ); //8
|
||||
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
|
||||
#else
|
||||
sph_fugue512 (&h_ctx.fugue2, hashB, 64); //
|
||||
sph_fugue512_close(&h_ctx.fugue2, hashA); //8
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hashB, 64 ); //
|
||||
sph_fugue512_close( &ctx.fugue, hashA ); //8
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_sha512( &h_ctx.sha1, hashB, 64 );
|
||||
sph_sha512_close( &h_ctx.sha1, hashA );
|
||||
sph_sha512_init( &ctx.sha );
|
||||
sph_sha512( &ctx.sha, hashB, 64 );
|
||||
sph_sha512_close( &ctx.sha, hashA );
|
||||
}
|
||||
|
||||
#if defined(__AES__)
|
||||
update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
|
||||
(const char*)hashA, 512 );
|
||||
groestl512_full( &ctx.groestl, hashB, hashA, 512 );
|
||||
#else
|
||||
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
|
||||
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, hashA, 64 ); //3
|
||||
sph_groestl512_close( &ctx.groestl, hashB ); //4
|
||||
#endif
|
||||
|
||||
sph_sha512( &h_ctx.sha2, hashB, 64 );
|
||||
sph_sha512_close( &h_ctx.sha2, hashA );
|
||||
sph_sha512_init( &ctx.sha );
|
||||
sph_sha512( &ctx.sha, hashB, 64 );
|
||||
sph_sha512_close( &ctx.sha, hashA );
|
||||
|
||||
if ( hashA[0] & mask ) //4
|
||||
{
|
||||
sph_haval256_5 (&h_ctx.haval2, hashA, 64); //
|
||||
sph_haval256_5_close(&h_ctx.haval2, hashB); //5
|
||||
memset(&hashB[8], 0, 32);
|
||||
sph_haval256_5_init( &ctx.haval );
|
||||
sph_haval256_5( &ctx.haval, hashA, 64 ); //
|
||||
sph_haval256_5_close( &ctx.haval, hashB ); //5
|
||||
memset( &hashB[8], 0, 32 );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4
|
||||
sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //4
|
||||
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
|
||||
}
|
||||
|
||||
sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5
|
||||
sph_bmw512_close(&h_ctx.bmw3, hashA); //6
|
||||
sph_bmw512_init( &ctx.bmw );
|
||||
sph_bmw512( &ctx.bmw, hashB, 64 ); //5
|
||||
sph_bmw512_close( &ctx.bmw, hashA ); //6
|
||||
|
||||
memcpy(state, hashA, 32);
|
||||
memcpy( state, hashA, 32 );
|
||||
}
|
||||
|
||||
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
// uint32_t endiandata[32] __attribute__((aligned(64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t hash64[8] __attribute__((aligned(64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(32)));
|
||||
uint32_t hash64[8] __attribute__((aligned(32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19] - 1;
|
||||
@@ -356,7 +277,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
|
||||
for (int k = 0; k < 20; k++)
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
|
||||
hmq_bmw512_midstate( endiandata );
|
||||
// hmq_bmw512_midstate( endiandata );
|
||||
|
||||
// if (opt_debug)
|
||||
// {
|
||||
|
@@ -35,13 +35,13 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )

#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, mm128_not( y ) ), z )
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )

#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )

#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, mm128_not( z ) ) )
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )

#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
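The only change in these RIPEMD round functions is mm128_not becoming v128_not. A minimal sketch of the assumed one's-complement helper on both back ends (all-ones XOR on SSE2, vmvnq on NEON), named differently here to make clear it is an illustration rather than the project's actual definition:

// Assumed portable NOT helper, matching how F3/F5 use it.
#if defined(__SSE2__)
  #include <emmintrin.h>
  #define my_v128_not( x )  _mm_xor_si128( (x), _mm_set1_epi32( -1 ) )
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  #define my_v128_not( x )  vmvnq_u32( x )
#endif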
@@ -319,7 +319,7 @@ int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
v128_t W[16]; memcpy_128( W, data, 16 );
v128_t W[16]; v128_memcpy( W, data, 16 );

A = v128_load( state_in );
B = v128_load( state_in+1 );
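Here memcpy_128 is replaced by the generic v128_memcpy. A sketch of the vector-count copy it is assumed to perform, using the v128_t type from the project's simd-utils headers:

// Assumed behaviour: copy n 128-bit vectors from src to dst.
static inline void my_v128_memcpy( v128_t *dst, const v128_t *src, int n )
{
   for ( int i = 0; i < n; i++ ) dst[i] = src[i];
}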
@@ -5,11 +5,11 @@
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA512256D_8WAY 1
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1
#define SHA512256D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA512256D_2WAY 1
#define SHA512256D_2WAY 1
#endif

#if defined(SHA512256D_8WAY)
@@ -110,14 +110,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = v256_64( 0x0000000400000000 );

mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
casti_m256i( vdata,9 ) = mm256_intrlv_blend_32( _mm256_set_epi32(
n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
do
{
sha512256d_4way_init( &ctx );
@@ -138,7 +137,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, four );
casti_m256i( vdata,9 ) = _mm256_add_epi32( casti_m256i( vdata,9 ), four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );

@@ -180,11 +179,10 @@ int scanhash_sha512256d_2x64( struct work *work, uint32_t max_nonce,
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u64_t two = v128_64( 0x0000000200000000 );
const v128_t two = v128_64( 0x0000000200000000 );

v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_add32( v128_set32( 1, 0, 0, 0 ), *noncev );
// *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );

do
{
@@ -279,7 +277,7 @@ int scanhash_sha512256d( struct work *work, uint32_t max_nonce,

bool register_sha512256d_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA512256D_8WAY)
gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)

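After this change the 4-way nonce lanes are written straight into vdata word 9 instead of through the noncev pointer, but the schedule itself is unchanged. A scalar view of that schedule, written out only as an illustration of what the vector code computes:

// Scalar sketch of the 4-lane nonce schedule: the lanes hold n, n+1, n+2,
// n+3 and every iteration advances all four by 4.
for ( uint32_t n = first_nonce; n < last_nonce; n += 4 )
{
   uint32_t lane_nonce[4] = { n, n + 1, n + 2, n + 3 };
   // ... hash the four interleaved headers, test each lane against the target ...
   (void)lane_nonce;
}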
@@ -71,7 +71,7 @@ static const uint32_t IV512[] =
|
||||
static void
|
||||
c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
{
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const v128_t zero = v128_zero;
|
||||
__m256i p0, p1, p2, p3, x;
|
||||
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
|
||||
__m256i *m = (__m256i*)msg;
|
||||
@@ -278,7 +278,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
void shavite512_2way_init( shavite512_2way_context *ctx )
|
||||
{
|
||||
__m256i *h = (__m256i*)ctx->h;
|
||||
__m128i *iv = (__m128i*)IV512;
|
||||
v128_t *iv = (v128_t*)IV512;
|
||||
|
||||
h[0] = mm256_bcast_m128( iv[0] );
|
||||
h[1] = mm256_bcast_m128( iv[1] );
|
||||
@@ -358,7 +358,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
|
||||
count.u32[3] = ctx->count3;
|
||||
|
||||
casti_m256i( buf, 6 ) = mm256_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
@@ -434,7 +434,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
|
||||
}
|
||||
|
||||
casti_m256i( buf, 6 ) = mm256_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
@@ -451,7 +451,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
|
||||
const void *data, size_t len )
|
||||
{
|
||||
__m256i *h = (__m256i*)ctx->h;
|
||||
__m128i *iv = (__m128i*)IV512;
|
||||
v128_t *iv = (v128_t*)IV512;
|
||||
|
||||
h[0] = mm256_bcast_m128( iv[0] );
|
||||
h[1] = mm256_bcast_m128( iv[1] );
|
||||
@@ -524,7 +524,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
|
||||
}
|
||||
|
||||
casti_m256i( buf, 6 ) = mm256_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
|
@@ -303,7 +303,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
|
||||
count.u32[3] = ctx->count3;
|
||||
|
||||
casti_m512i( buf, 6 ) = mm512_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
@@ -379,7 +379,7 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
|
||||
}
|
||||
|
||||
casti_m512i( buf, 6 ) = mm512_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
@@ -470,7 +470,7 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
|
||||
}
|
||||
|
||||
casti_m512i( buf, 6 ) = mm512_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
|
@@ -159,4 +159,69 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
return 0;
}

#elif defined(SKEIN_2WAY)

static __thread skein512_2x64_context skein512_2x64_ctx
__attribute__ ((aligned (64)));

void skeinhash_2x64( void *state, const void *input )
{
uint64_t vhash64[8*2] __attribute__ ((aligned (32)));
uint32_t hash0[16] __attribute__ ((aligned (32)));
uint32_t hash1[16] __attribute__ ((aligned (32)));
skein512_2x64_context ctx_skein;
memcpy( &ctx_skein, &skein512_2x64_ctx, sizeof( ctx_skein ) );

skein512_2x64_final16( &ctx_skein, vhash64, input + (64*2) );

dintrlv_2x64( hash0, hash1, vhash64, 512 );

sha256_full( hash0, hash0, 64 );
sha256_full( hash1, hash1, 64 );

intrlv_2x32( state, hash0, hash1, 256 );
}

int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*2] __attribute__ ((aligned (32)));
uint32_t hash[8*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash_d7 = &(hash[7<<1]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128u32_t *noncev = (v128u32_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;

v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &skein512_2x64_ctx, vdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
skeinhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( ( hash_d7[ lane ] <= targ_d7 ) && !bench ) )
{
extr_lane_2x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}

#endif

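The prehash64/final16 pair above works because the first 64 of the 80 header bytes never change while scanning: the first Skein block is absorbed once per work item, and only the 16 nonce-bearing tail bytes are re-hashed each iteration. A sketch of the idea using the same functions, simplified to show only the control flow:

// Illustrative midstate optimisation (sketch only, not the exact code above).
skein512_2x64_context midstate;
skein512_2x64_prehash64( &midstate, vdata );            // constant 64 bytes, once

for ( uint32_t n = first_nonce; n < last_nonce; n += 2 )
{
   skein512_2x64_context c = midstate;                   // cheap copy per iteration
   skein512_2x64_final16( &c, vhash, vdata + (16*2) );   // only the nonce-bearing tail
   // ... second pass (SHA-256 here, Skein again for skein2), then test both lanes ...
}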
@@ -3,16 +3,20 @@

bool register_skein_algo( algo_gate_t* gate )
{
#if defined (SKEIN_8WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_skein_8way;
gate->hash = (void*)&skeinhash_8way;
#elif defined (SKEIN_4WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#elif defined(SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#elif defined(SKEIN_2WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_2x64;
gate->hash = (void*)&skeinhash_2x64;
#else
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif
@@ -21,16 +25,15 @@ bool register_skein_algo( algo_gate_t* gate )

bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined (SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SKEIN_8WAY)
gate->scanhash = (void*)&scanhash_skein2_8way;
gate->hash = (void*)&skein2hash_8way;
#elif defined (SKEIN_4WAY)
#elif defined(SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
#elif defined(SKEIN_2WAY)
gate->scanhash = (void*)&scanhash_skein2_2x64;
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif
return true;
};

@@ -7,6 +7,8 @@
#define SKEIN_8WAY 1
#elif defined(__AVX2__)
#define SKEIN_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SKEIN_2WAY 1
#endif

#if defined(SKEIN_8WAY)
@@ -29,6 +31,16 @@ void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );

#elif defined(SKEIN_2WAY)

void skeinhash_2x64( void *output, const void *input );
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

void skein2hash_2x64( void *output, const void *input );
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );

#else

void skeinhash( void *output, const void *input );

@@ -675,11 +675,13 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,

// Close

unsigned et;

memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
}

memset_zero_512( buf, buf_size >> 3 );
bcount = 0;
@@ -970,11 +972,13 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,

// Close

unsigned et;

memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
}

memset_zero_256( buf, buf_size >> 3 );
bcount = 0;
@@ -1364,11 +1368,13 @@ skein512_2x64_full( skein512_2x64_context *sc, void *out, const void *data,

// Close

unsigned et;

v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
if ( ptr )
{
unsigned et;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
}

v128_memset_zero( buf, buf_size >> 3 );
bcount = 0;

@@ -5,19 +5,6 @@
|
||||
|
||||
#if defined(SKEIN_8WAY)
|
||||
|
||||
static __thread skein512_8way_context skein512_8way_ctx
|
||||
__attribute__ ((aligned (64)));
|
||||
|
||||
void skein2hash_8way( void *output, const void *input )
|
||||
{
|
||||
uint64_t hash[16*8] __attribute__ ((aligned (128)));
|
||||
skein512_8way_context ctx;
|
||||
memcpy( &ctx, &skein512_8way_ctx, sizeof( ctx ) );
|
||||
|
||||
skein512_8way_final16( &ctx, hash, input + (64*8) );
|
||||
skein512_8way_full( &ctx, output, hash, 64 );
|
||||
}
|
||||
|
||||
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -68,19 +55,6 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined(SKEIN_4WAY)
|
||||
|
||||
static __thread skein512_4way_context skein512_4way_ctx
|
||||
__attribute__ ((aligned (64)));
|
||||
|
||||
void skein2hash_4way( void *output, const void *input )
|
||||
{
|
||||
skein512_4way_context ctx;
|
||||
memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) );
|
||||
uint64_t hash[16*4] __attribute__ ((aligned (64)));
|
||||
|
||||
skein512_4way_final16( &ctx, hash, input + (64*4) );
|
||||
skein512_4way_full( &ctx, output, hash, 64 );
|
||||
}
|
||||
|
||||
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -128,4 +102,53 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SKEIN_2WAY)
|
||||
|
||||
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint64_t hash[8*2] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
|
||||
skein512_2x64_context ctx;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint64_t *hash_q3 = &(hash[3*2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
v128u64_t *noncev = (v128u64_t*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const v128u64_t two = v128_64( 0x0000000200000000 );
|
||||
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
skein512_2x64_prehash64( &ctx, vdata );
|
||||
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||
|
||||
do
|
||||
{
|
||||
skein512_2x64_final16( &ctx, hash, vdata + (16*2) );
|
||||
skein512_2x64_full( &ctx, hash, hash, 64 );
|
||||
|
||||
for ( int lane = 0; lane < 2; lane++ )
|
||||
if ( hash_q3[ lane ] <= targ_q3 )
|
||||
{
|
||||
extr_lane_2x64( lane_hash, hash, lane, 256 );
|
||||
if ( valid_hash( lane_hash, ptarget ) && !bench )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = v128_add32( *noncev, two );
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -191,7 +191,7 @@ static void rotate_indexes( uint32_t *p )
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );

#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)

#define MULXOR \
casti_v128( hash, 0 ) = v128_mul32( v128_xor( \
@@ -251,7 +251,7 @@ void verthash_hash( const void *blob_bytes, const size_t blob_size,
/ VH_BYTE_ALIGNMENT ) + 1;
#if defined (__AVX2__)
const __m256i k = _mm256_set1_epi32( 0x1000193 );
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
const v128u32_t k = v128_32( 0x1000193 );
#endif

@@ -129,7 +129,7 @@ bool register_verthash_algo( algo_gate_t* gate )
{
opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT | NEON_OPT;

const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file;

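The constant 0x1000193 used by MULXOR is the 32-bit FNV prime, so the mix is an FNV-1a-style multiply-xor of the running hash with a word of the data blob, now enabled for NEON as well. A scalar sketch of one 32-bit lane, under that assumption:

// One lane of the assumed mix: hash = (hash ^ blob_word) * FNV prime.
#include <stdint.h>
#define FNV32_PRIME 0x1000193u

static inline uint32_t verthash_mix_lane( uint32_t h, uint32_t blob_word )
{
   return ( h ^ blob_word ) * FNV32_PRIME;
}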
@@ -23,13 +23,12 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
|
||||
*sptr = '\0';
|
||||
}
|
||||
|
||||
static __thread x16r_context_overlay hex_ctx;
|
||||
|
||||
int hex_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[16];
|
||||
x16r_context_overlay ctx;
|
||||
memcpy( &ctx, &hex_ctx, sizeof(ctx) );
|
||||
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
|
||||
void *in = (void*) input;
|
||||
int size = 80;
|
||||
|
||||
@@ -52,7 +51,7 @@ int hex_hash( void* output, const void* input, int thrid )
|
||||
break;
|
||||
case GROESTL:
|
||||
#if defined(__AES__)
|
||||
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
|
||||
groestl512_full( &ctx.groestl, hash, in, size<<3 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, in, size );
|
||||
@@ -87,7 +86,7 @@ int hex_hash( void* output, const void* input, int thrid )
|
||||
case LUFFA:
|
||||
if ( i == 0 )
|
||||
{
|
||||
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
|
||||
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -97,7 +96,7 @@ int hex_hash( void* output, const void* input, int thrid )
|
||||
break;
|
||||
case CUBEHASH:
|
||||
if ( i == 0 )
|
||||
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
|
||||
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
|
||||
else
|
||||
{
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
@@ -108,26 +107,15 @@ int hex_hash( void* output, const void* input, int thrid )
|
||||
shavite512_full( &ctx.shavite, hash, in, size );
|
||||
break;
|
||||
case SIMD:
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_init( &ctx.simd );
|
||||
sph_simd512(&ctx.simd, (const void*) hash, 64);
|
||||
sph_simd512_close(&ctx.simd, hash);
|
||||
#else
|
||||
simd_full( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence*)in, size<<3 );
|
||||
init_sd( &ctx.simd, 512 );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence*)in, size<<3 );
|
||||
#endif
|
||||
simd512_ctx( &ctx.simd, hash, in, size<<3 );
|
||||
break;
|
||||
case ECHO:
|
||||
#if defined(__AES__)
|
||||
echo_full( &ctx.echo, (BitSequence *)hash, 512,
|
||||
(const BitSequence *)in, size );
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
echo_full( &ctx.echo, hash, 512, in, size );
|
||||
#else
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, in, size );
|
||||
sph_echo512_close( &ctx.echo, hash );
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, in, size );
|
||||
sph_echo512_close( &ctx.echo, hash );
|
||||
#endif
|
||||
break;
|
||||
case HAMSI:
|
||||
@@ -216,32 +204,32 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
|
||||
switch ( algo )
|
||||
{
|
||||
case JH:
|
||||
sph_jh512_init( &hex_ctx.jh );
|
||||
sph_jh512( &hex_ctx.jh, edata, 64 );
|
||||
sph_jh512_init( &x16r_ref_ctx.jh );
|
||||
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
|
||||
break;
|
||||
case SKEIN:
|
||||
sph_skein512_init( &hex_ctx.skein );
|
||||
sph_skein512( &hex_ctx.skein, edata, 64 );
|
||||
sph_skein512_init( &x16r_ref_ctx.skein );
|
||||
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
|
||||
break;
|
||||
case LUFFA:
|
||||
init_luffa( &hex_ctx.luffa, 512 );
|
||||
update_luffa( &hex_ctx.luffa, edata, 64 );
|
||||
init_luffa( &x16r_ref_ctx.luffa, 512 );
|
||||
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
|
||||
break;
|
||||
case CUBEHASH:
|
||||
cubehashInit( &hex_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &hex_ctx.cube, edata, 64 );
|
||||
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
|
||||
break;
|
||||
case HAMSI:
|
||||
sph_hamsi512_init( &hex_ctx.hamsi );
|
||||
sph_hamsi512( &hex_ctx.hamsi, edata, 64 );
|
||||
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
|
||||
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 64 );
|
||||
break;
|
||||
case SHABAL:
|
||||
sph_shabal512_init( &hex_ctx.shabal );
|
||||
sph_shabal512( &hex_ctx.shabal, edata, 64 );
|
||||
sph_shabal512_init( &x16r_ref_ctx.shabal );
|
||||
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
|
||||
break;
|
||||
case WHIRLPOOL:
|
||||
sph_whirlpool_init( &hex_ctx.whirlpool );
|
||||
sph_whirlpool( &hex_ctx.whirlpool, edata, 64 );
|
||||
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
|
||||
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
|
||||
break;
|
||||
}
|
||||
|
||||
|
@@ -11,29 +11,32 @@
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#if defined(__aarch64__)
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#endif
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
//#if defined(__aarch64__)
|
||||
// #include "algo/simd/sph_simd.h"
|
||||
//#endif
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/yespower/yespower.h"
|
||||
//#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
//#else
|
||||
#else
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
//#endif
|
||||
#endif
|
||||
#if defined(__AES__)
|
||||
#include "algo/fugue/fugue-aesni.h"
|
||||
#else
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#endif
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
|
||||
// Config
|
||||
#define MINOTAUR_ALGO_COUNT 16
|
||||
@@ -48,12 +51,15 @@ typedef struct TortureGarden TortureGarden;
|
||||
struct TortureGarden
|
||||
{
|
||||
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
sph_echo512_context echo;
|
||||
sph_groestl512_context groestl;
|
||||
#endif
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
hashState_echo echo;
|
||||
#else
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
#if defined(__AES__)
|
||||
hashState_fugue fugue;
|
||||
#else
|
||||
@@ -67,11 +73,7 @@ struct TortureGarden
|
||||
cubehashParam cube;
|
||||
shavite512_context shavite;
|
||||
hashState_luffa luffa;
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_context simd;
|
||||
#else
|
||||
hashState_sd simd;
|
||||
#endif
|
||||
simd512_context simd;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
@@ -93,9 +95,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
|
||||
switch ( algo )
|
||||
{
|
||||
case 0:
|
||||
blake512_init( &garden->blake );
|
||||
blake512_update( &garden->blake, input, 64 );
|
||||
blake512_close( &garden->blake, hash );
|
||||
blake512_full( &garden->blake, hash, input, 64 );
|
||||
break;
|
||||
case 1:
|
||||
sph_bmw512_init( &garden->bmw );
|
||||
@@ -107,7 +107,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
|
||||
cubehashUpdateDigest( &garden->cube, hash, input, 64 );
|
||||
break;
|
||||
case 3:
|
||||
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
echo_full( &garden->echo, hash, 512, input, 64 );
|
||||
#else
|
||||
sph_echo512_init( &garden->echo );
|
||||
@@ -165,13 +165,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
|
||||
sph_shavite512_close( &garden->shavite, hash );
|
||||
break;
|
||||
case 13:
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_init( &garden->simd );
|
||||
sph_simd512( &garden->simd, input, 64);
|
||||
sph_simd512_close( &garden->simd, hash );
|
||||
#else
|
||||
simd_full( &garden->simd, (BitSequence *)hash, input, 512 );
|
||||
#endif
|
||||
simd512_ctx( &garden->simd, hash, input, 64 );
|
||||
break;
|
||||
case 14:
|
||||
sph_skein512_init( &garden->skein );
|
||||
|
@@ -971,4 +971,405 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (X16R_2WAY)
|
||||
|
||||
void x16r_2x64_prehash( void *vdata, void *pdata )
|
||||
{
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
const char elem = x16r_hash_order[0];
|
||||
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
|
||||
|
||||
switch ( algo )
|
||||
{
|
||||
case JH:
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
jh512_2x64_init( &x16r_ctx.jh );
|
||||
jh512_2x64_update( &x16r_ctx.jh, vdata, 64 );
|
||||
break;
|
||||
case KECCAK:
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
keccak512_2x64_init( &x16r_ctx.keccak );
|
||||
keccak512_2x64_update( &x16r_ctx.keccak, vdata, 72 );
|
||||
break;
|
||||
case SKEIN:
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
skein512_2x64_prehash64( &x16r_ctx.skein, vdata );
|
||||
break;
|
||||
case LUFFA:
|
||||
{
|
||||
v128_bswap32_80( edata, pdata );
|
||||
init_luffa( &x16r_ctx.luffa, 512 );
|
||||
update_luffa( &x16r_ctx.luffa, edata, 64 );
|
||||
intrlv_2x64( vdata, edata, edata, 640 );
|
||||
}
|
||||
break;
|
||||
case CUBEHASH:
|
||||
{
|
||||
v128_bswap32_80( edata, pdata );
|
||||
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16r_ctx.cube, edata, 64 );
|
||||
intrlv_2x64( vdata, edata, edata, 640 );
|
||||
}
|
||||
break;
|
||||
case HAMSI:
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
hamsi512_2x64_init( &x16r_ctx.hamsi );
|
||||
hamsi512_2x64_update( &x16r_ctx.hamsi, vdata, 72 );
|
||||
#else
|
||||
v128_bswap32_80( edata, pdata );
|
||||
sph_hamsi512_init( &x16r_ctx.hamsi );
|
||||
sph_hamsi512( &x16r_ctx.hamsi, edata, 72 );
|
||||
intrlv_2x64( vdata, edata, edata, 640 );
|
||||
#endif
|
||||
break;
|
||||
case FUGUE:
|
||||
v128_bswap32_80( edata, pdata );
|
||||
#if defined(__AES__)
|
||||
fugue512_init( &x16r_ctx.fugue );
|
||||
fugue512_update( &x16r_ctx.fugue, edata, 76 );
|
||||
#else
|
||||
sph_fugue512_init( &x16r_ctx.fugue );
|
||||
sph_fugue512( &x16r_ctx.fugue, edata, 76 );
|
||||
#endif
|
||||
intrlv_2x64( vdata, edata, edata, 640 );
|
||||
break;
|
||||
case SHABAL:
|
||||
v128_bswap32_80( edata, pdata );
|
||||
sph_shabal512_init( &x16r_ctx.shabal );
|
||||
sph_shabal512( &x16r_ctx.shabal, edata, 64);
|
||||
intrlv_2x64( vdata, edata, edata, 640 );
|
||||
break;
|
||||
case WHIRLPOOL:
|
||||
v128_bswap32_80( edata, pdata );
|
||||
sph_whirlpool_init( &x16r_ctx.whirlpool );
|
||||
sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 );
|
||||
intrlv_2x64( vdata, edata, edata, 640 );
|
||||
break;
|
||||
default:
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
}
|
||||
}
|
||||
|
||||
int x16r_2x64_hash_generic( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[20] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[20] __attribute__ ((aligned (32)));
|
||||
x16r_2x64_context_overlay ctx;
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
void *in0 = (void*) hash0;
|
||||
void *in1 = (void*) hash1;
|
||||
int size = 80;
|
||||
|
||||
dintrlv_2x64( hash0, hash1, input, 640 );
|
||||
|
||||
for ( int i = 0; i < 16; i++ )
|
||||
{
|
||||
const char elem = x16r_hash_order[i];
|
||||
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
|
||||
|
||||
switch ( algo )
|
||||
{
|
||||
case BLAKE:
|
||||
if ( i == 0 )
|
||||
blake512_2x64_full( &ctx.blake, vhash, input, size );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, in0, in1, size<<3 );
|
||||
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
|
||||
}
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
case BMW:
|
||||
bmw512_2x64_init( &ctx.bmw );
|
||||
if ( i == 0 )
|
||||
bmw512_2x64_update( &ctx.bmw, input, size );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, in0, in1, size<<3 );
|
||||
bmw512_2x64_update( &ctx.bmw, vhash, size );
|
||||
}
|
||||
bmw512_2x64_close( &ctx.bmw, vhash );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
case GROESTL:
|
||||
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
|
||||
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
|
||||
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, in0, size );
|
||||
sph_groestl512_close( &ctx.groestl, hash0 );
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, in1, size );
|
||||
sph_groestl512_close( &ctx.groestl, hash1 );
|
||||
#endif
|
||||
break;
|
||||
case JH:
|
||||
if ( i == 0 )
|
||||
jh512_2x64_update( &ctx.jh, input + (64*2), 16 );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, in0, in1, size<<3 );
|
||||
jh512_2x64_init( &ctx.jh );
|
||||
jh512_2x64_update( &ctx.jh, vhash, size );
|
||||
}
|
||||
jh512_2x64_close( &ctx.jh, vhash );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
case KECCAK:
|
||||
if ( i == 0 )
|
||||
keccak512_2x64_update( &ctx.keccak, input + (72*2), 8 );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, in0, in1, size<<3 );
|
||||
keccak512_2x64_init( &ctx.keccak );
|
||||
keccak512_2x64_update( &ctx.keccak, vhash, size );
|
||||
}
|
||||
keccak512_2x64_close( &ctx.keccak, vhash );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
case SKEIN:
|
||||
if ( i == 0 )
|
||||
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, in0, in1, size<<3 );
|
||||
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
|
||||
}
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
case LUFFA:
|
||||
if ( i == 0 )
|
||||
{
|
||||
update_and_final_luffa( &ctx.luffa, hash0, in0 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
update_and_final_luffa( &ctx.luffa, hash1, in1 + 64, 16 );
|
||||
}
|
||||
else
|
||||
{
|
||||
luffa_full( &ctx.luffa, hash0, 512, hash0, size );
|
||||
luffa_full( &ctx.luffa, hash1, 512, hash1, size );
|
||||
}
|
||||
break;
|
||||
case CUBEHASH:
|
||||
if ( i == 0 )
|
||||
{
|
||||
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
|
||||
}
|
||||
else
|
||||
{
|
||||
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
|
||||
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
|
||||
}
|
||||
break;
|
||||
case SHAVITE:
|
||||
shavite512_full( &ctx.shavite, hash0, in0, size );
|
||||
shavite512_full( &ctx.shavite, hash1, in1, size );
|
||||
break;
|
||||
case SIMD:
|
||||
simd512_ctx( &ctx.simd, hash0, in0, size );
|
||||
simd512_ctx( &ctx.simd, hash1, in1, size );
|
||||
break;
|
||||
case ECHO:
|
||||
#if defined(__AES__)
|
||||
echo_full( &ctx.echo, hash0, 512, in0, size );
|
||||
echo_full( &ctx.echo, hash1, 512, in1, size );
|
||||
#else
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, in0, size );
|
||||
sph_echo512_close( &ctx.echo, hash0 );
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, in1, size );
|
||||
sph_echo512_close( &ctx.echo, hash1 );
|
||||
#endif
|
||||
break;
|
||||
case HAMSI:
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
if ( i == 0 )
|
||||
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, hash0, hash1, size<<3 );
|
||||
hamsi512_2x64_init( &ctx.hamsi );
|
||||
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
|
||||
}
|
||||
hamsi512_2x64_close( &ctx.hamsi, vhash );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
#else
|
||||
if ( i == 0 )
|
||||
{
|
||||
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash0 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_hamsi512_init( &ctx.hamsi );
|
||||
sph_hamsi512( &ctx.hamsi, hash0, size );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash0 );
|
||||
sph_hamsi512_init( &ctx.hamsi );
|
||||
sph_hamsi512( &ctx.hamsi, hash1, size );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash1 );
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
case FUGUE:
|
||||
#if defined(__AES__)
|
||||
if ( i == 0 )
|
||||
{
|
||||
fugue512_update( &ctx.fugue, in0 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in1 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
fugue512_full( &ctx.fugue, hash0, hash0, size );
|
||||
fugue512_full( &ctx.fugue, hash1, hash1, size );
|
||||
}
|
||||
#else
|
||||
if ( i == 0 )
|
||||
{
|
||||
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
|
||||
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
case SHABAL:
|
||||
if ( i == 0 )
|
||||
{
|
||||
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_shabal512_init( &ctx.shabal );
|
||||
sph_shabal512( &ctx.shabal, hash0, size );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
sph_shabal512_init( &ctx.shabal );
|
||||
sph_shabal512( &ctx.shabal, hash1, size );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
}
|
||||
break;
|
||||
case WHIRLPOOL:
|
||||
if ( i == 0 )
|
||||
{
|
||||
sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, size );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, size );
|
||||
}
|
||||
break;
|
||||
case SHA_512:
|
||||
sha512_2x64_init( &ctx.sha512 );
|
||||
if ( i == 0 )
|
||||
sha512_2x64_update( &ctx.sha512, input, size );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, in0, in1, size<<3 );
|
||||
sha512_2x64_init( &ctx.sha512 );
|
||||
sha512_2x64_update( &ctx.sha512, vhash, size );
|
||||
}
|
||||
sha512_2x64_close( &ctx.sha512, vhash );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
}
|
||||
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
size = 64;
|
||||
}
|
||||
memcpy( output, hash0, 64 );
|
||||
memcpy( output+64, hash1, 64 );
|
||||
|
||||
return 1;
|
||||
}
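
The per-lane buffers above are packed and unpacked with intrlv_2x64 / dintrlv_2x64 around each vectorized stage. A scalar model of that layout, illustrative only (the real implementations are elsewhere in the tree): lane 0 and lane 1 alternate 64-bit words so one 128-bit SSE2/NEON operation handles both lanes per word.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Scalar model of 2x64 interleaving; bit_len is the per-lane length in bits.
static void intrlv_2x64_model( uint64_t *dst, const uint64_t *s0,
                               const uint64_t *s1, int bit_len )
{
   for ( int i = 0; i < bit_len / 64; i++ )
   {
      dst[ 2*i     ] = s0[i];
      dst[ 2*i + 1 ] = s1[i];
   }
}

static void dintrlv_2x64_model( uint64_t *d0, uint64_t *d1,
                                const uint64_t *src, int bit_len )
{
   for ( int i = 0; i < bit_len / 64; i++ )
   {
      d0[i] = src[ 2*i     ];
      d1[i] = src[ 2*i + 1 ];
   }
}

int main(void)
{
   uint64_t lane0[10], lane1[10], v[20], out0[10], out1[10];
   for ( int i = 0; i < 10; i++ ) { lane0[i] = i; lane1[i] = 100 + i; }
   intrlv_2x64_model( v, lane0, lane1, 640 );      // 80 bytes per lane
   dintrlv_2x64_model( out0, out1, v, 640 );
   printf( "round trip ok: %d\n", memcmp( out0, lane0, sizeof(lane0) ) == 0
                               && memcmp( out1, lane1, sizeof(lane1) ) == 0 );
   return 0;
}
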
|
||||
|
||||
int x16r_2x64_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint8_t hash[64*2] __attribute__ ((aligned (64)));
|
||||
if ( !x16r_2x64_hash_generic( hash, input, thrid ) )
|
||||
return 0;
|
||||
|
||||
memcpy( output, hash, 32 );
|
||||
memcpy( output+32, hash+64, 32 );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[16*2] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
|
||||
uint32_t bedata1[2];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
v128_t *noncev = (v128_t*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
bedata1[0] = bswap_32( pdata[1] );
|
||||
bedata1[1] = bswap_32( pdata[2] );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
const uint32_t ntime = bswap_32( pdata[17] );
|
||||
if ( s_ntime != ntime )
|
||||
{
|
||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
x16r_2x64_prehash( vdata, pdata );
|
||||
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
|
||||
for ( int i = 0; i < 2; i++ )
|
||||
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n+i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
|
||||
n += 2;
|
||||
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
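
The interleaved nonce vector is seeded with n in lane 0 and n+1 in lane 1, then both lanes advance by 2 each pass via v128_add32 with the 0x0000000200000000 constant. A scalar model of that schedule (illustrative only, hypothetical starting nonce):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint32_t n = 0x1000;                 // hypothetical first nonce
   uint32_t lane[2] = { n, n + 1 };     // lane 0 = n, lane 1 = n + 1
   for ( int pass = 0; pass < 4; pass++ )
   {
      printf( "pass %d: lane0 %08x lane1 %08x\n", pass, lane[0], lane[1] );
      lane[0] += 2;                     // each lane's nonce word steps by 2
      lane[1] += 2;
   }
   return 0;
}
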
|
||||
|
||||
#endif
|
||||
|
@@ -13,10 +13,13 @@ __thread x16r_8way_context_overlay x16r_ctx;
|
||||
|
||||
__thread x16r_4way_context_overlay x16r_ctx;
|
||||
|
||||
#elif defined (X16R_2WAY)
|
||||
|
||||
__thread x16r_2x64_context_overlay x16r_ctx;
|
||||
|
||||
#endif
|
||||
|
||||
__thread x16r_context_overlay x16_ctx;
|
||||
|
||||
__thread x16r_context_overlay x16r_ref_ctx;
|
||||
|
||||
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
|
||||
{
|
||||
@@ -58,11 +61,15 @@ bool register_x16r_algo( algo_gate_t* gate )
|
||||
#elif defined (X16R_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||
gate->hash = (void*)&x16r_4way_hash;
|
||||
#elif defined (X16R_2WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_2x64;
|
||||
gate->hash = (void*)&x16r_2x64_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
|
||||
| NEON_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -76,11 +83,15 @@ bool register_x16rv2_algo( algo_gate_t* gate )
|
||||
#elif defined (X16RV2_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16rv2_4way;
|
||||
gate->hash = (void*)&x16rv2_4way_hash;
|
||||
#elif defined (X16RV2_2WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16rv2_2x64;
|
||||
gate->hash = (void*)&x16rv2_2x64_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x16rv2;
|
||||
gate->hash = (void*)&x16rv2_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
|
||||
| NEON_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -94,11 +105,15 @@ bool register_x16s_algo( algo_gate_t* gate )
|
||||
#elif defined (X16R_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||
gate->hash = (void*)&x16r_4way_hash;
|
||||
#elif defined (X16R_2WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16r_2x64;
|
||||
gate->hash = (void*)&x16r_2x64_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
|
||||
| NEON_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -108,7 +123,6 @@ bool register_x16s_algo( algo_gate_t* gate )
|
||||
//
|
||||
// X16RT
|
||||
|
||||
|
||||
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
|
||||
{
|
||||
int32_t maskedTime = timeStamp & 0xffffff80;
|
||||
@@ -221,34 +235,42 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
|
||||
bool register_x16rt_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X16R_8WAY)
|
||||
#if defined (X16RT_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16rt_8way;
|
||||
gate->hash = (void*)&x16r_8way_hash;
|
||||
#elif defined (X16R_4WAY)
|
||||
#elif defined (X16RT_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16rt_4way;
|
||||
gate->hash = (void*)&x16r_4way_hash;
|
||||
#elif defined (X16RT_2WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16rt_2x64;
|
||||
gate->hash = (void*)&x16r_2x64_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x16rt;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
|
||||
| NEON_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
||||
bool register_x16rt_veil_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X16R_8WAY)
|
||||
#if defined (X16RT_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16rt_8way;
|
||||
gate->hash = (void*)&x16r_8way_hash;
|
||||
#elif defined (X16R_4WAY)
|
||||
#elif defined (X16RT_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16rt_4way;
|
||||
gate->hash = (void*)&x16r_4way_hash;
|
||||
#elif defined (X16RT_2WAY)
|
||||
gate->scanhash = (void*)&scanhash_x16rt_2x64;
|
||||
gate->hash = (void*)&x16r_2x64_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x16rt;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
|
||||
| NEON_OPT;
|
||||
gate->build_extraheader = (void*)&veil_build_extraheader;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -262,7 +284,7 @@ bool register_hex_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_hex;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
opt_target_factor = 128.0;
|
||||
return true;
|
||||
@@ -274,20 +296,25 @@ bool register_hex_algo( algo_gate_t* gate )
|
||||
|
||||
bool register_x21s_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X16R_8WAY)
|
||||
#if defined (X21S_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_x21s_8way;
|
||||
gate->hash = (void*)&x21s_8way_hash;
|
||||
gate->miner_thread_init = (void*)&x21s_8way_thread_init;
|
||||
#elif defined (X16R_4WAY)
|
||||
#elif defined (X21S_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x21s_4way;
|
||||
gate->hash = (void*)&x21s_4way_hash;
|
||||
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
|
||||
#elif defined (X21S_2WAY)
|
||||
gate->scanhash = (void*)&scanhash_x21s_2x64;
|
||||
gate->hash = (void*)&x21s_2x64_hash;
|
||||
gate->miner_thread_init = (void*)&x21s_2x64_thread_init;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x21s;
|
||||
gate->hash = (void*)&x21s_hash;
|
||||
gate->miner_thread_init = (void*)&x21s_thread_init;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
|
||||
| NEON_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
|
@@ -7,13 +7,15 @@
|
||||
#include <unistd.h>
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
@@ -21,13 +23,13 @@
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sha512-hash.h"
|
||||
|
||||
#if defined(__AES__)
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/fugue/fugue-aesni.h"
|
||||
#endif
|
||||
|
||||
#if defined (__AVX2__)
|
||||
//#if defined (__AVX2__)
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
@@ -39,7 +41,7 @@
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#endif
|
||||
//#endif
|
||||
|
||||
#if defined(__VAES__)
|
||||
#include "algo/groestl/groestl512-hash-4way.h"
|
||||
@@ -48,28 +50,41 @@
|
||||
#include "algo/echo/echo-hash-4way.h"
|
||||
#endif
|
||||
|
||||
#if defined(__aarch64__)
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#else
|
||||
#include "algo/simd/nist.h"
|
||||
// X16R, X16S
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define X16R_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define X16R_4WAY 1
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define X16R_2WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define X16R_8WAY 1
|
||||
#define X16RV2_8WAY 1
|
||||
#define X16RT_8WAY 1
|
||||
#define X21S_8WAY 1
|
||||
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#define X16RV2_4WAY 1
|
||||
#define X16RT_4WAY 1
|
||||
#define X21S_4WAY 1
|
||||
#define X16R_4WAY 1
|
||||
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define X16RV2_2WAY 1
|
||||
#endif
|
||||
|
||||
// X16RT, VEIL
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define X16RT_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define X16RT_4WAY 1
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define X16RT_2WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define X21S_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define X21S_4WAY 1
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define X21S_2WAY 1
|
||||
#endif
|
||||
|
||||
|
||||
enum x16r_Algo {
|
||||
BLAKE = 0,
|
||||
BMW,
|
||||
@@ -167,7 +182,6 @@ union _x16r_4way_context_overlay
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cube_2way_context cube;
|
||||
hashState_luffa luffa1;
|
||||
simd_2way_context simd;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
@@ -187,34 +201,84 @@ int scanhash_x16r_4way( struct work *, uint32_t,
|
||||
uint64_t *, struct thr_info * );
|
||||
extern __thread x16r_4way_context_overlay x16r_ctx;
|
||||
|
||||
#elif defined(X16R_2WAY)
|
||||
|
||||
union _x16r_2x64_context_overlay
|
||||
{
|
||||
blake512_2x64_context blake;
|
||||
bmw512_2x64_context bmw;
|
||||
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
#endif
|
||||
skein512_2x64_context skein;
|
||||
jh512_2x64_context jh;
|
||||
keccak512_2x64_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
shavite512_context shavite;
|
||||
simd512_context simd;
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
hashState_echo echo;
|
||||
#else
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
hamsi_2x64_context hamsi;
|
||||
#else
|
||||
sph_hamsi512_context hamsi;
|
||||
#endif
|
||||
#if defined(__AES__)
|
||||
hashState_fugue fugue;
|
||||
#else
|
||||
sph_fugue512_context fugue;
|
||||
#endif
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sha512_2x64_context sha512;
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef union _x16r_2x64_context_overlay x16r_2x64_context_overlay;
|
||||
|
||||
void x16r_2x64_prehash( void *, void * );
|
||||
int x16r_2x64_hash_generic( void *, const void *, int );
|
||||
int x16r_2x64_hash( void *, const void *, int );
|
||||
int scanhash_x16r_2x64( struct work *, uint32_t,
|
||||
uint64_t *, struct thr_info * );
|
||||
extern __thread x16r_2x64_context_overlay x16r_ctx;
|
||||
|
||||
#endif
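
The overlay unions above keep the per-thread working set at the size of the largest member rather than the sum, since only one algorithm's context is live at a time and the whole overlay is copied with memcpy for every block. A minimal sketch with hypothetical context sizes:

#include <stdio.h>

// Hypothetical context sizes; only one member is ever in use at a time.
typedef struct { unsigned char s[256]; } ctx_small;
typedef struct { unsigned char s[416]; } ctx_large;

typedef union
{
   ctx_small a;
   ctx_large b;
} overlay_t;

int main(void)
{
   printf( "sum of members %zu bytes, union %zu bytes\n",
           sizeof(ctx_small) + sizeof(ctx_large), sizeof(overlay_t) );
   return 0;
}
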
|
||||
|
||||
// need a reference, add hooks for SSE2.
|
||||
// needed for hex
|
||||
union _x16r_context_overlay
|
||||
{
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
hashState_fugue fugue;
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
sph_fugue512_context fugue;
|
||||
#endif
|
||||
blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
#endif
|
||||
sph_skein512_context skein;
|
||||
sph_jh512_context jh;
|
||||
sph_keccak512_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
shavite512_context shavite;
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_context simd;
|
||||
simd512_context simd;
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
hashState_echo echo;
|
||||
#else
|
||||
hashState_sd simd;
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
sph_hamsi512_context hamsi;
|
||||
#if defined(__AES__)
|
||||
hashState_fugue fugue;
|
||||
#else
|
||||
sph_fugue512_context fugue;
|
||||
#endif
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sph_sha512_context sha512;
|
||||
@@ -222,7 +286,7 @@ union _x16r_context_overlay
|
||||
|
||||
typedef union _x16r_context_overlay x16r_context_overlay;
|
||||
|
||||
extern __thread x16r_context_overlay x16_ctx;
|
||||
extern __thread x16r_context_overlay x16r_ref_ctx;
|
||||
|
||||
void x16r_prehash( void *, void * );
|
||||
int x16r_hash_generic( void *, const void *, int );
|
||||
@@ -242,6 +306,12 @@ int x16rv2_4way_hash( void *state, const void *input, int thrid );
|
||||
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(X16RV2_2WAY)
|
||||
|
||||
int x16rv2_2x64_hash( void *state, const void *input, int thrid );
|
||||
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#else
|
||||
|
||||
int x16rv2_hash( void *state, const void *input, int thr_id );
|
||||
@@ -251,18 +321,24 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
|
||||
#endif
|
||||
|
||||
// x16rt, veil
|
||||
#if defined(X16R_8WAY)
|
||||
#if defined(X16RT_8WAY)
|
||||
|
||||
//void x16rt_8way_hash( void *state, const void *input );
|
||||
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(X16R_4WAY)
|
||||
#elif defined(X16RT_4WAY)
|
||||
|
||||
//void x16rt_4way_hash( void *state, const void *input );
|
||||
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(X16RT_2WAY)
|
||||
|
||||
//void x16rt_4way_hash( void *state, const void *input );
|
||||
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#else
|
||||
|
||||
//void x16rt_hash( void *state, const void *input );
|
||||
@@ -272,20 +348,27 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
|
||||
#endif
|
||||
|
||||
// x21s
|
||||
#if defined(X16R_8WAY)
|
||||
#if defined(X21S_8WAY)
|
||||
|
||||
int x21s_8way_hash( void *state, const void *input, int thrid );
|
||||
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool x21s_8way_thread_init();
|
||||
|
||||
#elif defined(X16R_4WAY)
|
||||
#elif defined(X21S_4WAY)
|
||||
|
||||
int x21s_4way_hash( void *state, const void *input, int thrid );
|
||||
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool x21s_4way_thread_init();
|
||||
|
||||
#elif defined(X21S_2WAY)
|
||||
|
||||
int x21s_2x64_hash( void *state, const void *input, int thrid );
|
||||
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool x21s_2x64_thread_init();
|
||||
|
||||
#else
|
||||
|
||||
int x21s_hash( void *state, const void *input, int thr_id );
|
||||
|
@@ -18,32 +18,36 @@ void x16r_prehash( void *edata, void *pdata )
|
||||
switch ( algo )
|
||||
{
|
||||
case JH:
|
||||
sph_jh512_init( &x16_ctx.jh );
|
||||
sph_jh512( &x16_ctx.jh, edata, 64 );
|
||||
sph_jh512_init( &x16r_ref_ctx.jh );
|
||||
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
|
||||
break;
|
||||
case SKEIN:
|
||||
sph_skein512_init( &x16_ctx.skein );
|
||||
sph_skein512( &x16_ctx.skein, edata, 64 );
|
||||
sph_skein512_init( &x16r_ref_ctx.skein );
|
||||
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
|
||||
break;
|
||||
case KECCAK:
|
||||
sph_keccak512_init( &x16r_ref_ctx.keccak );
|
||||
sph_keccak512( &x16r_ref_ctx.keccak, edata, 72 );
|
||||
break;
|
||||
case LUFFA:
|
||||
init_luffa( &x16_ctx.luffa, 512 );
|
||||
update_luffa( &x16_ctx.luffa, edata, 64 );
|
||||
init_luffa( &x16r_ref_ctx.luffa, 512 );
|
||||
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
|
||||
break;
|
||||
case CUBEHASH:
|
||||
cubehashInit( &x16_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16_ctx.cube, edata, 64 );
|
||||
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
|
||||
break;
|
||||
case HAMSI:
|
||||
sph_hamsi512_init( &x16_ctx.hamsi );
|
||||
sph_hamsi512( &x16_ctx.hamsi, edata, 64 );
|
||||
break;
|
||||
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
|
||||
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 72 );
|
||||
break;
|
||||
case SHABAL:
|
||||
sph_shabal512_init( &x16_ctx.shabal );
|
||||
sph_shabal512( &x16_ctx.shabal, edata, 64 );
|
||||
sph_shabal512_init( &x16r_ref_ctx.shabal );
|
||||
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
|
||||
break;
|
||||
case WHIRLPOOL:
|
||||
sph_whirlpool_init( &x16_ctx.whirlpool );
|
||||
sph_whirlpool( &x16_ctx.whirlpool, edata, 64 );
|
||||
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
|
||||
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -52,7 +56,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[16];
|
||||
x16r_context_overlay ctx;
|
||||
memcpy( &ctx, &x16_ctx, sizeof(ctx) );
|
||||
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
|
||||
void *in = (void*) input;
|
||||
int size = 80;
|
||||
|
||||
@@ -70,36 +74,41 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
|
||||
break;
|
||||
case BMW:
|
||||
sph_bmw512_init( &ctx.bmw );
|
||||
sph_bmw512(&ctx.bmw, in, size);
|
||||
sph_bmw512_close(&ctx.bmw, hash);
|
||||
sph_bmw512( &ctx.bmw, in, size );
|
||||
sph_bmw512_close( &ctx.bmw, hash );
|
||||
break;
|
||||
case GROESTL:
|
||||
#if defined(__AES__)
|
||||
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
|
||||
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
|
||||
groestl512_full( &ctx.groestl, hash, in, size<<3 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, in, size );
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
sph_groestl512_close( &ctx.groestl, hash );
|
||||
#endif
|
||||
break;
|
||||
case JH:
|
||||
if ( i == 0 )
|
||||
sph_jh512(&ctx.jh, in+64, 16 );
|
||||
sph_jh512( &ctx.jh, in+64, 16 );
|
||||
else
|
||||
{
|
||||
sph_jh512_init( &ctx.jh );
|
||||
sph_jh512(&ctx.jh, in, size );
|
||||
sph_jh512( &ctx.jh, in, size );
|
||||
}
|
||||
sph_jh512_close(&ctx.jh, hash );
|
||||
sph_jh512_close( &ctx.jh, hash );
|
||||
break;
|
||||
case KECCAK:
|
||||
sph_keccak512_init( &ctx.keccak );
|
||||
sph_keccak512( &ctx.keccak, in, size );
|
||||
if ( i == 0 )
|
||||
sph_keccak512( &ctx.keccak, in+72, 8 );
|
||||
else
|
||||
{
|
||||
sph_keccak512_init( &ctx.keccak );
|
||||
sph_keccak512( &ctx.keccak, in, size );
|
||||
}
|
||||
sph_keccak512_close( &ctx.keccak, hash );
|
||||
break;
|
||||
case SKEIN:
|
||||
if ( i == 0 )
|
||||
sph_skein512(&ctx.skein, in+64, 16 );
|
||||
sph_skein512( &ctx.skein, in+64, 16 );
|
||||
else
|
||||
{
|
||||
sph_skein512_init( &ctx.skein );
|
||||
@@ -109,13 +118,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
|
||||
break;
|
||||
case LUFFA:
|
||||
if ( i == 0 )
|
||||
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
|
||||
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
|
||||
else
|
||||
luffa_full( &ctx.luffa, hash, 512, in, size );
|
||||
break;
|
||||
case CUBEHASH:
|
||||
if ( i == 0 )
|
||||
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
|
||||
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
|
||||
else
|
||||
cubehash_full( &ctx.cube, hash, 512, in, size );
|
||||
break;
|
||||
@@ -123,19 +132,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
|
||||
shavite512_full( &ctx.shavite, hash, in, size );
|
||||
break;
|
||||
case SIMD:
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_init( &ctx.simd );
|
||||
sph_simd512(&ctx.simd, (const void*) hash, 64);
|
||||
sph_simd512_close(&ctx.simd, hash);
|
||||
#else
|
||||
simd_full( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence*)in, size<<3 );
|
||||
#endif
|
||||
sph_simd512( &ctx.simd, hash, size );
|
||||
sph_simd512_close( &ctx.simd, hash );
|
||||
break;
|
||||
case ECHO:
|
||||
#if defined(__AES__)
|
||||
echo_full( &ctx.echo, (BitSequence*)hash, 512,
|
||||
(const BitSequence*)in, size );
|
||||
echo_full( &ctx.echo, hash, 512, in, size );
|
||||
#else
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, in, size );
|
||||
@@ -144,7 +147,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
|
||||
break;
|
||||
case HAMSI:
|
||||
if ( i == 0 )
|
||||
sph_hamsi512( &ctx.hamsi, in+64, 16 );
|
||||
sph_hamsi512( &ctx.hamsi, in+72, 8 );
|
||||
else
|
||||
{
|
||||
sph_hamsi512_init( &ctx.hamsi );
|
||||
@@ -153,12 +156,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
|
||||
sph_hamsi512_close( &ctx.hamsi, hash );
|
||||
break;
|
||||
case FUGUE:
|
||||
#if defined(__AES__)
|
||||
fugue512_full( &ctx.fugue, hash, in, size );
|
||||
#else
|
||||
sph_fugue512_full( &ctx.fugue, hash, in, size );
|
||||
#endif
|
||||
break;
|
||||
sph_fugue512_full( &ctx.fugue, hash, in, size );
|
||||
break;
|
||||
case SHABAL:
|
||||
if ( i == 0 )
|
||||
sph_shabal512( &ctx.shabal, in+64, 16 );
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#if defined (X16R_8WAY)
|
||||
#if defined (X16RT_8WAY)
|
||||
|
||||
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
@@ -57,7 +57,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (X16R_4WAY)
|
||||
#elif defined (X16RT_4WAY)
|
||||
|
||||
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
@@ -110,4 +110,55 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (X16RT_2WAY)
|
||||
|
||||
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[2*16] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) timeHash[4*8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
v128_t *noncev = (v128_t*)vdata + 9;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
|
||||
if ( s_ntime != masked_ntime )
|
||||
{
|
||||
x16rt_getTimeHash( masked_ntime, &timeHash );
|
||||
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
||||
s_ntime = masked_ntime;
|
||||
if ( !thr_id )
|
||||
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
|
||||
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
|
||||
}
|
||||
|
||||
x16r_2x64_prehash( vdata, pdata );
|
||||
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
|
||||
for ( int i = 0; i < 2; i++ )
|
||||
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n+i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
|
||||
n += 2;
|
||||
} while ( ( n < last_nonce ) && !(*restart) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
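
x16rt keys its hash order off the block time masked to a 128-second window (ntime & 0xffffff80), so the time hash and order string only need recomputing when the window rolls over. A small illustration of the masking (hypothetical timestamps):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const uint32_t ntimes[3] = { 0x5f000100, 0x5f00017f, 0x5f000180 };
   for ( int i = 0; i < 3; i++ )
      printf( "ntime %08x -> window %08x\n", ntimes[i], ntimes[i] & 0xffffff80 );
   return 0;
}
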
|
||||
|
||||
#endif
|
||||
|
@@ -1,6 +1,6 @@
|
||||
#include "x16r-gate.h"
|
||||
|
||||
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
|
||||
#if !defined(X16RT_8WAY) && !defined(X16RT_4WAY)
|
||||
|
||||
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
|
@@ -395,7 +395,7 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
|
||||
break;
|
||||
case HAMSI:
|
||||
if ( i == 0 )
|
||||
hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
|
||||
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
|
||||
else
|
||||
{
|
||||
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||
@@ -409,14 +409,43 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
|
||||
hash7, vhash );
|
||||
break;
|
||||
case FUGUE:
|
||||
fugue512_full( &ctx.fugue, hash0, in0, size );
|
||||
fugue512_full( &ctx.fugue, hash1, in1, size );
|
||||
fugue512_full( &ctx.fugue, hash2, in2, size );
|
||||
fugue512_full( &ctx.fugue, hash3, in3, size );
|
||||
fugue512_full( &ctx.fugue, hash4, in4, size );
|
||||
fugue512_full( &ctx.fugue, hash5, in5, size );
|
||||
fugue512_full( &ctx.fugue, hash6, in6, size );
|
||||
fugue512_full( &ctx.fugue, hash7, in7, size );
|
||||
if ( i == 0 )
|
||||
{
|
||||
fugue512_update( &ctx.fugue, in0 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in1 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash1 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in2 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash2 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in3 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash3 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in4 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash4 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in5 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash5 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in6 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash6 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in7 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash7 );
|
||||
}
|
||||
else
|
||||
{
|
||||
fugue512_full( &ctx.fugue, hash0, hash0, size );
|
||||
fugue512_full( &ctx.fugue, hash1, hash1, size );
|
||||
fugue512_full( &ctx.fugue, hash2, hash2, size );
|
||||
fugue512_full( &ctx.fugue, hash3, hash3, size );
|
||||
fugue512_full( &ctx.fugue, hash4, hash4, size );
|
||||
fugue512_full( &ctx.fugue, hash5, hash5, size );
|
||||
fugue512_full( &ctx.fugue, hash6, hash6, size );
|
||||
fugue512_full( &ctx.fugue, hash7, hash7, size );
|
||||
}
|
||||
break;
|
||||
case SHABAL:
|
||||
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||
@@ -588,7 +617,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
if ( !opt_quiet && !thr_id )
|
||||
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
@@ -626,7 +655,14 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
|
||||
case HAMSI:
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
hamsi512_8way_init( &x16rv2_ctx.hamsi );
|
||||
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
|
||||
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 );
|
||||
break;
|
||||
case FUGUE:
|
||||
v128_bswap32_80( edata, pdata );
|
||||
fugue512_init( &x16rv2_ctx.fugue );
|
||||
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
|
||||
intrlv_8x64( vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
break;
|
||||
case SHABAL:
|
||||
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
|
||||
@@ -824,8 +860,8 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
|
||||
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
|
||||
skein512_4way_init( &ctx.skein );
|
||||
skein512_4way_update( &ctx.skein, vhash, size );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
}
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
break;
|
||||
case LUFFA:
|
||||
@@ -945,7 +981,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
|
||||
break;
|
||||
case HAMSI:
|
||||
if ( i == 0 )
|
||||
hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
|
||||
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
|
||||
else
|
||||
{
|
||||
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
|
||||
@@ -956,10 +992,27 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
break;
|
||||
case FUGUE:
|
||||
fugue512_full( &ctx.fugue, hash0, in0, size );
|
||||
fugue512_full( &ctx.fugue, hash1, in1, size );
|
||||
fugue512_full( &ctx.fugue, hash2, in2, size );
|
||||
fugue512_full( &ctx.fugue, hash3, in3, size );
|
||||
if ( i == 0 )
|
||||
{
|
||||
fugue512_update( &ctx.fugue, in0 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in1 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash1 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in2 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash2 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in3 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash3 );
|
||||
}
|
||||
else
|
||||
{
|
||||
fugue512_full( &ctx.fugue, hash0, hash0, size );
|
||||
fugue512_full( &ctx.fugue, hash1, hash1, size );
|
||||
fugue512_full( &ctx.fugue, hash2, hash2, size );
|
||||
fugue512_full( &ctx.fugue, hash3, hash3, size );
|
||||
}
|
||||
break;
|
||||
case SHABAL:
|
||||
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
|
||||
@@ -1077,7 +1130,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
if ( !opt_quiet && !thr_id )
|
||||
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
@@ -1101,7 +1154,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
|
||||
break;
|
||||
case SKEIN:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
|
||||
skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
|
||||
break;
|
||||
case CUBEHASH:
|
||||
v128_bswap32_80( edata, pdata );
|
||||
@@ -1112,7 +1165,13 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
|
||||
case HAMSI:
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
hamsi512_4way_init( &x16rv2_ctx.hamsi );
|
||||
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
|
||||
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 );
|
||||
break;
|
||||
case FUGUE:
|
||||
v128_bswap32_80( edata, pdata );
|
||||
fugue512_init( &x16rv2_ctx.fugue );
|
||||
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
|
||||
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
|
||||
break;
|
||||
case SHABAL:
|
||||
v128_bswap32_intrlv80_4x32( vdata32, pdata );
|
||||
@@ -1151,4 +1210,453 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (X16RV2_2WAY)
|
||||
|
||||
union _x16rv2_2x64_context_overlay
|
||||
{
|
||||
blake512_2x64_context blake;
|
||||
bmw512_2x64_context bmw;
|
||||
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
#endif
|
||||
skein512_2x64_context skein;
|
||||
jh512_2x64_context jh;
|
||||
keccak512_2x64_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
shavite512_context shavite;
|
||||
simd512_context simd;
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
hashState_echo echo;
|
||||
#else
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
hamsi_2x64_context hamsi;
|
||||
#else
|
||||
sph_hamsi512_context hamsi;
|
||||
#endif
|
||||
#if defined(__AES__)
|
||||
hashState_fugue fugue;
|
||||
#else
|
||||
sph_fugue512_context fugue;
|
||||
#endif
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sha512_2x64_context sha512;
|
||||
sph_tiger_context tiger;
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef union _x16rv2_2x64_context_overlay x16rv2_2x64_context_overlay;
|
||||
|
||||
static __thread x16rv2_2x64_context_overlay x16rv2_ctx;
|
||||
|
||||
// Pad the 24 byte Tiger hash to 64 bytes
|
||||
static inline void padtiger512( uint32_t* hash )
|
||||
{
|
||||
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
|
||||
}
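
x16rv2 inserts a Tiger hash of the data ahead of Keccak, Luffa and SHA-512, and the 24-byte Tiger digest is zero-extended to 64 bytes either with padtiger512() or with the equivalent inline loops further down. A standalone sketch of that padding (illustrative only):

#include <stdint.h>
#include <stdio.h>

// Zero hash[6..15] so a 24-byte Tiger digest in hash[0..5] becomes a full
// 64-byte block for the following 512-bit function, as padtiger512() does.
static void pad24to64( uint32_t *hash )
{
   for ( int i = 6; i < 16; i++ ) hash[i] = 0;
}

int main(void)
{
   uint32_t h[16];
   for ( int i = 0; i < 16; i++ ) h[i] = 0xffffffff;   // stand-in for digest + stale data
   pad24to64( h );
   for ( int i = 0; i < 16; i++ ) printf( "%08x%c", h[i], i == 15 ? '\n' : ' ' );
   return 0;
}
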
|
||||
|
||||
int x16rv2_2x64_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[20] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[20] __attribute__ ((aligned (32)));
|
||||
x16rv2_2x64_context_overlay ctx;
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
|
||||
void *in0 = (void*) hash0;
|
||||
void *in1 = (void*) hash1;
|
||||
int size = 80;
|
||||
|
||||
dintrlv_2x64( hash0, hash1, input, 640 );
|
||||
|
||||
for ( int i = 0; i < 16; i++ )
|
||||
{
|
||||
const char elem = x16r_hash_order[i];
|
||||
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
|
||||
|
||||
switch ( algo )
|
||||
{
|
||||
case BLAKE:
|
||||
if ( i == 0 )
|
||||
blake512_2x64_full( &ctx.blake, vhash, input, size );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, in0, in1, size<<3 );
|
||||
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
|
||||
}
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
case BMW:
|
||||
bmw512_2x64_init( &ctx.bmw );
|
||||
if ( i == 0 )
|
||||
bmw512_2x64_update( &ctx.bmw, input, size );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, in0, in1, size<<3 );
|
||||
bmw512_2x64_update( &ctx.bmw, vhash, size );
|
||||
}
|
||||
bmw512_2x64_close( &ctx.bmw, vhash );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
case GROESTL:
|
||||
#if defined(__AES__)
|
||||
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
|
||||
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, in0, size );
|
||||
sph_groestl512_close( &ctx.groestl, hash0 );
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, in1, size );
|
||||
sph_groestl512_close( &ctx.groestl, hash1 );
|
||||
#endif
|
||||
break;
|
||||
case JH:
|
||||
if ( i == 0 )
|
||||
jh512_2x64_update( &ctx.jh, input + (64<<1), 16 );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, in0, in1, size<<3 );
|
||||
jh512_2x64_init( &ctx.jh );
|
||||
jh512_2x64_update( &ctx.jh, vhash, size );
|
||||
}
|
||||
jh512_2x64_close( &ctx.jh, vhash );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
case KECCAK:
|
||||
if ( i == 0 )
|
||||
{
|
||||
sph_tiger( &ctx.tiger, in0 + 64, 16 );
|
||||
sph_tiger_close( &ctx.tiger, hash0 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
|
||||
sph_tiger( &ctx.tiger, in1 + 64, 16 );
|
||||
sph_tiger_close( &ctx.tiger, hash1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_tiger_init( &ctx.tiger );
|
||||
sph_tiger( &ctx.tiger, in0, size );
|
||||
sph_tiger_close( &ctx.tiger, hash0 );
|
||||
sph_tiger_init( &ctx.tiger );
|
||||
sph_tiger( &ctx.tiger, in1, size );
|
||||
sph_tiger_close( &ctx.tiger, hash1 );
|
||||
}
|
||||
for ( int i = (24/4); i < (64/4); i++ )
|
||||
hash0[i] = hash1[i] = 0;
|
||||
|
||||
intrlv_2x64( vhash, hash0, hash1, 512 );
|
||||
keccak512_2x64_init( &ctx.keccak );
|
||||
keccak512_2x64_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_2x64_close( &ctx.keccak, vhash );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
case SKEIN:
|
||||
if ( i == 0 )
|
||||
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, in0, in1, size<<3 );
|
||||
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
|
||||
}
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
case LUFFA:
|
||||
if ( i == 0 )
|
||||
{
|
||||
sph_tiger( &ctx.tiger, in0 + 64, 16 );
|
||||
sph_tiger_close( &ctx.tiger, hash0 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
|
||||
sph_tiger( &ctx.tiger, in1 + 64, 16 );
|
||||
sph_tiger_close( &ctx.tiger, hash1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_tiger_init( &ctx.tiger );
|
||||
sph_tiger( &ctx.tiger, in0, size );
|
||||
sph_tiger_close( &ctx.tiger, hash0 );
|
||||
sph_tiger_init( &ctx.tiger );
|
||||
sph_tiger( &ctx.tiger, in1, size );
|
||||
sph_tiger_close( &ctx.tiger, hash1 );
|
||||
}
|
||||
for ( int i = (24/4); i < (64/4); i++ )
|
||||
hash0[i] = hash1[i] = 0;
|
||||
luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
|
||||
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
|
||||
break;
|
||||
case CUBEHASH:
|
||||
if ( i == 0 )
|
||||
{
|
||||
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
|
||||
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
|
||||
}
|
||||
else
|
||||
{
|
||||
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
|
||||
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
|
||||
}
|
||||
break;
|
||||
case SHAVITE:
|
||||
shavite512_full( &ctx.shavite, hash0, in0, size );
|
||||
shavite512_full( &ctx.shavite, hash1, in1, size );
|
||||
break;
|
||||
case SIMD:
|
||||
simd512_ctx( &ctx.simd, hash0, in0, size );
|
||||
simd512_ctx( &ctx.simd, hash1, in1, size );
|
||||
break;
|
||||
case ECHO:
|
||||
#if defined(__AES__)
|
||||
echo_full( &ctx.echo, hash0, 512, in0, size );
|
||||
echo_full( &ctx.echo, hash1, 512, in1, size );
|
||||
#else
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, in0, size );
|
||||
sph_echo512_close( &ctx.echo, hash0 );
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, in1, size );
|
||||
sph_echo512_close( &ctx.echo, hash1 );
|
||||
#endif
|
||||
break;
|
||||
case HAMSI:
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
if ( i == 0 )
|
||||
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
|
||||
else
|
||||
{
|
||||
intrlv_2x64( vhash, hash0, hash1, size<<3 );
|
||||
hamsi512_2x64_init( &ctx.hamsi );
|
||||
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
|
||||
}
|
||||
hamsi512_2x64_close( &ctx.hamsi, vhash );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
#else
|
||||
if ( i == 0 )
|
||||
{
|
||||
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash0 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
|
||||
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_hamsi512_init( &ctx.hamsi );
|
||||
sph_hamsi512( &ctx.hamsi, hash0, size );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash0 );
|
||||
sph_hamsi512_init( &ctx.hamsi );
|
||||
sph_hamsi512( &ctx.hamsi, hash1, size );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash1 );
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
case FUGUE:
|
||||
#if defined(__AES__)
|
||||
if ( i == 0 )
|
||||
{
|
||||
fugue512_update( &ctx.fugue, in0 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
|
||||
fugue512_update( &ctx.fugue, in1 + 76, 4 );
|
||||
fugue512_final( &ctx.fugue, hash1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
fugue512_full( &ctx.fugue, hash0, hash0, size );
|
||||
fugue512_full( &ctx.fugue, hash1, hash1, size );
|
||||
}
|
||||
#else
|
||||
if ( i == 0 )
|
||||
{
|
||||
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
|
||||
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
case SHABAL:
|
||||
if ( i == 0 )
|
||||
{
|
||||
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
|
||||
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_shabal512_init( &ctx.shabal );
|
||||
sph_shabal512( &ctx.shabal, hash0, size );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
sph_shabal512_init( &ctx.shabal );
|
||||
sph_shabal512( &ctx.shabal, hash1, size );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
}
|
||||
break;
|
||||
case WHIRLPOOL:
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size );
|
||||
break;
|
||||
case SHA_512:
|
||||
if ( i == 0 )
|
||||
{
|
||||
sph_tiger( &ctx.tiger, in0 + 64, 16 );
|
||||
sph_tiger_close( &ctx.tiger, hash0 );
|
||||
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
|
||||
sph_tiger( &ctx.tiger, in1 + 64, 16 );
|
||||
sph_tiger_close( &ctx.tiger, hash1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_tiger_init( &ctx.tiger );
|
||||
sph_tiger( &ctx.tiger, in0, size );
|
||||
sph_tiger_close( &ctx.tiger, hash0 );
|
||||
sph_tiger_init( &ctx.tiger );
|
||||
sph_tiger( &ctx.tiger, in1, size );
|
||||
sph_tiger_close( &ctx.tiger, hash1 );
|
||||
}
|
||||
for ( int i = (24/4); i < (64/4); i++ )
|
||||
hash0[i] = hash1[i] = 0;
|
||||
|
||||
intrlv_2x64( vhash, hash0, hash1, 512 );
|
||||
sha512_2x64_init( &ctx.sha512 );
|
||||
sha512_2x64_update( &ctx.sha512, vhash, 64 );
|
||||
sha512_2x64_close( &ctx.sha512, vhash );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
break;
|
||||
}
|
||||
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
size = 64;
|
||||
}
|
||||
memcpy( output, hash0, 32 );
|
||||
memcpy( output+32, hash1, 32 );
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[2*16] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
|
||||
uint32_t edata[20];
|
||||
uint32_t bedata1[2];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
v128_t *noncev = (v128_t*)vdata + 9;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0fff;
|
||||
|
||||
bedata1[0] = bswap_32( pdata[1] );
|
||||
bedata1[1] = bswap_32( pdata[2] );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
const uint32_t ntime = bswap_32(pdata[17]);
|
||||
if ( s_ntime != ntime )
|
||||
{
|
||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( !opt_quiet && !thr_id )
|
||||
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
// Do midstate prehash of the constant part of the block header for the first
// function in the hash order; how much is absorbed (64, 72 or 76 bytes)
// depends on that function's block size.
|
||||
const char elem = x16r_hash_order[0];
|
||||
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
|
||||
switch ( algo )
|
||||
{
|
||||
case JH:
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
jh512_2x64_init( &x16rv2_ctx.jh );
|
||||
jh512_2x64_update( &x16rv2_ctx.jh, vdata, 64 );
|
||||
break;
|
||||
case KECCAK:
|
||||
case LUFFA:
|
||||
case SHA_512:
|
||||
v128_bswap32_80( edata, pdata );
|
||||
sph_tiger_init( &x16rv2_ctx.tiger );
|
||||
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
|
||||
intrlv_2x64( vdata, edata, edata, 640 );
|
||||
break;
|
||||
case SKEIN:
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
skein512_2x64_prehash64( &x16rv2_ctx.skein, vdata );
|
||||
break;
|
||||
case CUBEHASH:
|
||||
v128_bswap32_80( edata, pdata );
|
||||
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdate( &x16rv2_ctx.cube, edata, 64 );
|
||||
intrlv_2x64( vdata, edata, edata, 640 );
|
||||
break;
|
||||
case HAMSI:
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
hamsi512_2x64_init( &x16rv2_ctx.hamsi );
|
||||
hamsi512_2x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
|
||||
#else
|
||||
v128_bswap32_80( edata, pdata );
|
||||
sph_hamsi512_init( &x16rv2_ctx.hamsi );
|
||||
sph_hamsi512( &x16rv2_ctx.hamsi, edata, 72 );
|
||||
intrlv_2x64( vdata, edata, edata, 640 );
|
||||
#endif
|
||||
break;
|
||||
case FUGUE:
|
||||
v128_bswap32_80( edata, pdata );
|
||||
#if defined(__AES__)
|
||||
fugue512_init( &x16rv2_ctx.fugue );
|
||||
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
|
||||
#else
|
||||
sph_fugue512_init( &x16rv2_ctx.fugue );
|
||||
sph_fugue512( &x16rv2_ctx.fugue, edata, 76 );
|
||||
#endif
|
||||
intrlv_2x64( vdata, edata, edata, 640 );
|
||||
break;
|
||||
case SHABAL:
|
||||
v128_bswap32_80( edata, pdata );
|
||||
sph_shabal512_init( &x16rv2_ctx.shabal );
|
||||
sph_shabal512( &x16rv2_ctx.shabal, edata, 64);
|
||||
intrlv_2x64( vdata, edata, edata, 640 );
|
||||
break;
|
||||
default:
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
}
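   // Added note (not part of the diff): this switch runs once per scanhash
   // call, before the nonce loop. The first function in the x16r hash order
   // absorbs the nonce-free prefix of the 80-byte block header (64, 72 or 76
   // bytes depending on its block size), so that work is not repeated for
   // every nonce. A minimal sketch of the general pattern, using hypothetical
   // names (algo_ctx, algo_init/update/final are placeholders, not
   // cpuminer-opt APIs):
   //
   //    algo_ctx midstate;
   //    algo_init( &midstate );
   //    algo_update( &midstate, header, 64 );     // once per job
   //    for ( uint32_t n = first_nonce; n < last_nonce; n++ )
   //    {
   //       algo_ctx c = midstate;                 // cheap struct copy
   //       header[19] = n;                        // nonce is header word 19
   //       algo_update( &c, &header[16], 16 );    // final 16 bytes
   //       algo_final( &c, hash );                // then the rest of the chain
   //    }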
|
||||
|
||||
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||
|
||||
do
|
||||
{
|
||||
if ( x16rv2_2x64_hash( hash, vdata, thr_id ) )
|
||||
for ( int i = 0; i < 2; i++ )
|
||||
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n+i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
|
||||
n += 2;
|
||||
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
@@ -6,21 +6,15 @@
|
||||
*/
|
||||
#include "x16r-gate.h"
|
||||
|
||||
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
|
||||
#if !defined(X16RV2_8WAY) && !defined(X16RV2_4WAY) && !defined(X16RV2_2WAY)
|
||||
|
||||
#include "algo/tiger/sph_tiger.h"
|
||||
|
||||
union _x16rv2_context_overlay
|
||||
{
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
hashState_fugue fugue;
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
sph_fugue512_context fugue;
|
||||
#endif
|
||||
blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
sph_skein512_context skein;
|
||||
@@ -29,11 +23,7 @@ union _x16rv2_context_overlay
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
shavite512_context shavite;
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_context simd;
|
||||
#else
|
||||
hashState_sd simd;
|
||||
#endif
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
@@ -72,15 +62,9 @@ int x16rv2_hash( void* output, const void* input, int thrid )
|
||||
sph_bmw512_close(&ctx.bmw, hash);
|
||||
break;
|
||||
case GROESTL:
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)in, size<<3 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, in, size );
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#endif
|
||||
break;
|
||||
case SKEIN:
|
||||
sph_skein512_init( &ctx.skein );
|
||||
@@ -117,25 +101,14 @@ int x16rv2_hash( void* output, const void* input, int thrid )
|
||||
shavite512_full( &ctx.shavite, hash, in, size );
|
||||
break;
|
||||
case SIMD:
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_init( &ctx.simd );
|
||||
sph_simd512(&ctx.simd, (const void*) hash, 64);
|
||||
sph_simd512(&ctx.simd, hash, 64);
|
||||
sph_simd512_close(&ctx.simd, hash);
|
||||
#else
|
||||
simd_full( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence*)in, size<<3 );
|
||||
#endif
|
||||
break;
|
||||
case ECHO:
|
||||
#if defined(__AES__)
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash,
|
||||
(const BitSequence*)in, size<<3 );
|
||||
#else
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, in, size );
|
||||
sph_echo512_close( &ctx.echo, hash );
|
||||
#endif
|
||||
break;
|
||||
case HAMSI:
|
||||
sph_hamsi512_init( &ctx.hamsi );
|
||||
@@ -143,11 +116,7 @@ int x16rv2_hash( void* output, const void* input, int thrid )
|
||||
sph_hamsi512_close( &ctx.hamsi, hash );
|
||||
break;
|
||||
case FUGUE:
|
||||
#if defined(__AES__)
|
||||
fugue512_full( &ctx.fugue, hash, in, size );
|
||||
#else
|
||||
sph_fugue512_full( &ctx.fugue, hash, in, size );
|
||||
#endif
|
||||
break;
|
||||
case SHABAL:
|
||||
sph_shabal512_init( &ctx.shabal );
|
||||
|
@@ -9,6 +9,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "algo/haval/haval-hash-4way.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include "algo/tiger/sph_tiger.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/lyra2/lyra2.h"
|
||||
@@ -351,4 +352,119 @@ bool x21s_4way_thread_init()
|
||||
return x21s_4way_matrix;
|
||||
}
|
||||
|
||||
#elif defined (X21S_2WAY)
|
||||
|
||||
static __thread uint64_t* x21s_2x64_matrix;
|
||||
|
||||
union _x21s_2x64_context_overlay
|
||||
{
|
||||
sph_haval256_5_context haval;
|
||||
sph_tiger_context tiger;
|
||||
sph_gost512_context gost;
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef union _x21s_2x64_context_overlay x21s_2x64_context_overlay;
|
||||
|
||||
int x21s_2x64_hash( void* output, const void* input, int thrid )
|
||||
{
|
||||
uint8_t shash[64*2] __attribute__ ((aligned (64)));
|
||||
x21s_2x64_context_overlay ctx;
|
||||
uint32_t *hash0 = (uint32_t*) shash;
|
||||
uint32_t *hash1 = (uint32_t*)( shash+64 );
|
||||
|
||||
if ( !x16r_2x64_hash_generic( shash, input, thrid ) )
|
||||
return 0;
|
||||
|
||||
sph_haval256_5_init( &ctx.haval );
|
||||
sph_haval256_5( &ctx.haval, hash0, 64 );
|
||||
sph_haval256_5_close( &ctx.haval, hash0 );
|
||||
sph_haval256_5_init( &ctx.haval );
|
||||
sph_haval256_5( &ctx.haval, hash1, 64 );
|
||||
sph_haval256_5_close( &ctx.haval, hash1 );
|
||||
|
||||
sph_tiger_init( &ctx.tiger );
|
||||
sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
|
||||
sph_tiger_close( &ctx.tiger, (void*) hash0 );
|
||||
sph_tiger_init( &ctx.tiger );
|
||||
sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
|
||||
sph_tiger_close( &ctx.tiger, (void*) hash1 );
|
||||
|
||||
LYRA2REV2( x21s_2x64_matrix, (void*) hash0, 32, (const void*) hash0, 32,
|
||||
(const void*) hash0, 32, 1, 4, 4 );
|
||||
LYRA2REV2( x21s_2x64_matrix, (void*) hash1, 32, (const void*) hash1, 32,
|
||||
(const void*) hash1, 32, 1, 4, 4 );
|
||||
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, (void*) hash0 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
|
||||
sph_gost512_close( &ctx.gost, (void*) hash1 );
|
||||
|
||||
sha256_full( output, hash0, 64 );
|
||||
sha256_full( output+32, hash1, 64 );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[16*2] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
|
||||
uint32_t bedata1[2] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
v128_t *noncev = (v128_t*)vdata + 9;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
||||
if ( bench ) ptarget[7] = 0x0cff;
|
||||
|
||||
bedata1[0] = bswap_32( pdata[1] );
|
||||
bedata1[1] = bswap_32( pdata[2] );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
uint32_t ntime = bswap_32( pdata[17] );
|
||||
if ( s_ntime != ntime )
|
||||
{
|
||||
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
x16r_2x64_prehash( vdata, pdata );
|
||||
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
if ( x21s_2x64_hash( hash, vdata, thr_id ) )
|
||||
for ( int i = 0; i < 2; i++ )
|
||||
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n+i );
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
|
||||
n += 2;
|
||||
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool x21s_2x64_thread_init()
{
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4;   // nCols
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

   const int size = (int64_t)ROW_LEN_BYTES * 4;         // nRows
   x21s_2x64_matrix = mm_malloc( size, 64 );
   return x21s_2x64_matrix;
}
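// Added note (not part of the diff): LYRA2REV2 above is called with
// ( timecost, nRows, nCols ) = ( 1, 4, 4 ), so the per-thread scratch matrix
// is nRows * nCols * BLOCK_LEN_INT64 64-bit words. Assuming the reference
// Lyra2 value BLOCK_LEN_INT64 == 12, that is 4 * 4 * 12 * 8 = 1536 bytes,
// allocated 64-byte aligned.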
|
||||
|
||||
#endif
|
||||
|
@@ -15,7 +15,7 @@
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/lyra2/lyra2.h"
|
||||
|
||||
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
|
||||
#if !defined(X21S_8WAY) && !defined(X21S_4WAY)
|
||||
|
||||
static __thread uint64_t* x21s_matrix;
|
||||
|
||||
|
@@ -931,15 +931,19 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
|
||||
// Need sph in some cases
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
//#include "algo/simd/sph_simd.h"
|
||||
//#include "algo/simd/nist.h"
|
||||
#if !( defined(__SSE4_2__) || defined(__ARM_NEON) )
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#endif
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
//#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
|
||||
#if !( defined(__AES__) ) //|| defined(__ARM_FEATURE_AES) )
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#endif
|
||||
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
|
||||
#include "algo/echo/sph_echo.h"
|
||||
//#endif
|
||||
#endif
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
|
||||
union _x17_context_overlay
|
||||
@@ -967,12 +971,8 @@ union _x17_context_overlay
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
#if defined(__x86_64__)
|
||||
simd512_context simd;
|
||||
#else
|
||||
sph_simd512_context simd;
|
||||
#endif
|
||||
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
hamsi_2x64_context hamsi;
|
||||
#else
|
||||
sph_hamsi512_context hamsi;
|
||||
@@ -1033,17 +1033,8 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
|
||||
#if defined(__x86_64__)
|
||||
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
|
||||
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
|
||||
#else
|
||||
sph_simd512_init( &ctx.simd );
|
||||
sph_simd512( &ctx.simd, hash0, 64 );
|
||||
sph_simd512_close( &ctx.simd, hash0 );
|
||||
sph_simd512_init( &ctx.simd );
|
||||
sph_simd512( &ctx.simd, hash1, 64 );
|
||||
sph_simd512_close( &ctx.simd, hash1 );
|
||||
#endif
|
||||
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
|
||||
@@ -1057,7 +1048,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
|
||||
sph_echo512_close( &ctx.echo, hash1 );
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
intrlv_2x64( vhash, hash0, hash1, 512 );
|
||||
hamsi512_2x64_ctx( &ctx.hamsi, vhash, vhash, 64 );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
@@ -1142,14 +1133,12 @@ int scanhash_x17_2x64( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
|
||||
{
|
||||
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,0);
|
||||
pdata[19] = bswap_32( n );
|
||||
// pdata[19] = n;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
|
||||
{
|
||||
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,1);
|
||||
pdata[19] = bswap_32( n+1 );
|
||||
submit_solution( work, hash+8, mythr );
|
||||
}
|
||||
|
@@ -5,24 +5,23 @@
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/fugue/fugue-aesni.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#endif
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#else
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#if defined(__aarch64__)
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#else
|
||||
#include "algo/simd/nist.h"
|
||||
#endif
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
@@ -41,12 +40,15 @@ union _x22i_context_overlay
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_groestl groestl;
|
||||
hashState_echo echo;
|
||||
hashState_fugue fugue;
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
sph_fugue512_context fugue;
|
||||
#endif
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
hashState_echo echo;
|
||||
#else
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
sph_jh512_context jh;
|
||||
sph_keccak512_context keccak;
|
||||
@@ -54,11 +56,7 @@ union _x22i_context_overlay
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_context simd;
|
||||
#else
|
||||
hashState_sd simd;
|
||||
#endif
|
||||
simd512_context simd;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
@@ -84,9 +82,7 @@ int x22i_hash( void *output, const void *input, int thrid )
|
||||
sph_bmw512_close(&ctx.bmw, hash);
|
||||
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
groestl512_full( &ctx.groestl, hash, hash, 512 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, hash, 64 );
|
||||
@@ -109,26 +105,16 @@ int x22i_hash( void *output, const void *input, int thrid )
|
||||
|
||||
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
|
||||
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
|
||||
|
||||
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
|
||||
|
||||
sph_shavite512_init(&ctx.shavite);
|
||||
sph_shavite512(&ctx.shavite, (const void*) hash, 64);
|
||||
sph_shavite512_close(&ctx.shavite, hash);
|
||||
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_init(&ctx.simd );
|
||||
sph_simd512(&ctx.simd, (const void*) hash, 64);
|
||||
sph_simd512_close(&ctx.simd, hash);
|
||||
#else
|
||||
simd_full( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
#endif
|
||||
simd512_ctx( &ctx.simd, hash, hash, 64 );
|
||||
|
||||
#if defined(__AES__)
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo ( &ctx.echo, (BitSequence*)hash,
|
||||
(const BitSequence*)hash, 512 );
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
echo_full( &ctx.echo, hash, 512, hash, 64 );
|
||||
#else
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, hash, 64 );
|
||||
@@ -192,8 +178,8 @@ int x22i_hash( void *output, const void *input, int thrid )
|
||||
int scanhash_x22i( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t edata[20] __attribute__((aligned(64)));
|
||||
uint32_t hash64[8] __attribute__((aligned(64)));
|
||||
uint32_t edata[20] __attribute__((aligned(32)));
|
||||
uint32_t hash64[8] __attribute__((aligned(32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
|
@@ -5,24 +5,23 @@
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/fugue/fugue-aesni.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#endif
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#else
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#if defined(__aarch64__)
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#else
|
||||
#include "algo/simd/nist.h"
|
||||
#endif
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
@@ -44,12 +43,15 @@ union _x25x_context_overlay
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_groestl groestl;
|
||||
hashState_echo echo;
|
||||
hashState_fugue fugue;
|
||||
#else
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
sph_fugue512_context fugue;
|
||||
#endif
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
hashState_echo echo;
|
||||
#else
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
sph_jh512_context jh;
|
||||
sph_keccak512_context keccak;
|
||||
@@ -57,11 +59,7 @@ union _x25x_context_overlay
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_context simd;
|
||||
#else
|
||||
hashState_sd simd;
|
||||
#endif
|
||||
simd512_context simd;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
@@ -89,9 +87,7 @@ int x25x_hash( void *output, const void *input, int thrid )
|
||||
sph_bmw512_close(&ctx.bmw, &hash[1]);
|
||||
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)&hash[2],
|
||||
(const char*)&hash[1], 512 );
|
||||
groestl512_full( &ctx.groestl, (void*)&hash[2], (const void*)&hash[1], 512 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, &hash[1], 64 );
|
||||
@@ -112,28 +108,18 @@ int x25x_hash( void *output, const void *input, int thrid )
|
||||
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
init_luffa( &ctx.luffa, 512 );
|
||||
luffa_full( &ctx.luffa, &hash[6], 512, &hash[5], 64 );
|
||||
luffa_full( &ctx.luffa, (void*)&hash[6], 512, (const void*)&hash[5], 64 );
|
||||
|
||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, &hash[7], &hash[6], 64 );
|
||||
cubehash_full( &ctx.cube, (void*)&hash[7], 512, (const void*)&hash[6], 64 );
|
||||
|
||||
sph_shavite512_init(&ctx.shavite);
|
||||
sph_shavite512(&ctx.shavite, (const void*) &hash[7], 64);
|
||||
sph_shavite512_close(&ctx.shavite, &hash[8]);
|
||||
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512(&ctx.simd, (const void*) &hash[8], 64);
|
||||
sph_simd512_close(&ctx.simd, &hash[9] );
|
||||
#else
|
||||
update_final_sd( &ctx.simd, (BitSequence *)&hash[9],
|
||||
(const BitSequence *)&hash[8], 512 );
|
||||
#endif
|
||||
simd512_ctx( &ctx.simd, (void*)&hash[9], (const void*)&hash[8], 64 );
|
||||
|
||||
#if defined(__AES__)
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo ( &ctx.echo, (BitSequence*)&hash[10],
|
||||
(const BitSequence*)&hash[9], 512 );
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 );
|
||||
#else
|
||||
sph_echo512_init( &ctx.echo );
|
||||
sph_echo512( &ctx.echo, &hash[9], 64 );
|
||||
@@ -227,8 +213,8 @@ int x25x_hash( void *output, const void *input, int thrid )
|
||||
int scanhash_x25x( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t edata[20] __attribute__((aligned(64)));
|
||||
uint32_t hash64[8] __attribute__((aligned(64)));
|
||||
uint32_t edata[20] __attribute__((aligned(32)));
|
||||
uint32_t hash64[8] __attribute__((aligned(32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
@@ -245,7 +231,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce,
|
||||
do
|
||||
{
|
||||
edata[19] = n;
|
||||
if ( x25x_hash( hash64, edata, thr_id ) )
|
||||
if ( x25x_hash( hash64, edata, thr_id ) );
|
||||
if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n );
|
||||
|
configure (vendored, 20 lines changed)
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.8.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.12.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
|
||||
@@ -608,8 +608,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='23.8'
|
||||
PACKAGE_STRING='cpuminer-opt 23.8'
|
||||
PACKAGE_VERSION='23.12'
|
||||
PACKAGE_STRING='cpuminer-opt 23.12'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 23.8 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 23.12 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1432,7 +1432,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 23.8:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 23.12:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1538,7 +1538,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 23.8
|
||||
cpuminer-opt configure 23.12
|
||||
generated by GNU Autoconf 2.71
|
||||
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 23.8, which was
|
||||
It was created by cpuminer-opt $as_me 23.12, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
$ $0$ac_configure_args_raw
|
||||
@@ -3593,7 +3593,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='23.8'
|
||||
VERSION='23.12'
|
||||
|
||||
|
||||
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
|
||||
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 23.8, which was
|
||||
This file was extended by cpuminer-opt $as_me 23.12, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config='$ac_cs_config_escaped'
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 23.8
|
||||
cpuminer-opt config.status 23.12
|
||||
configured by $0, generated by GNU Autoconf 2.71,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [23.8])
|
||||
AC_INIT([cpuminer-opt], [23.12])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
configure~ (4369 lines changed): file diff suppressed because it is too large
@@ -2968,8 +2968,12 @@ static bool cpu_capability( bool display_only )
|
||||
printf(" Linux\n");
|
||||
#elif defined(WIN32)
|
||||
printf(" Windows\n");
|
||||
#elif defined(__APPLE__)
|
||||
printf(" MacOS\n");
|
||||
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
|
||||
printf(" Unix\n");
|
||||
#else
|
||||
printf("\n");
|
||||
printf("\n");
|
||||
#endif
|
||||
|
||||
printf("CPU features: ");
|
||||
|
miner.h (6 lines changed)
@@ -3,12 +3,16 @@
|
||||
|
||||
#include <cpuminer-config.h>
|
||||
|
||||
#if !( defined(__SSE2__) || ( defined(__aarch64__) && defined(__ARM_NEON) ) )
|
||||
#warning "Unknown or unsupported CPU, requires x86_64 with SSE2 or AArch64 with NEON."
|
||||
#endif
|
||||
|
||||
#if defined(__x86_64__)
|
||||
#define USER_AGENT_ARCH "x64" // Intel, AMD x86_64
|
||||
#elif defined(__aarch64__)
|
||||
#define USER_AGENT_ARCH "arm" // AArch64
|
||||
//#elif
|
||||
// #define USER_AGENT_ARCH "R5" // RISC-V
|
||||
// #define USER_AGENT_ARCH "r5" // RISC-V
|
||||
#else
|
||||
#define USER_AGENT_ARCH
|
||||
#endif
|
||||
|
@@ -411,11 +411,11 @@ static inline void v128_bswap32_80( void *d, void *s )
|
||||
{
|
||||
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf );
|
||||
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), bswap_shuf );
|
||||
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), bswap_shuf );
|
||||
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), bswap_shuf );
|
||||
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf );
|
||||
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), bswap_shuf );
|
||||
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), bswap_shuf );
|
||||
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), bswap_shuf );
|
||||
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), bswap_shuf );
|
||||
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), bswap_shuf );
|
||||
}
|
||||
|
||||
#elif defined(__aarch64__) && defined(__ARM_NEON)
|
||||
@@ -461,11 +461,11 @@ static inline void v128_bswap32_80( void *d, void *s )
|
||||
|
||||
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
|
||||
{
|
||||
v128_t s0 = casti_m128i( src,0 );
|
||||
v128_t s1 = casti_m128i( src,1 );
|
||||
v128_t s2 = casti_m128i( src,2 );
|
||||
v128_t s3 = casti_m128i( src,3 );
|
||||
v128_t s4 = casti_m128i( src,4 );
|
||||
v128_t s0 = casti_v128( src,0 );
|
||||
v128_t s1 = casti_v128( src,1 );
|
||||
v128_t s2 = casti_v128( src,2 );
|
||||
v128_t s3 = casti_v128( src,3 );
|
||||
v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
@@ -480,38 +480,38 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
|
||||
|
||||
#else
|
||||
|
||||
s0 = mm128_bswap_32( s0 );
|
||||
s1 = mm128_bswap_32( s1 );
|
||||
s2 = mm128_bswap_32( s2 );
|
||||
s3 = mm128_bswap_32( s3 );
|
||||
s4 = mm128_bswap_32( s4 );
|
||||
s0 = v128_bswap32( s0 );
|
||||
s1 = v128_bswap32( s1 );
|
||||
s2 = v128_bswap32( s2 );
|
||||
s3 = v128_bswap32( s3 );
|
||||
s4 = v128_bswap32( s4 );
|
||||
|
||||
#endif
|
||||
|
||||
casti_m128i( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
|
||||
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
|
||||
casti_m128i( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
|
||||
casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
|
||||
casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
|
||||
casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
|
||||
casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
|
||||
casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
|
||||
|
||||
casti_m128i( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
|
||||
casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
|
||||
casti_m128i( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
|
||||
casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
|
||||
casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
|
||||
casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
|
||||
casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
|
||||
casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
|
||||
|
||||
casti_m128i( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
|
||||
casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
|
||||
casti_m128i( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
|
||||
casti_m128i( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
|
||||
casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
|
||||
casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
|
||||
casti_v128( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
|
||||
casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
|
||||
|
||||
casti_m128i( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
|
||||
casti_m128i( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
|
||||
casti_m128i( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
|
||||
casti_m128i( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
|
||||
casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
|
||||
casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
|
||||
casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
|
||||
casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
|
||||
|
||||
casti_m128i( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
|
||||
casti_m128i( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
|
||||
casti_m128i( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
|
||||
casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
|
||||
casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
|
||||
casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
|
||||
casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
|
||||
casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
|
||||
}
|
||||
|
||||
#elif defined(__aarch64__) && defined(__ARM_NEON)
|
||||
@@ -797,11 +797,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
|
||||
const __m256i c1 = v256_32( 0x04050607 );
|
||||
const __m256i c2 = v256_32( 0x08090a0b );
|
||||
const __m256i c3 = v256_32( 0x0c0d0e0f );
|
||||
const v128_t s0 = casti_m128i( src,0 );
|
||||
const v128_t s1 = casti_m128i( src,1 );
|
||||
const v128_t s2 = casti_m128i( src,2 );
|
||||
const v128_t s3 = casti_m128i( src,3 );
|
||||
const v128_t s4 = casti_m128i( src,4 );
|
||||
const v128_t s0 = casti_v128( src,0 );
|
||||
const v128_t s1 = casti_v128( src,1 );
|
||||
const v128_t s2 = casti_v128( src,2 );
|
||||
const v128_t s3 = casti_v128( src,3 );
|
||||
const v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
casti_m256i( d, 0 ) = _mm256_permutexvar_epi8( c0,
|
||||
_mm256_castsi128_si256( s0 ) );
|
||||
@@ -855,11 +855,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
|
||||
const __m256i c2 = _mm256_add_epi32( c1, c1 );
|
||||
const __m256i c3 = _mm256_add_epi32( c2, c1 );
|
||||
|
||||
v128_t s0 = casti_m128i( src,0 );
|
||||
v128_t s1 = casti_m128i( src,1 );
|
||||
v128_t s2 = casti_m128i( src,2 );
|
||||
v128_t s3 = casti_m128i( src,3 );
|
||||
v128_t s4 = casti_m128i( src,4 );
|
||||
v128_t s0 = casti_v128( src,0 );
|
||||
v128_t s1 = casti_v128( src,1 );
|
||||
v128_t s2 = casti_v128( src,2 );
|
||||
v128_t s3 = casti_v128( src,3 );
|
||||
v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
|
||||
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
|
||||
@@ -1303,11 +1303,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
||||
const __m512i c1 = v512_32( 0x04050607 );
|
||||
const __m512i c2 = v512_32( 0x08090a0b );
|
||||
const __m512i c3 = v512_32( 0x0c0d0e0f );
|
||||
const v128_t s0 = casti_m128i( src,0 );
|
||||
const v128_t s1 = casti_m128i( src,1 );
|
||||
const v128_t s2 = casti_m128i( src,2 );
|
||||
const v128_t s3 = casti_m128i( src,3 );
|
||||
const v128_t s4 = casti_m128i( src,4 );
|
||||
const v128_t s0 = casti_v128( src,0 );
|
||||
const v128_t s1 = casti_v128( src,1 );
|
||||
const v128_t s2 = casti_v128( src,2 );
|
||||
const v128_t s3 = casti_v128( src,3 );
|
||||
const v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
casti_m512i( d, 0 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
@@ -1360,11 +1360,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
||||
const __m512i c1 = v512_32( 1 );
|
||||
const __m512i c2 = _mm512_add_epi32( c1, c1 );
|
||||
const __m512i c3 = _mm512_add_epi32( c2, c1 );
|
||||
v128_t s0 = casti_m128i( src,0 );
|
||||
v128_t s1 = casti_m128i( src,1 );
|
||||
v128_t s2 = casti_m128i( src,2 );
|
||||
v128_t s3 = casti_m128i( src,3 );
|
||||
v128_t s4 = casti_m128i( src,4 );
|
||||
v128_t s0 = casti_v128( src,0 );
|
||||
v128_t s1 = casti_v128( src,1 );
|
||||
v128_t s2 = casti_v128( src,2 );
|
||||
v128_t s3 = casti_v128( src,3 );
|
||||
v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
|
||||
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
|
||||
@@ -1492,20 +1492,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
casti_m128i( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
|
||||
casti_m128i( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
|
||||
casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
|
||||
casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
|
||||
|
||||
casti_m128i( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
|
||||
casti_m128i( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
|
||||
casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
|
||||
casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
|
||||
|
||||
casti_m128i( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
|
||||
casti_m128i( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
|
||||
casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
|
||||
casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
|
||||
|
||||
casti_m128i( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
|
||||
casti_m128i( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
|
||||
casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
|
||||
casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
|
||||
|
||||
casti_m128i( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
|
||||
casti_m128i( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
|
||||
casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
|
||||
casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
|
||||
|
||||
#elif defined(__ARM_NEON)
|
||||
|
||||
@@ -1719,7 +1719,7 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
|
||||
{
|
||||
__m256i s0 = casti_m256i( src,0 );
|
||||
__m256i s1 = casti_m256i( src,1 );
|
||||
v128_t s4 = casti_m128i( src,4 );
|
||||
v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 );
|
||||
casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 );
|
||||
@@ -1747,11 +1747,11 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
|
||||
{
|
||||
const __m256i c0 = v256_64( 0x0405060700010203 );
|
||||
const __m256i c1 = v256_64( 0x0c0d0e0f08090a0b );
|
||||
const v128_t s0 = casti_m128i( src,0 );
|
||||
const v128_t s1 = casti_m128i( src,1 );
|
||||
const v128_t s2 = casti_m128i( src,2 );
|
||||
const v128_t s3 = casti_m128i( src,3 );
|
||||
const v128_t s4 = casti_m128i( src,4 );
|
||||
const v128_t s0 = casti_v128( src,0 );
|
||||
const v128_t s1 = casti_v128( src,1 );
|
||||
const v128_t s2 = casti_v128( src,2 );
|
||||
const v128_t s3 = casti_v128( src,3 );
|
||||
const v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
casti_m256i( d,0 ) = _mm256_permutexvar_epi8( c0,
|
||||
_mm256_castsi128_si256( s0 ) );
|
||||
@@ -1783,7 +1783,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
__m256i s0 = casti_m256i( src,0 );
|
||||
__m256i s1 = casti_m256i( src,1 );
|
||||
v128_t s4 = casti_m128i( src,4 );
|
||||
v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
s0 = _mm256_shuffle_epi8( s0, bswap_shuf );
|
||||
s1 = _mm256_shuffle_epi8( s1, bswap_shuf );
|
||||
@@ -2162,11 +2162,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
|
||||
{
|
||||
const __m512i c0 = v512_64( 0x0405060700010203 );
|
||||
const __m512i c1 = v512_64( 0x0c0d0e0f08090a0b );
|
||||
const v128_t s0 = casti_m128i( src,0 );
|
||||
const v128_t s1 = casti_m128i( src,1 );
|
||||
const v128_t s2 = casti_m128i( src,2 );
|
||||
const v128_t s3 = casti_m128i( src,3 );
|
||||
const v128_t s4 = casti_m128i( src,4 );
|
||||
const v128_t s0 = casti_v128( src,0 );
|
||||
const v128_t s1 = casti_v128( src,1 );
|
||||
const v128_t s2 = casti_v128( src,2 );
|
||||
const v128_t s3 = casti_v128( src,3 );
|
||||
const v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( c0,
|
||||
_mm512_castsi128_si512( s0 ) );
|
||||
@@ -2197,11 +2197,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
|
||||
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
const __m512i c1 = v512_64( 1 );
|
||||
v128_t s0 = casti_m128i( src,0 );
|
||||
v128_t s1 = casti_m128i( src,1 );
|
||||
v128_t s2 = casti_m128i( src,2 );
|
||||
v128_t s3 = casti_m128i( src,3 );
|
||||
v128_t s4 = casti_m128i( src,4 );
|
||||
v128_t s0 = casti_v128( src,0 );
|
||||
v128_t s1 = casti_v128( src,1 );
|
||||
v128_t s2 = casti_v128( src,2 );
|
||||
v128_t s3 = casti_v128( src,3 );
|
||||
v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
|
||||
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
|
||||
@@ -2391,11 +2391,11 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
|
||||
{
|
||||
const __m512i bswap_shuf = mm512_bcast_m128(
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
const v128_t s0 = casti_m128i( src,0 );
|
||||
const v128_t s1 = casti_m128i( src,1 );
|
||||
const v128_t s2 = casti_m128i( src,2 );
|
||||
const v128_t s3 = casti_m128i( src,3 );
|
||||
const v128_t s4 = casti_m128i( src,4 );
|
||||
const v128_t s0 = casti_v128( src,0 );
|
||||
const v128_t s1 = casti_v128( src,1 );
|
||||
const v128_t s2 = casti_v128( src,2 );
|
||||
const v128_t s3 = casti_v128( src,3 );
|
||||
const v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s0 ),
|
||||
bswap_shuf );
|
||||
@@ -2415,11 +2415,11 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
|
||||
{
|
||||
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
v128_t s0 = casti_m128i( src,0 );
|
||||
v128_t s1 = casti_m128i( src,1 );
|
||||
v128_t s2 = casti_m128i( src,2 );
|
||||
v128_t s3 = casti_m128i( src,3 );
|
||||
v128_t s4 = casti_m128i( src,4 );
|
||||
v128_t s0 = casti_v128( src,0 );
|
||||
v128_t s1 = casti_v128( src,1 );
|
||||
v128_t s2 = casti_v128( src,2 );
|
||||
v128_t s3 = casti_v128( src,3 );
|
||||
v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
|
||||
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
|
||||
@@ -2489,44 +2489,44 @@ static inline void rintrlv_4x64_4x32( void *dst, const void *src,
|
||||
const v128_t *s = (const v128_t*)src;
|
||||
v128_t *d = (v128_t*)dst;
|
||||
|
||||
d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 );
|
||||
d[ 1] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd );
|
||||
d[ 2] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 );
|
||||
d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd );
|
||||
d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 );
|
||||
d[ 5] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd );
|
||||
d[ 6] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 );
|
||||
d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd );
|
||||
d[ 0] = v128_shuffle2_32( s[ 0], s[ 1], 0x88 );
|
||||
d[ 1] = v128_shuffle2_32( s[ 0], s[ 1], 0xdd );
|
||||
d[ 2] = v128_shuffle2_32( s[ 2], s[ 3], 0x88 );
|
||||
d[ 3] = v128_shuffle2_32( s[ 2], s[ 3], 0xdd );
|
||||
d[ 4] = v128_shuffle2_32( s[ 4], s[ 5], 0x88 );
|
||||
d[ 5] = v128_shuffle2_32( s[ 4], s[ 5], 0xdd );
|
||||
d[ 6] = v128_shuffle2_32( s[ 6], s[ 7], 0x88 );
|
||||
d[ 7] = v128_shuffle2_32( s[ 6], s[ 7], 0xdd );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 );
|
||||
d[ 9] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd );
|
||||
d[10] = mm128_shuffle2_32( s[10], s[11], 0x88 );
|
||||
d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd );
|
||||
d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 );
|
||||
d[13] = mm128_shuffle2_32( s[12], s[13], 0xdd );
|
||||
d[14] = mm128_shuffle2_32( s[14], s[15], 0x88 );
|
||||
d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd );
|
||||
d[ 8] = v128_shuffle2_32( s[ 8], s[ 9], 0x88 );
|
||||
d[ 9] = v128_shuffle2_32( s[ 8], s[ 9], 0xdd );
|
||||
d[10] = v128_shuffle2_32( s[10], s[11], 0x88 );
|
||||
d[11] = v128_shuffle2_32( s[10], s[11], 0xdd );
|
||||
d[12] = v128_shuffle2_32( s[12], s[13], 0x88 );
|
||||
d[13] = v128_shuffle2_32( s[12], s[13], 0xdd );
|
||||
d[14] = v128_shuffle2_32( s[14], s[15], 0x88 );
|
||||
d[15] = v128_shuffle2_32( s[14], s[15], 0xdd );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 );
|
||||
d[17] = mm128_shuffle2_32( s[16], s[17], 0xdd );
|
||||
d[18] = mm128_shuffle2_32( s[18], s[19], 0x88 );
|
||||
d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd );
|
||||
d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 );
|
||||
d[21] = mm128_shuffle2_32( s[20], s[21], 0xdd );
|
||||
d[22] = mm128_shuffle2_32( s[22], s[23], 0x88 );
|
||||
d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd );
|
||||
d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 );
|
||||
d[25] = mm128_shuffle2_32( s[24], s[25], 0xdd );
|
||||
d[26] = mm128_shuffle2_32( s[26], s[27], 0x88 );
|
||||
d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd );
|
||||
d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 );
|
||||
d[29] = mm128_shuffle2_32( s[28], s[29], 0xdd );
|
||||
d[30] = mm128_shuffle2_32( s[30], s[31], 0x88 );
|
||||
d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd );
|
||||
d[16] = v128_shuffle2_32( s[16], s[17], 0x88 );
|
||||
d[17] = v128_shuffle2_32( s[16], s[17], 0xdd );
|
||||
d[18] = v128_shuffle2_32( s[18], s[19], 0x88 );
|
||||
d[19] = v128_shuffle2_32( s[18], s[19], 0xdd );
|
||||
d[20] = v128_shuffle2_32( s[20], s[21], 0x88 );
|
||||
d[21] = v128_shuffle2_32( s[20], s[21], 0xdd );
|
||||
d[22] = v128_shuffle2_32( s[22], s[23], 0x88 );
|
||||
d[23] = v128_shuffle2_32( s[22], s[23], 0xdd );
|
||||
d[24] = v128_shuffle2_32( s[24], s[25], 0x88 );
|
||||
d[25] = v128_shuffle2_32( s[24], s[25], 0xdd );
|
||||
d[26] = v128_shuffle2_32( s[26], s[27], 0x88 );
|
||||
d[27] = v128_shuffle2_32( s[26], s[27], 0xdd );
|
||||
d[28] = v128_shuffle2_32( s[28], s[29], 0x88 );
|
||||
d[29] = v128_shuffle2_32( s[28], s[29], 0xdd );
|
||||
d[30] = v128_shuffle2_32( s[30], s[31], 0x88 );
|
||||
d[31] = v128_shuffle2_32( s[30], s[31], 0xdd );
|
||||
|
||||
// if ( bit_len <= 1024 ) return;
|
||||
}
|
||||
@@ -2537,77 +2537,77 @@ static inline void rintrlv_8x64_8x32( void *dst, const void *src,
|
||||
const v128_t *s = (const v128_t*)src;
|
||||
v128_t *d = (v128_t*)dst;
|
||||
|
||||
d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 );
|
||||
d[ 1] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 );
|
||||
d[ 2] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd );
|
||||
d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd );
|
||||
d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 );
|
||||
d[ 5] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 );
|
||||
d[ 6] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd );
|
||||
d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd );
|
||||
d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 );
|
||||
d[ 9] = mm128_shuffle2_32( s[10], s[11], 0x88 );
|
||||
d[10] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd );
|
||||
d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd );
|
||||
d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 );
|
||||
d[13] = mm128_shuffle2_32( s[14], s[15], 0x88 );
|
||||
d[14] = mm128_shuffle2_32( s[12], s[13], 0xdd );
|
||||
d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd );
|
||||
d[ 0] = v128_shuffle2_32( s[ 0], s[ 1], 0x88 );
|
||||
d[ 1] = v128_shuffle2_32( s[ 2], s[ 3], 0x88 );
|
||||
d[ 2] = v128_shuffle2_32( s[ 0], s[ 1], 0xdd );
|
||||
d[ 3] = v128_shuffle2_32( s[ 2], s[ 3], 0xdd );
|
||||
d[ 4] = v128_shuffle2_32( s[ 4], s[ 5], 0x88 );
|
||||
d[ 5] = v128_shuffle2_32( s[ 6], s[ 7], 0x88 );
|
||||
d[ 6] = v128_shuffle2_32( s[ 4], s[ 5], 0xdd );
|
||||
d[ 7] = v128_shuffle2_32( s[ 6], s[ 7], 0xdd );
|
||||
d[ 8] = v128_shuffle2_32( s[ 8], s[ 9], 0x88 );
|
||||
d[ 9] = v128_shuffle2_32( s[10], s[11], 0x88 );
|
||||
d[10] = v128_shuffle2_32( s[ 8], s[ 9], 0xdd );
|
||||
d[11] = v128_shuffle2_32( s[10], s[11], 0xdd );
|
||||
d[12] = v128_shuffle2_32( s[12], s[13], 0x88 );
|
||||
d[13] = v128_shuffle2_32( s[14], s[15], 0x88 );
|
||||
d[14] = v128_shuffle2_32( s[12], s[13], 0xdd );
|
||||
d[15] = v128_shuffle2_32( s[14], s[15], 0xdd );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 );
|
||||
d[17] = mm128_shuffle2_32( s[18], s[19], 0x88 );
|
||||
d[18] = mm128_shuffle2_32( s[16], s[17], 0xdd );
|
||||
d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd );
|
||||
d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 );
|
||||
d[21] = mm128_shuffle2_32( s[22], s[23], 0x88 );
|
||||
d[22] = mm128_shuffle2_32( s[20], s[21], 0xdd );
|
||||
d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd );
|
||||
d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 );
|
||||
d[25] = mm128_shuffle2_32( s[26], s[27], 0x88 );
|
||||
d[26] = mm128_shuffle2_32( s[24], s[25], 0xdd );
|
||||
d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd );
|
||||
d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 );
|
||||
d[29] = mm128_shuffle2_32( s[30], s[31], 0x88 );
|
||||
d[30] = mm128_shuffle2_32( s[28], s[29], 0xdd );
|
||||
d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd );
|
||||
d[16] = v128_shuffle2_32( s[16], s[17], 0x88 );
|
||||
d[17] = v128_shuffle2_32( s[18], s[19], 0x88 );
|
||||
d[18] = v128_shuffle2_32( s[16], s[17], 0xdd );
|
||||
d[19] = v128_shuffle2_32( s[18], s[19], 0xdd );
|
||||
d[20] = v128_shuffle2_32( s[20], s[21], 0x88 );
|
||||
d[21] = v128_shuffle2_32( s[22], s[23], 0x88 );
|
||||
d[22] = v128_shuffle2_32( s[20], s[21], 0xdd );
|
||||
d[23] = v128_shuffle2_32( s[22], s[23], 0xdd );
|
||||
d[24] = v128_shuffle2_32( s[24], s[25], 0x88 );
|
||||
d[25] = v128_shuffle2_32( s[26], s[27], 0x88 );
|
||||
d[26] = v128_shuffle2_32( s[24], s[25], 0xdd );
|
||||
d[27] = v128_shuffle2_32( s[26], s[27], 0xdd );
|
||||
d[28] = v128_shuffle2_32( s[28], s[29], 0x88 );
|
||||
d[29] = v128_shuffle2_32( s[30], s[31], 0x88 );
|
||||
d[30] = v128_shuffle2_32( s[28], s[29], 0xdd );
|
||||
d[31] = v128_shuffle2_32( s[30], s[31], 0xdd );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[32] = mm128_shuffle2_32( s[32], s[33], 0x88 );
|
||||
d[33] = mm128_shuffle2_32( s[34], s[35], 0x88 );
|
||||
d[34] = mm128_shuffle2_32( s[32], s[33], 0xdd );
|
||||
d[35] = mm128_shuffle2_32( s[34], s[35], 0xdd );
|
||||
d[36] = mm128_shuffle2_32( s[36], s[37], 0x88 );
|
||||
d[37] = mm128_shuffle2_32( s[38], s[39], 0x88 );
|
||||
d[38] = mm128_shuffle2_32( s[36], s[37], 0xdd );
|
||||
d[39] = mm128_shuffle2_32( s[38], s[39], 0xdd );
|
||||
d[40] = mm128_shuffle2_32( s[40], s[41], 0x88 );
|
||||
d[41] = mm128_shuffle2_32( s[42], s[43], 0x88 );
|
||||
d[42] = mm128_shuffle2_32( s[40], s[41], 0xdd );
|
||||
d[43] = mm128_shuffle2_32( s[42], s[43], 0xdd );
|
||||
d[44] = mm128_shuffle2_32( s[44], s[45], 0x88 );
|
||||
d[45] = mm128_shuffle2_32( s[46], s[47], 0x88 );
|
||||
d[46] = mm128_shuffle2_32( s[44], s[45], 0xdd );
|
||||
d[47] = mm128_shuffle2_32( s[46], s[47], 0xdd );
|
||||
d[32] = v128_shuffle2_32( s[32], s[33], 0x88 );
|
||||
d[33] = v128_shuffle2_32( s[34], s[35], 0x88 );
|
||||
d[34] = v128_shuffle2_32( s[32], s[33], 0xdd );
|
||||
d[35] = v128_shuffle2_32( s[34], s[35], 0xdd );
|
||||
d[36] = v128_shuffle2_32( s[36], s[37], 0x88 );
|
||||
d[37] = v128_shuffle2_32( s[38], s[39], 0x88 );
|
||||
d[38] = v128_shuffle2_32( s[36], s[37], 0xdd );
|
||||
d[39] = v128_shuffle2_32( s[38], s[39], 0xdd );
|
||||
d[40] = v128_shuffle2_32( s[40], s[41], 0x88 );
|
||||
d[41] = v128_shuffle2_32( s[42], s[43], 0x88 );
|
||||
d[42] = v128_shuffle2_32( s[40], s[41], 0xdd );
|
||||
d[43] = v128_shuffle2_32( s[42], s[43], 0xdd );
|
||||
d[44] = v128_shuffle2_32( s[44], s[45], 0x88 );
|
||||
d[45] = v128_shuffle2_32( s[46], s[47], 0x88 );
|
||||
d[46] = v128_shuffle2_32( s[44], s[45], 0xdd );
|
||||
d[47] = v128_shuffle2_32( s[46], s[47], 0xdd );
|
||||
|
||||
d[48] = mm128_shuffle2_32( s[48], s[49], 0x88 );
|
||||
d[49] = mm128_shuffle2_32( s[50], s[51], 0x88 );
|
||||
d[50] = mm128_shuffle2_32( s[48], s[49], 0xdd );
|
||||
d[51] = mm128_shuffle2_32( s[50], s[51], 0xdd );
|
||||
d[52] = mm128_shuffle2_32( s[52], s[53], 0x88 );
|
||||
d[53] = mm128_shuffle2_32( s[54], s[55], 0x88 );
|
||||
d[54] = mm128_shuffle2_32( s[52], s[53], 0xdd );
|
||||
d[55] = mm128_shuffle2_32( s[54], s[55], 0xdd );
|
||||
d[56] = mm128_shuffle2_32( s[56], s[57], 0x88 );
|
||||
d[57] = mm128_shuffle2_32( s[58], s[59], 0x88 );
|
||||
d[58] = mm128_shuffle2_32( s[56], s[57], 0xdd );
|
||||
d[59] = mm128_shuffle2_32( s[58], s[59], 0xdd );
|
||||
d[60] = mm128_shuffle2_32( s[60], s[61], 0x88 );
|
||||
d[61] = mm128_shuffle2_32( s[62], s[63], 0x88 );
|
||||
d[62] = mm128_shuffle2_32( s[60], s[61], 0xdd );
|
||||
d[63] = mm128_shuffle2_32( s[62], s[63], 0xdd );
|
||||
d[48] = v128_shuffle2_32( s[48], s[49], 0x88 );
|
||||
d[49] = v128_shuffle2_32( s[50], s[51], 0x88 );
|
||||
d[50] = v128_shuffle2_32( s[48], s[49], 0xdd );
|
||||
d[51] = v128_shuffle2_32( s[50], s[51], 0xdd );
|
||||
d[52] = v128_shuffle2_32( s[52], s[53], 0x88 );
|
||||
d[53] = v128_shuffle2_32( s[54], s[55], 0x88 );
|
||||
d[54] = v128_shuffle2_32( s[52], s[53], 0xdd );
|
||||
d[55] = v128_shuffle2_32( s[54], s[55], 0xdd );
|
||||
d[56] = v128_shuffle2_32( s[56], s[57], 0x88 );
|
||||
d[57] = v128_shuffle2_32( s[58], s[59], 0x88 );
|
||||
d[58] = v128_shuffle2_32( s[56], s[57], 0xdd );
|
||||
d[59] = v128_shuffle2_32( s[58], s[59], 0xdd );
|
||||
d[60] = v128_shuffle2_32( s[60], s[61], 0x88 );
|
||||
d[61] = v128_shuffle2_32( s[62], s[63], 0x88 );
|
||||
d[62] = v128_shuffle2_32( s[60], s[61], 0xdd );
|
||||
d[63] = v128_shuffle2_32( s[62], s[63], 0xdd );
|
||||
|
||||
// if ( bit_len <= 1024 ) return;
|
||||
}
|
||||
@@ -3248,12 +3248,21 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
|
||||
|
||||
// blend 2 vectors while interleaving: { hi[n], lo[n-1], ... hi[1], lo[0] }
|
||||
#if defined(__SSE4_1__)
|
||||
// No SSE2 implementation.
|
||||
|
||||
//#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f )
|
||||
//#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 )
|
||||
#define v128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f )
|
||||
#define v128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 )
|
||||
|
||||
#endif // SSE4_1
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
|
||||
#define v128_intrlv_blend_64( hi, lo ) \
|
||||
v128_blendv( hi, lo, v128_set64( 0ull, 0xffffffffffffffffull ) )
|
||||
|
||||
#define v128_intrlv_blend_32( hi, lo ) \
|
||||
v128_blendv( hi, lo, v128_set64( 0xffffffffull, 0xffffffffull ) )
|
||||
|
||||
#else
|
||||
// unknown, unsupported architecture
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
|
@@ -35,17 +35,17 @@
///////////////////////////////////////////////////////////////////////////////

// New architecturally agnostic syntax:
// All users of 128 bit SIMD should use the new syntax or protect SSE2-only
// code segments.
// Other vector sizes continue with the old syntax for now.
// Definitions here will gradually be converted to the new syntax.
// For consistency the larger vector utilities should do the same.
//
//    __m128i -> v128_t
//    _mm_    -> v128_
//    mm128_  -> v128_
//
// There is also new syntax to accommodate ARM's stricter type checking of
// vector element size. These have no effect on x86_64.

// direct translation of native intrinsics

#define v128_t                         __m128i
// Needed for ARM
#define v128u64_t                      v128_t
#define v128u32_t                      v128_t
#define v128u16_t                      v128_t
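// Added example (not part of the diff): code written with the old names on
// the left becomes the architecture-agnostic form on the right; v128_add32
// and v128_not are defined later in this file.
//
//    __m128i x = _mm_add_epi32( a, b );     // old, x86_64 only
//    v128_t  x = v128_add32( a, b );        // new, x86_64 and AArch64
//
//    y = mm128_not( y );                    // old
//    y = v128_not( y );                     // new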
|
||||
@@ -56,17 +56,15 @@

// Needed for ARM; doesn't do anything special on x86_64.
#define v128_load1_64(p)               _mm_set1_epi64x( *(uint64_t*)(p) )
#define v128_load1_32(p)               _mm_set_epi32( *(uint32_t*)(p) )
#define v128_load1_16(p)               _mm_set_epi16( *(uint16_t*)(p) )
#define v128_load1_8( p)               _mm_set_epi8( *(uint8_t*) (p) )
#define v128_load1_32(p)               _mm_set1_epi32( *(uint32_t*)(p) )
#define v128_load1_16(p)               _mm_set1_epi16( *(uint16_t*)(p) )
#define v128_load1_8( p)               _mm_set1_epi8( *(uint8_t*) (p) )
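// Added example (not part of the diff): the corrected _mm_set1_* forms above
// broadcast one scalar loaded from memory into every lane, matching the ARM
// NEON behaviour. Illustrative use, where k is just a local variable:
//
//    uint32_t k = 0x6a09e667;
//    v128_t bcast = v128_load1_32( &k );   // { k, k, k, k }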

// arithmetic
#define v128_add64                     _mm_add_epi64
#define v128_add32                     _mm_add_epi32
#define v128_add16                     _mm_add_epi16
#define v128_add8                      _mm_add_epi8
#define v128_add4_64                   mm128_add4_64
#define v128_add4_32                   mm128_add4_32

#define v128_sub64                     _mm_sub_epi64
#define v128_sub32                     _mm_sub_epi32

@@ -82,7 +80,7 @@
#define v128_mulw32                    _mm_mul_epu32
#define v128_mulw16                    _mm_mul_epu16

// compare
// signed compare
#define v128_cmpeq64                   _mm_cmpeq_epi64
#define v128_cmpeq32                   _mm_cmpeq_epi32
#define v128_cmpeq16                   _mm_cmpeq_epi16

@@ -120,27 +118,6 @@
#define v128_xor                       _mm_xor_si128
#define v128_xorq                      _mm_xor_si128
#define v128_andnot                    _mm_andnot_si128
#define v128_xnor( a, b )              mm128_not( _mm_xor_si128( a, b ) )
#define v128_ornot( a, b )             mm128_or( a, mm128_not( b ) )

// ternary
#define v128_xorandnot( v2, v1, v0 ) \
   _mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) \
   _mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) )
#define v128_and3( a, b, c )           _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_or3( a, b, c )            _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_xorand( a, b, c )         _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_andxor( a, b, c )         _mm_and_si128( a, _mm_xor_si128( b, c ) )
#define v128_xoror( a, b, c )          _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_orand( a, b, c )          _mm_or_si128( a, _mm_and_si128( b, c ) )

// shift 2 concatenated vectors right
#define v128_alignr64                  mm128_alignr_64
#define v128_alignr32                  mm128_alignr_32
#if defined(__SSSE3__)
#define v128_alignr8                   _mm_alignr_epi8
#endif

// unpack
#define v128_unpacklo64                _mm_unpacklo_epi64
|
||||
@@ -243,24 +220,22 @@ static inline __m128i mm128_mov32_128( const uint32_t n )

// Pseudo constants
#define v128_zero                      _mm_setzero_si128()
#define m128_zero                      _mm_setzero_si128()


#if defined(__SSE4_1__)

// Bitwise AND, return 1 if result is all bits clear.
#define v128_and_eq0                   _mm_testz_si128
#define v128_and_eq0(v1, v0)           _mm_testz_si128(v1, v0)

// v128_is_zero?
static inline int v128_cmpeq0( v128_t v )
{ return v128_and_eq0( v, v ); }

#endif
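// Added example (not part of the diff): v128_and_eq0( a, b ) maps to
// _mm_testz_si128 and returns 1 when (a & b) has no bits set, so
// v128_cmpeq0( v ) tests whether the whole vector is zero:
//
//    v128_t v = v128_zero;
//    int is_zero = v128_cmpeq0( v );   // 1 here, since every bit is clear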

// Bitwise compare, return 1 if all bits set.
#define v128_cmpeq1                    _mm_test_all_ones
#define v128_cmpeq1(v)                 _mm_test_all_ones(v)

#define v128_one                       mm128_mov64_128( 1 )
#define m128_one_128                   v128_one
#define v128_one                       mm128_mov64_128(1)

// Inline asm avoids initializing the return variable just to silence a
// compiler warning. The macro hides the call parentheses so it reads like an
// identifier.
@@ -274,17 +249,14 @@ static inline __m128i v128_neg1_fn()
#endif
   return a;
}
#define m128_neg1_fn                   v128_neg1_fn
#define v128_neg1                      v128_neg1_fn()
#define m128_neg1                      v128_neg1
|
||||
|
||||
//
|
||||
// Vector pointer cast
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns p as pointer to vector type
|
||||
#define castp_m128i(p) ((__m128i*)(p))
|
||||
#define castp_v128 castp_m128i
|
||||
#define castp_v128(p) ((__m128i*)(p))
|
||||
#define castp_v128u64 castp_v128
|
||||
#define castp_v128u32 castp_v128
|
||||
#define castp_v128u16 castp_v128
|
||||
@@ -292,8 +264,7 @@ static inline __m128i v128_neg1_fn()
|
||||
|
||||
// p = any aligned pointer
|
||||
// returns *p, watch your pointer arithmetic
|
||||
#define cast_m128i(p) (*((__m128i*)(p)))
|
||||
#define cast_v128 cast_m128i
|
||||
#define cast_v128(p) (*((__m128i*)(p)))
|
||||
#define cast_v128u64 cast_v128
|
||||
#define cast_v128u32 cast_v128
|
||||
#define cast_v128u16 cast_v128
|
||||
@@ -301,8 +272,8 @@ static inline __m128i v128_neg1_fn()
|
||||
|
||||
// p = any aligned pointer, i = scaled array index
|
||||
// returns value p[i]
|
||||
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
|
||||
#define casti_v128 casti_m128i
|
||||
#define casti_v128(p,i) (((__m128i*)(p))[(i)])
|
||||
#define casti_m128i casti_v128 // deprecated
|
||||
#define casti_v128u64 casti_v128
|
||||
#define casti_v128u32 casti_v128
|
||||
#define casti_v128u16 casti_v128
|
||||
@@ -310,7 +281,7 @@ static inline __m128i v128_neg1_fn()
|
||||
|
||||
// p = any aligned pointer, o = scaled offset
|
||||
// returns pointer p+o
|
||||
#define casto_m128i(p,o) (((__m128i*)(p))+(o))
|
||||
#define casto_v128(p,o) (((__m128i*)(p))+(o))
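For illustration (not part of the commit), the index form simply scales by the vector width, so element i is the i-th 16-byte lane of the buffer. A small usage sketch with a hypothetical buffer, assuming C11 and 16-byte alignment:

#include <emmintrin.h>
#include <stdint.h>

static void demo_casti( void )
{
   _Alignas(16) uint32_t buf[16] = {0};          /* 64 bytes = 4 vectors */
   /* casti_v128( buf, 2 ) expands to ((__m128i*)buf)[2]: the third lane. */
   ((__m128i*)buf)[2] = _mm_set1_epi32( 0xff );  /* writes buf[8..11] */
}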
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
#define v128_get64( v, l ) _mm_extract_epi64( v, l )
|
||||
@@ -325,7 +296,7 @@ static inline __m128i v128_neg1_fn()
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
//
|
||||
// _mm_insert_ps( _mm128i v1, __m128i v2, imm8 c )
|
||||
// _mm_insert_ps( __m128i v1, __m128i v2, imm8 c )
|
||||
//
|
||||
// Fast and powerful but very limited in its application.
|
||||
// It requires SSE4.1 but only works with 128 bit vectors with 32 bit
|
||||
@@ -380,115 +351,112 @@ static inline __m128i v128_neg1_fn()
|
||||
#if defined(__AVX512VL__)
|
||||
//TODO Enable for AVX10_256
|
||||
|
||||
static inline __m128i mm128_not( const __m128i v )
|
||||
static inline __m128i v128_not( const __m128i v )
|
||||
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
|
||||
|
||||
#else
|
||||
|
||||
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
|
||||
#define v128_not( v ) _mm_xor_si128( v, v128_neg1 )
|
||||
|
||||
#endif
|
||||
#define v128_not mm128_not
|
||||
|
||||
static inline __m128i mm128_negate_64( __m128i v )
|
||||
static inline v128u64_t v128_negate_64( v128u64_t v )
|
||||
{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); }
|
||||
#define v128_negate64 mm128_negate_64
|
||||
|
||||
static inline __m128i mm128_negate_32( __m128i v )
|
||||
static inline v128u32_t v128_negate_32( v128u32_t v )
|
||||
{ return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); }
|
||||
#define v128_negate32 mm128_negate_32
|
||||
|
||||
static inline __m128i mm128_negate_16( __m128i v )
|
||||
static inline v128u16_t v128_negate_16( v128u16_t v )
|
||||
{ return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); }
|
||||
#define v128_negate16 mm128_negate_16
|
||||
|
||||
|
||||
// Add 4 values, fewer dependencies than sequential addition.
|
||||
#define mm128_add4_64( a, b, c, d ) \
|
||||
#define v128_add4_64( a, b, c, d ) \
|
||||
_mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )
|
||||
|
||||
#define mm128_add4_32( a, b, c, d ) \
|
||||
#define v128_add4_32( a, b, c, d ) \
|
||||
_mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )
|
||||
#define v128_add4_32 mm128_add4_32
|
||||
|
||||
#define mm128_add4_16( a, b, c, d ) \
|
||||
#define v128_add4_16( a, b, c, d ) \
|
||||
_mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )
|
||||
|
||||
#define mm128_add4_8( a, b, c, d ) \
|
||||
#define v128_add4_8( a, b, c, d ) \
|
||||
_mm_add_epi8( _mm_add_epi8( a, b ), _mm_add_epi8( c, d ) )
|
||||
|
||||
#define mm128_xor4( a, b, c, d ) \
|
||||
#define v128_xor4( a, b, c, d ) \
|
||||
_mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) )
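The pairing matters for latency rather than instruction count: the two inner operations are independent and can execute in parallel, so the dependency chain is two deep instead of three. A scalar sketch of the same idea (illustration only):

#include <stdint.h>

/* Sequential: a+b -> +c -> +d, three dependent additions. */
static inline uint64_t add4_serial( uint64_t a, uint64_t b, uint64_t c, uint64_t d )
{  return ( ( a + b ) + c ) + d;  }

/* Tree: (a+b) and (c+d) are independent, then one final addition. */
static inline uint64_t add4_tree( uint64_t a, uint64_t b, uint64_t c, uint64_t d )
{  return ( a + b ) + ( c + d );  }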
|
||||
|
||||
|
||||
// Memory functions
|
||||
// Mostly for convenience, avoids calculating bytes.
|
||||
// Assumes data is aligned and integral.
|
||||
// n = number of __m128i, bytes/16
|
||||
|
||||
static inline void memset_zero_128( __m128i *dst, const int n )
|
||||
{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; }
|
||||
#define v128_memset_zero memset_zero_128
|
||||
static inline void v128_memset_zero( v128_t *dst, const int n )
|
||||
{ for ( int i = 0; i < n; i++ ) dst[i] = v128_zero; }
|
||||
#define memset_zero_128 v128_memset_zero
|
||||
|
||||
static inline void memset_128( __m128i *dst, const __m128i a, const int n )
|
||||
static inline void v128_memset( v128_t *dst, const v128_t a, const int n )
|
||||
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
|
||||
#define v128_memset memset_128
|
||||
|
||||
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
|
||||
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
|
||||
#define v128_memcpy memcpy_128
|
||||
#define memcpy_128 v128_memcpy
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
//TODO Enable for AVX10_256
|
||||
|
||||
// a ^ b ^ c
|
||||
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
|
||||
#define v128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
|
||||
|
||||
// a & b & c
|
||||
#define mm128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
|
||||
#define v128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
|
||||
|
||||
// a | b | c
|
||||
#define mm128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
|
||||
#define v128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
|
||||
|
||||
// a ^ ( b & c )
|
||||
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
|
||||
#define v128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
|
||||
|
||||
// a & ( b ^ c )
|
||||
#define mm128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
|
||||
#define v128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
|
||||
|
||||
// a ^ ( b | c )
|
||||
#define mm128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
|
||||
#define v128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
|
||||
|
||||
// a ^ ( ~b & c )
|
||||
#define mm128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
|
||||
#define v128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
|
||||
|
||||
// a | ( b & c )
|
||||
#define mm128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
|
||||
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
|
||||
|
||||
// ~( a ^ b ), same as (~a) ^ b
|
||||
#define mm128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
|
||||
#else
|
||||
|
||||
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
|
||||
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
|
||||
|
||||
#define mm128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
|
||||
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
|
||||
|
||||
#define mm128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
|
||||
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
|
||||
|
||||
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
|
||||
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
|
||||
|
||||
#define mm128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
|
||||
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
|
||||
|
||||
#define mm128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
|
||||
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
|
||||
|
||||
#define mm128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
|
||||
#define v128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
|
||||
|
||||
#define mm128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
|
||||
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
|
||||
|
||||
#define mm128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
|
||||
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
|
||||
|
||||
#endif
|
||||
|
||||
#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) )
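The _mm_ternarylogic_epi64 immediates used above are 8-bit truth tables: bit i of the constant holds the result for input combination i = (a<<2)|(b<<1)|c, applied bitwise across the vector. A hedged sketch (not part of the header) of how such a constant can be derived, confirming 0x96 for a^b^c:

#include <stdint.h>

/* Build the 8-bit truth-table immediate for a bitwise ternary function.
   f is evaluated once per input combination, a,b,c each 0 or 1. */
static inline uint8_t ternlog_imm( int (*f)( int a, int b, int c ) )
{
   uint8_t imm = 0;
   for ( int i = 0; i < 8; i++ )
      if ( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) )
         imm |= (uint8_t)( 1 << i );
   return imm;
}

static int xor3_fn( int a, int b, int c ) { return a ^ b ^ c; }
/* ternlog_imm( xor3_fn ) == 0x96, matching the constant used above. */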
|
||||
|
||||
// Mask making
|
||||
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
|
||||
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
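On targets without AVX512 the same MSB mask can plausibly be collected with the floating-point movemask intrinsics by reinterpreting the vector, since they also sample only the sign bit of each 64- or 32-bit element. A minimal sketch, assuming SSE2; the real definitions follow in the next hunk and may differ:

#include <emmintrin.h>

/* 2-bit mask from the MSBs of the two 64-bit elements. */
static inline int demo_movmask_64( __m128i v )
{  return _mm_movemask_pd( _mm_castsi128_pd( v ) );  }

/* 4-bit mask from the MSBs of the four 32-bit elements. */
static inline int demo_movmask_32( __m128i v )
{  return _mm_movemask_ps( _mm_castsi128_ps( v ) );  }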
|
||||
@@ -514,7 +482,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
|
||||
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )
|
||||
|
||||
// These sgould never be callled from application code, use rol/ror.
|
||||
// These should never be called from application code, use rol/ror.
|
||||
#define v128_ror64_sse2( v, c ) \
|
||||
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
|
||||
|
||||
@@ -530,12 +498,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
// AVX512 fastest all rotations.
|
||||
#define mm128_ror_64 _mm_ror_epi64
|
||||
#define mm128_rol_64 _mm_rol_epi64
|
||||
#define mm128_ror_32 _mm_ror_epi32
|
||||
#define mm128_rol_32 _mm_rol_epi32
|
||||
#define v128_ror64 _mm_ror_epi64
|
||||
#define v128_rol64 _mm_rol_epi64
|
||||
#define v128_ror32 _mm_ror_epi32
|
||||
#define v128_rol32 _mm_rol_epi32
|
||||
|
||||
// ror/rol will alway find the fastest but these names may fit better with
|
||||
// ror/rol will always find the fastest but these names may fit better with
|
||||
// application code performing shuffles rather than bit rotations.
|
||||
#define v128_shuflr64_8( v) _mm_ror_epi64( v, 8 )
|
||||
#define v128_shufll64_8( v) _mm_rol_epi64( v, 8 )
|
||||
@@ -549,7 +517,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define v128_shufll32_16(v) _mm_rol_epi32( v, 16 )
|
||||
|
||||
#elif defined(__SSSE3__)
|
||||
// SSE2: fastest 32 bit, very fast 16, fast 8
|
||||
// SSSE3: fastest 32 bit, very fast 16, fast 8
|
||||
|
||||
#define v128_shuflr64_8( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
@@ -575,7 +543,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
0x0e0d0c0f0a09080b, 0x0605040702010003 ) )
|
||||
|
||||
#define mm128_ror_64( v, c ) \
|
||||
#define v128_ror64( v, c ) \
|
||||
( (c) == 8 ) ? v128_shuflr64_8( v ) \
|
||||
: ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
|
||||
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
|
||||
@@ -585,7 +553,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
|
||||
: v128_ror64_sse2( v, c )
|
||||
|
||||
#define mm128_rol_64( v, c ) \
|
||||
#define v128_rol64( v, c ) \
|
||||
( (c) == 8 ) ? v128_shufll64_8( v ) \
|
||||
: ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
|
||||
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
|
||||
@@ -595,13 +563,13 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
|
||||
: v128_rol64_sse2( v, c )
|
||||
|
||||
#define mm128_ror_32( v, c ) \
|
||||
#define v128_ror32( v, c ) \
|
||||
( (c) == 8 ) ? v128_shuflr32_8( v ) \
|
||||
: ( (c) == 16 ) ? v128_lrev16( v ) \
|
||||
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
|
||||
: v128_ror32_sse2( v, c )
|
||||
|
||||
#define mm128_rol_32( v, c ) \
|
||||
#define v128_rol32( v, c ) \
|
||||
( (c) == 8 ) ? v128_shufll32_8( v ) \
|
||||
: ( (c) == 16 ) ? v128_lrev16( v ) \
|
||||
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
|
||||
@@ -610,42 +578,41 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#elif defined(__SSE2__)
|
||||
// SSE2: fastest 32 bit, very fast 16
|
||||
|
||||
#define mm128_ror_64( v, c ) \
|
||||
#define v128_ror64( v, c ) \
|
||||
( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
|
||||
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
|
||||
: ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
|
||||
: v128_ror64_sse2( v, c )
|
||||
|
||||
#define mm128_rol_64( v, c ) \
|
||||
#define v128_rol64( v, c ) \
|
||||
( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
|
||||
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
|
||||
: ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
|
||||
: v128_rol64_sse2( v, c )
|
||||
|
||||
#define mm128_ror_32( v, c ) \
|
||||
#define v128_ror32( v, c ) \
|
||||
( (c) == 16 ) ? v128_lrev16( v ) \
|
||||
: v128_ror32_sse2( v, c )
|
||||
|
||||
#define mm128_rol_32( v, c ) \
|
||||
#define v128_rol32( v, c ) \
|
||||
( (c) == 16 ) ? v128_lrev16( v ) \
|
||||
: v128_rol32_sse2( v, c )
|
||||
|
||||
#else
|
||||
|
||||
#define mm128_ror_64 v128_ror64_sse2
|
||||
#define mm128_rol_64 v128_rol64_sse2
|
||||
#define mm128_ror_32 v128_ror32_sse2
|
||||
#define mm128_rol_32 v128_rol32_sse2
|
||||
#define v128_ror64 v128_ror64_sse2
|
||||
#define v128_rol64 v128_rol64_sse2
|
||||
#define v128_ror32 v128_ror32_sse2
|
||||
#define v128_rol32 v128_rol32_sse2
|
||||
|
||||
#endif
|
||||
|
||||
// Generic names for portable code
|
||||
#define v128_ror64 mm128_ror_64
|
||||
#define v128_rol64 mm128_rol_64
|
||||
#define v128_ror32 mm128_ror_32
|
||||
#define v128_rol32 mm128_rol_32
|
||||
|
||||
//#define v128_ror64 mm128_ror_64
|
||||
//#define v128_rol64 mm128_rol_64
|
||||
//#define v128_ror32 mm128_ror_32
|
||||
#define mm128_rol_32 v128_rol32
|
||||
|
||||
/* not used
|
||||
// x2 rotates elements in 2 individual vectors in a double buffered
|
||||
// optimization for SSE2, does nothing for AVX512 but is there for
|
||||
// transparency.
|
||||
@@ -653,25 +620,25 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#if defined(__AVX512VL__)
|
||||
//TODO Enable for AVX10_256
|
||||
|
||||
#define mm128_rorx2_64( v1, v0, c ) \
|
||||
#define v128_2ror64( v1, v0, c ) \
|
||||
_mm_ror_epi64( v0, c ); \
|
||||
_mm_ror_epi64( v1, c )
|
||||
|
||||
#define mm128_rolx2_64( v1, v0, c ) \
|
||||
#define v128_2rol64( v1, v0, c ) \
|
||||
_mm_rol_epi64( v0, c ); \
|
||||
_mm_rol_epi64( v1, c )
|
||||
|
||||
#define mm128_rorx2_32( v1, v0, c ) \
|
||||
#define v128_2ror32( v1, v0, c ) \
|
||||
_mm_ror_epi32( v0, c ); \
|
||||
_mm_ror_epi32( v1, c )
|
||||
|
||||
#define mm128_rolx2_32( v1, v0, c ) \
|
||||
#define mm128_2rol32( v1, v0, c ) \
|
||||
_mm_rol_epi32( v0, c ); \
|
||||
_mm_rol_epi32( v1, c )
|
||||
|
||||
#else // SSE2
|
||||
|
||||
#define mm128_rorx2_64( v1, v0, c ) \
|
||||
#define v128_2ror64( v1, v0, c ) \
|
||||
{ \
|
||||
__m128i t0 = _mm_srli_epi64( v0, c ); \
|
||||
__m128i t1 = _mm_srli_epi64( v1, c ); \
|
||||
@@ -681,7 +648,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
v1 = _mm_or_si256( v1, t1 ); \
|
||||
}
|
||||
|
||||
#define mm128_rolx2_64( v1, v0, c ) \
|
||||
#define v128_2rol64( v1, v0, c ) \
|
||||
{ \
|
||||
__m128i t0 = _mm_slli_epi64( v0, c ); \
|
||||
__m128i t1 = _mm_slli_epi64( v1, c ); \
|
||||
@@ -691,7 +658,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
v1 = _mm_or_si256( v1, t1 ); \
|
||||
}
|
||||
|
||||
#define mm128_rorx2_32( v1, v0, c ) \
|
||||
#define v128_2ror32( v1, v0, c ) \
|
||||
{ \
|
||||
__m128i t0 = _mm_srli_epi32( v0, c ); \
|
||||
__m128i t1 = _mm_srli_epi32( v1, c ); \
|
||||
@@ -701,7 +668,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
v1 = _mm_or_si256( v1, t1 ); \
|
||||
}
|
||||
|
||||
#define mm128_rolx2_32( v1, v0, c ) \
|
||||
#define v128_2rol32( v1, v0, c ) \
|
||||
{ \
|
||||
__m128i t0 = _mm_slli_epi32( v0, c ); \
|
||||
__m128i t1 = _mm_slli_epi32( v1, c ); \
|
||||
@@ -712,12 +679,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
}
|
||||
|
||||
#endif // AVX512 else SSE2
|
||||
|
||||
#define v128_2ror64 mm128_rorx2_64
|
||||
#define v128_2rol64 mm128_rolx2_64
|
||||
#define v128_2ror32 mm128_rorx2_32
|
||||
#define v128_2rol32 mm128_rolx2_32
|
||||
|
||||
*/
|
||||
|
||||
// Cross lane shuffles
|
||||
|
||||
@@ -756,95 +718,76 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
|
||||
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )
|
||||
|
||||
|
||||
//TODO fix this
|
||||
// alias bswap
|
||||
//#define v128_qrev8(v) _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) )
|
||||
//#define v128_lrev8(v) _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) )
|
||||
//#define v128_wrev8(v) _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 1,0 ) )
|
||||
|
||||
// reverse bits, can it be done?
|
||||
//#define v128_bitrev8( v ) vrbitq_u8
|
||||
|
||||
/* Not used
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
// Rotate right by c bytes, no SSE2 equivalent.
|
||||
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
{ return _mm_alignr_epi8( v, v, c ); }
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
// Endian byte swap.
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
#define mm128_bswap_128( v ) \
|
||||
#define v128_bswap128( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \
|
||||
0x08090a0b0c0d0e0f ) )
|
||||
|
||||
#define mm128_bswap_64( v ) \
|
||||
#define v128_bswap64( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
|
||||
0x0001020304050607 ) )
|
||||
|
||||
#define mm128_bswap_32( v ) \
|
||||
#define v128_bswap32( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
|
||||
0x0405060700010203 ) )
|
||||
#define mm128_bswap_32 v128_bswap32
|
||||
|
||||
#define mm128_bswap_16( v ) \
|
||||
#define v128_bswap16( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
|
||||
0x0607040502030001 ) )
|
||||
|
||||
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
|
||||
#define mm128_block_bswap_64( d, s ) \
|
||||
{ \
|
||||
__m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
|
||||
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
|
||||
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
|
||||
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
|
||||
casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
|
||||
casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
|
||||
casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
|
||||
casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
|
||||
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
|
||||
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
|
||||
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
|
||||
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
|
||||
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
|
||||
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
|
||||
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
|
||||
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
|
||||
}
|
||||
#define mm128_block_bswap64_512 mm128_block_bswap_64
|
||||
#define v128_block_bswap64_512 mm128_block_bswap_64
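A usage sketch (hypothetical function name, assuming SSSE3 and 16-byte aligned buffers): the 512-bit block helper byte-swaps sixteen 64-bit words held in eight vectors, the typical shape of big-endian message or state data for 64-bit hashes.

#include <immintrin.h>
#include <stdint.h>

/* Equivalent of mm128_block_bswap_64( dst, src ) spelled as a loop. */
static void demo_block_bswap64_512( uint64_t dst[16], const uint64_t src[16] )
{
   const __m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 );
   for ( int i = 0; i < 8; i++ )
      ( (__m128i*)dst )[i] = _mm_shuffle_epi8( ( (const __m128i*)src )[i], ctl );
}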
|
||||
|
||||
#define v128_block_bswap64_1024( d, s ) \
|
||||
{ \
|
||||
__m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
|
||||
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
|
||||
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
|
||||
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
|
||||
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
|
||||
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
|
||||
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
|
||||
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
|
||||
casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
|
||||
casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
|
||||
casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
|
||||
casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
|
||||
casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
|
||||
casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
|
||||
casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
|
||||
casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
|
||||
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
|
||||
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
|
||||
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
|
||||
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
|
||||
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
|
||||
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
|
||||
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
|
||||
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
|
||||
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
|
||||
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
|
||||
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
|
||||
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
|
||||
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
|
||||
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
|
||||
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
|
||||
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
|
||||
}
|
||||
|
||||
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
|
||||
#define mm128_block_bswap_32( d, s ) \
|
||||
{ \
|
||||
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
|
||||
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
|
||||
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
|
||||
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
|
||||
casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
|
||||
casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
|
||||
casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
|
||||
casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
|
||||
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
|
||||
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
|
||||
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
|
||||
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
|
||||
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
|
||||
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
|
||||
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
|
||||
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
|
||||
}
|
||||
#define mm128_block_bswap32_256 mm128_block_bswap_32
|
||||
#define v128_block_bswap32_256 mm128_block_bswap_32
|
||||
@@ -852,129 +795,127 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
|
||||
#define mm128_block_bswap32_128( d, s ) \
|
||||
{ \
|
||||
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
|
||||
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
|
||||
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
|
||||
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
|
||||
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
|
||||
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
|
||||
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
|
||||
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
|
||||
}
|
||||
|
||||
#define v128_block_bswap32_512( d, s ) \
|
||||
{ \
|
||||
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
|
||||
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
|
||||
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
|
||||
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
|
||||
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
|
||||
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
|
||||
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
|
||||
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
|
||||
casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
|
||||
casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
|
||||
casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
|
||||
casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
|
||||
casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
|
||||
casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
|
||||
casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
|
||||
casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
|
||||
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
|
||||
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
|
||||
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
|
||||
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
|
||||
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
|
||||
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
|
||||
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
|
||||
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
|
||||
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
|
||||
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
|
||||
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
|
||||
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
|
||||
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
|
||||
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
|
||||
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
|
||||
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
|
||||
}
|
||||
|
||||
#else // SSE2
|
||||
|
||||
static inline __m128i mm128_bswap_64( __m128i v )
|
||||
static inline v128_t v128_bswap64( __m128i v )
|
||||
{
|
||||
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
|
||||
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
|
||||
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
|
||||
}
|
||||
|
||||
static inline __m128i mm128_bswap_32( __m128i v )
|
||||
static inline v128_t v128_bswap32( __m128i v )
|
||||
{
|
||||
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
|
||||
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
|
||||
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
|
||||
}
|
||||
#define mm128_bswap_32 v128_bswap32
|
||||
|
||||
static inline __m128i mm128_bswap_16( __m128i v )
|
||||
static inline v128_t v128_bswap16( __m128i v )
|
||||
{
|
||||
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
|
||||
}
|
||||
|
||||
#define mm128_bswap_128( v ) v128_qrev32( v128_bswap64( v ) )
|
||||
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )
|
||||
|
||||
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
|
||||
{
|
||||
d[0] = mm128_bswap_64( s[0] );
|
||||
d[1] = mm128_bswap_64( s[1] );
|
||||
d[2] = mm128_bswap_64( s[2] );
|
||||
d[3] = mm128_bswap_64( s[3] );
|
||||
d[4] = mm128_bswap_64( s[4] );
|
||||
d[5] = mm128_bswap_64( s[5] );
|
||||
d[6] = mm128_bswap_64( s[6] );
|
||||
d[7] = mm128_bswap_64( s[7] );
|
||||
d[0] = v128_bswap64( s[0] );
|
||||
d[1] = v128_bswap64( s[1] );
|
||||
d[2] = v128_bswap64( s[2] );
|
||||
d[3] = v128_bswap64( s[3] );
|
||||
d[4] = v128_bswap64( s[4] );
|
||||
d[5] = v128_bswap64( s[5] );
|
||||
d[6] = v128_bswap64( s[6] );
|
||||
d[7] = v128_bswap64( s[7] );
|
||||
}
|
||||
#define v128_block_bswap64_512 mm128_block_bswap_64
|
||||
|
||||
static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
|
||||
{
|
||||
d[ 0] = mm128_bswap_64( s[ 0] );
|
||||
d[ 1] = mm128_bswap_64( s[ 1] );
|
||||
d[ 2] = mm128_bswap_64( s[ 2] );
|
||||
d[ 3] = mm128_bswap_64( s[ 3] );
|
||||
d[ 4] = mm128_bswap_64( s[ 4] );
|
||||
d[ 5] = mm128_bswap_64( s[ 5] );
|
||||
d[ 6] = mm128_bswap_64( s[ 6] );
|
||||
d[ 7] = mm128_bswap_64( s[ 7] );
|
||||
d[ 8] = mm128_bswap_64( s[ 8] );
|
||||
d[ 9] = mm128_bswap_64( s[ 9] );
|
||||
d[10] = mm128_bswap_64( s[10] );
|
||||
d[11] = mm128_bswap_64( s[11] );
|
||||
d[12] = mm128_bswap_64( s[12] );
|
||||
d[13] = mm128_bswap_64( s[13] );
|
||||
d[14] = mm128_bswap_64( s[14] );
|
||||
d[15] = mm128_bswap_64( s[15] );
|
||||
d[ 0] = v128_bswap64( s[ 0] );
|
||||
d[ 1] = v128_bswap64( s[ 1] );
|
||||
d[ 2] = v128_bswap64( s[ 2] );
|
||||
d[ 3] = v128_bswap64( s[ 3] );
|
||||
d[ 4] = v128_bswap64( s[ 4] );
|
||||
d[ 5] = v128_bswap64( s[ 5] );
|
||||
d[ 6] = v128_bswap64( s[ 6] );
|
||||
d[ 7] = v128_bswap64( s[ 7] );
|
||||
d[ 8] = v128_bswap64( s[ 8] );
|
||||
d[ 9] = v128_bswap64( s[ 9] );
|
||||
d[10] = v128_bswap64( s[10] );
|
||||
d[11] = v128_bswap64( s[11] );
|
||||
d[12] = v128_bswap64( s[12] );
|
||||
d[13] = v128_bswap64( s[13] );
|
||||
d[14] = v128_bswap64( s[14] );
|
||||
d[15] = v128_bswap64( s[15] );
|
||||
}
|
||||
|
||||
static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
{
|
||||
d[0] = mm128_bswap_32( s[0] );
|
||||
d[1] = mm128_bswap_32( s[1] );
|
||||
d[2] = mm128_bswap_32( s[2] );
|
||||
d[3] = mm128_bswap_32( s[3] );
|
||||
d[4] = mm128_bswap_32( s[4] );
|
||||
d[5] = mm128_bswap_32( s[5] );
|
||||
d[6] = mm128_bswap_32( s[6] );
|
||||
d[7] = mm128_bswap_32( s[7] );
|
||||
d[0] = v128_bswap32( s[0] );
|
||||
d[1] = v128_bswap32( s[1] );
|
||||
d[2] = v128_bswap32( s[2] );
|
||||
d[3] = v128_bswap32( s[3] );
|
||||
d[4] = v128_bswap32( s[4] );
|
||||
d[5] = v128_bswap32( s[5] );
|
||||
d[6] = v128_bswap32( s[6] );
|
||||
d[7] = v128_bswap32( s[7] );
|
||||
}
|
||||
#define mm128_block_bswap32_256 mm128_block_bswap_32
|
||||
#define v128_block_bswap32_256 mm128_block_bswap_32
|
||||
|
||||
static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
|
||||
{
|
||||
d[ 0] = mm128_bswap_32( s[ 0] );
|
||||
d[ 1] = mm128_bswap_32( s[ 1] );
|
||||
d[ 2] = mm128_bswap_32( s[ 2] );
|
||||
d[ 3] = mm128_bswap_32( s[ 3] );
|
||||
d[ 4] = mm128_bswap_32( s[ 4] );
|
||||
d[ 5] = mm128_bswap_32( s[ 5] );
|
||||
d[ 6] = mm128_bswap_32( s[ 6] );
|
||||
d[ 7] = mm128_bswap_32( s[ 7] );
|
||||
d[ 8] = mm128_bswap_32( s[ 8] );
|
||||
d[ 9] = mm128_bswap_32( s[ 9] );
|
||||
d[10] = mm128_bswap_32( s[10] );
|
||||
d[11] = mm128_bswap_32( s[11] );
|
||||
d[12] = mm128_bswap_32( s[12] );
|
||||
d[13] = mm128_bswap_32( s[13] );
|
||||
d[14] = mm128_bswap_32( s[14] );
|
||||
d[15] = mm128_bswap_32( s[15] );
|
||||
d[ 0] = v128_bswap32( s[ 0] );
|
||||
d[ 1] = v128_bswap32( s[ 1] );
|
||||
d[ 2] = v128_bswap32( s[ 2] );
|
||||
d[ 3] = v128_bswap32( s[ 3] );
|
||||
d[ 4] = v128_bswap32( s[ 4] );
|
||||
d[ 5] = v128_bswap32( s[ 5] );
|
||||
d[ 6] = v128_bswap32( s[ 6] );
|
||||
d[ 7] = v128_bswap32( s[ 7] );
|
||||
d[ 8] = v128_bswap32( s[ 8] );
|
||||
d[ 9] = v128_bswap32( s[ 9] );
|
||||
d[10] = v128_bswap32( s[10] );
|
||||
d[11] = v128_bswap32( s[11] );
|
||||
d[12] = v128_bswap32( s[12] );
|
||||
d[13] = v128_bswap32( s[13] );
|
||||
d[14] = v128_bswap32( s[14] );
|
||||
d[15] = v128_bswap32( s[15] );
|
||||
}
|
||||
|
||||
#endif // SSSE3 else SSE2
|
||||
|
||||
#define v128_bswap32 mm128_bswap_32
|
||||
#define v128_bswap64 mm128_bswap_64
|
||||
#define v128_bswap128 mm128_bswap_128
|
||||
#define v128_block_bswap32 mm128_block_bswap_32
|
||||
#define v128_block_bswap64 mm128_block_bswap_64
|
||||
|
||||
@@ -984,24 +925,20 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
|
||||
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
|
||||
#define v128_alignr8 _mm_alignr_epi8
|
||||
#define v128_alignr64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
|
||||
#define v128_alignr32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
|
||||
|
||||
#else
|
||||
|
||||
#define mm128_alignr_64( hi, lo, c ) \
|
||||
#define v128_alignr64( hi, lo, c ) \
|
||||
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
|
||||
|
||||
#define mm128_alignr_32( hi, lo, c ) \
|
||||
#define v128_alignr32( hi, lo, c ) \
|
||||
_mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
|
||||
|
||||
#endif
|
||||
|
||||
// NEON only uses vector mask. x86 blend selects second arg when control bit
// is set. Blendv selects second arg when sign bit is set. And masking is the
// opposite, elements are selected from the first arg if the mask bits are set.
// Arm blend is a bit by bit blend while x86 is an element blend.
// Reverse the logic so mask use is consistent with both formats.
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
#define v128_blendv _mm_blendv_epi8
|
||||
@@ -1009,7 +946,7 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
|
||||
#else
|
||||
|
||||
#define v128_blendv( v1, v0, mask ) \
|
||||
v128_or( v128_andnot( mask, v0 ), v128_and( mask, v1 ) )
|
||||
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
|
||||
|
||||
#endif
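Either way the apparent contract is the same: where a mask element is all ones the result takes that element from v0 (the last data argument), otherwise from v1; the SSE4.1 path additionally samples only the sign bit of each byte. A scalar reference of the select, assuming full all-ones/all-zeros masks as produced by the compare macros:

#include <stdint.h>

/* Per-element select with a full mask: take b where mask is set, else a. */
static inline uint64_t demo_blendv64( uint64_t a, uint64_t b, uint64_t mask )
{  return ( mask & b ) | ( ~mask & a );  }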
|
||||
|
||||
|
@@ -90,7 +90,7 @@ typedef union
|
||||
// code and therefore can't be used as compile time initializers.
|
||||
|
||||
#define m256_zero _mm256_setzero_si256()
|
||||
#define m256_one_128 mm256_bcast_m128( m128_one_128 )
|
||||
#define m256_one_128 mm256_bcast_m128( v128_one )
|
||||
|
||||
static inline __m256i mm256_neg1_fn()
|
||||
{
|
||||
|
@@ -21,36 +21,36 @@
|
||||
//
|
||||
// vornq( v1, v0 ) or( v1, not( v0 ) )
|
||||
|
||||
#define v128_t uint32x4_t // default,
|
||||
#define v128u64_t uint64x2_t
|
||||
#define v128u32_t uint32x4_t
|
||||
#define v128u16_t uint16x8_t
|
||||
#define v128u8_t uint8x16_t
|
||||
#define v128_t uint32x4_t // default,
|
||||
#define v128u64_t uint64x2_t
|
||||
#define v128u32_t uint32x4_t
|
||||
#define v128u16_t uint16x8_t
|
||||
#define v128u8_t uint8x16_t
|
||||
|
||||
// load & store
|
||||
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
|
||||
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
|
||||
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
|
||||
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
|
||||
|
||||
#define v128u64_load( p ) vld1q_u64( (uint64_t*)(p) )
|
||||
#define v128u64_store( p, v ) vst1q_u64( (uint64_t*)(p), v )
|
||||
#define v128u32_load( p ) vld1q_u32( (uint32_t*)(p) )
|
||||
#define v128u32_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
|
||||
#define v128u16_load( p ) vld1q_u16( (uint16_t*)(p) )
|
||||
#define v128u16_store( p, v ) vst1q_u16( (uint16_t*)(p), v )
|
||||
#define v128u8_load( p ) vld1q_u16( (uint8_t*)(p) )
|
||||
#define v128u8_store( p, v ) vst1q_u16( (uint8_t*)(p), v )
|
||||
#define v128u64_load( p ) vld1q_u64( (uint64_t*)(p) )
|
||||
#define v128u64_store( p, v ) vst1q_u64( (uint64_t*)(p), v )
|
||||
#define v128u32_load( p ) vld1q_u32( (uint32_t*)(p) )
|
||||
#define v128u32_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
|
||||
#define v128u16_load( p ) vld1q_u16( (uint16_t*)(p) )
|
||||
#define v128u16_store( p, v ) vst1q_u16( (uint16_t*)(p), v )
|
||||
#define v128u8_load( p ) vld1q_u8( (uint8_t*)(p) )
#define v128u8_store( p, v ) vst1q_u8( (uint8_t*)(p), v )
|
||||
|
||||
// load & set1 combined
|
||||
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
|
||||
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
|
||||
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
|
||||
#define v128_load1_8( p) vld1q_dup_u8( (uint8_t*) (p) )
|
||||
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
|
||||
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
|
||||
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
|
||||
#define v128_load1_8( p) vld1q_dup_u8( (uint8_t*) (p) )
|
||||
|
||||
// arithmetic
|
||||
#define v128_add64 vaddq_u64
|
||||
#define v128_add32 vaddq_u32
|
||||
#define v128_add16 vaddq_u16
|
||||
#define v128_add8 vaddq_u8
|
||||
#define v128_add64 vaddq_u64
|
||||
#define v128_add32 vaddq_u32
|
||||
#define v128_add16 vaddq_u16
|
||||
#define v128_add8 vaddq_u8
|
||||
|
||||
#define v128_add4_64( v3, v2, v1, v0 ) \
|
||||
vaddq_u64( vaddq_u64( v3, v2 ), vaddq_u64( v1, v0 ) )
|
||||
@@ -58,15 +58,15 @@
|
||||
#define v128_add4_32( v3, v2, v1, v0 ) \
|
||||
vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )
|
||||
|
||||
#define v128_sub64 vsubq_u64
|
||||
#define v128_sub32 vsubq_u32
|
||||
#define v128_sub16 vsubq_u16
|
||||
#define v128_sub8 vsubq_u8
|
||||
#define v128_sub64 vsubq_u64
|
||||
#define v128_sub32 vsubq_u32
|
||||
#define v128_sub16 vsubq_u16
|
||||
#define v128_sub8 vsubq_u8
|
||||
|
||||
// returns low half, u64 undocumented, may not exist.
|
||||
#define v128_mul64 vmulq_u64
|
||||
#define v128_mul32 vmulq_u32
|
||||
#define v128_mul16 vmulq_u16
|
||||
#define v128_mul64 vmulq_u64
|
||||
#define v128_mul32 vmulq_u32
|
||||
#define v128_mul16 vmulq_u16
|
||||
|
||||
// slow, tested with argon2d
|
||||
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
|
||||
@@ -76,101 +76,102 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
|
||||
}
|
||||
|
||||
// compare
|
||||
#define v128_cmpeq64 vceqq_u64
|
||||
#define v128_cmpeq32 vceqq_u32
|
||||
#define v128_cmpeq16 vceqq_u16
|
||||
#define v128_cmpeq8 vceqq_u8
|
||||
#define v128_cmpeq64 vceqq_u64
|
||||
#define v128_cmpeq32 vceqq_u32
|
||||
#define v128_cmpeq16 vceqq_u16
|
||||
#define v128_cmpeq8 vceqq_u8
|
||||
|
||||
#define v128_iszero vceqzq_u64
|
||||
// v128_cmp0, v128_cmpz, v128 testz
|
||||
#define v128_iszero vceqzq_u64
|
||||
|
||||
// Not yet needed
|
||||
//#define v128_cmpeq1
|
||||
|
||||
#define v128_cmpgt64 vcgtq_u64
|
||||
#define v128_cmpgt32 vcgtq_u32
|
||||
#define v128_cmpgt16 vcgtq_u16
|
||||
#define v128_cmpgt8 vcgtq_u8
|
||||
#define v128_cmpgt64( v1, v0 ) vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 )
|
||||
#define v128_cmpgt32( v1, v0 ) vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 )
|
||||
#define v128_cmpgt16( v1, v0 ) vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 )
|
||||
#define v128_cmpgt8( v1, v0 ) vcgtq_s8( (int8x16_t)v1, (int8x16_t)v0 )
|
||||
|
||||
#define v128_cmplt64 vcltq_u64
|
||||
#define v128_cmplt32 vcltq_u32
|
||||
#define v128_cmplt16 vcltq_u16
|
||||
#define v128_cmplt8 vcltq_u8
|
||||
#define v128_cmplt64( v1, v0 ) vcltq_s64( (int64x2_t)v1, (int64x2_t)v0 )
|
||||
#define v128_cmplt32( v1, v0 ) vcltq_s32( (int32x4_t)v1, (int32x4_t)v0 )
|
||||
#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 )
|
||||
#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)v0 )
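The casts to signed element types matter because the x86 counterparts (_mm_cmpgt_epi32 and friends) are signed comparisons while the plain vcgtq_u32 family is unsigned, so values with the top bit set would otherwise compare differently. A tiny scalar illustration:

#include <stdint.h>
#include <stdio.h>

int main( void )
{
   uint32_t a = 0x80000000u, b = 1u;
   /* unsigned: a > b; signed: a is negative, so a < b */
   printf( "unsigned: %d  signed: %d\n",
           a > b, (int32_t)a > (int32_t)b );   /* prints "unsigned: 1  signed: 0" */
   return 0;
}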
|
||||
|
||||
// bit shift
|
||||
#define v128_sl64 vshlq_n_u64
|
||||
#define v128_sl32 vshlq_n_u32
|
||||
#define v128_sl16 vshlq_n_u16
|
||||
#define v128_sl8 vshlq_n_u8
|
||||
#define v128_sl64 vshlq_n_u64
|
||||
#define v128_sl32 vshlq_n_u32
|
||||
#define v128_sl16 vshlq_n_u16
|
||||
#define v128_sl8 vshlq_n_u8
|
||||
|
||||
#define v128_sr64 vshrq_n_u64
|
||||
#define v128_sr32 vshrq_n_u32
|
||||
#define v128_sr16 vshrq_n_u16
|
||||
#define v128_sr8 vshrq_n_u8
|
||||
#define v128_sr64 vshrq_n_u64
|
||||
#define v128_sr32 vshrq_n_u32
|
||||
#define v128_sr16 vshrq_n_u16
|
||||
#define v128_sr8 vshrq_n_u8
|
||||
|
||||
// Unit tested, working.
|
||||
#define v128_sra64 vshrq_n_s64
|
||||
#define v128_sra32 vshrq_n_s32
|
||||
#define v128_sra16 vshrq_n_s16
|
||||
#define v128_sra64( v, c ) vshrq_n_s64( (int64x2_t)v, c )
|
||||
#define v128_sra32( v, c ) vshrq_n_s32( (int32x4_t)v, c )
|
||||
#define v128_sra16( v, c ) vshrq_n_s16( (int16x8_t)v, c )
|
||||
|
||||
// unary logic
|
||||
#define v128_not vmvnq_u32
|
||||
#define v128_not vmvnq_u32
|
||||
|
||||
// binary logic
|
||||
#define v128_or vorrq_u32
|
||||
#define v128_and vandq_u32
|
||||
#define v128_xor veorq_u32
|
||||
#define v128_or vorrq_u32
|
||||
#define v128_and vandq_u32
|
||||
#define v128_xor veorq_u32
|
||||
|
||||
// ~v1 & v0
|
||||
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32( v1 ), v0 )
|
||||
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32( v1 ), v0 )
|
||||
|
||||
// ~( a ^ b ), same as (~a) ^ b
|
||||
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
|
||||
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
|
||||
|
||||
// ~v1 | v0, x86_64 convention, first arg is not'ed
|
||||
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
|
||||
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
|
||||
|
||||
// ternary logic
|
||||
|
||||
// v2 ^ v1 ^ v0
|
||||
// veor3q_u32 requires the SHA3 extension, not always defined
|
||||
//#define v128_xor3 veor3q_u32
|
||||
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
|
||||
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
|
||||
|
||||
// v2 & v1 & v0
|
||||
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
|
||||
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
|
||||
|
||||
// v2 | v1 | v0
|
||||
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
|
||||
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
|
||||
|
||||
// a ^ ( ~b & c )
|
||||
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
|
||||
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
|
||||
|
||||
// a ^ ( b & c )
|
||||
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
|
||||
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
|
||||
|
||||
// a & ( b ^ c )
|
||||
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
|
||||
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
|
||||
|
||||
// a ^ ( b | c )
|
||||
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
|
||||
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
|
||||
|
||||
// v2 | ( v1 & v0 )
|
||||
#define v128_orand( v2, v1, v0 ) v128_or( v2, v128_and( v1, v0 ) )
|
||||
#define v128_orand( v2, v1, v0 ) v128_or( v2, v128_and( v1, v0 ) )
|
||||
|
||||
// shift 2 concatenated vectors right.
|
||||
#define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c )
|
||||
#define v128_alignr32( v1, v0, c ) vextq_u32( v0, v1, c )
|
||||
#define v128_alignr8( v1, v0, c ) vextq_u8( v0, v1, c )
|
||||
#define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c )
|
||||
#define v128_alignr32( v1, v0, c ) vextq_u32( v0, v1, c )
|
||||
#define v128_alignr8( v1, v0, c ) vextq_u8( v0, v1, c )
|
||||
|
||||
// Interleave high or low half of 2 vectors.
|
||||
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v1, v0 )
|
||||
#define v128_unpackhi64( v1, v0 ) vzip2q_u64( v1, v0 )
|
||||
#define v128_unpacklo32( v1, v0 ) vzip1q_u32( v1, v0 )
|
||||
#define v128_unpackhi32( v1, v0 ) vzip2q_u32( v1, v0 )
|
||||
#define v128_unpacklo16( v1, v0 ) vzip1q_u16( v1, v0 )
|
||||
#define v128_unpackhi16( v1, v0 ) vzip2q_u16( v1, v0 )
|
||||
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
|
||||
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
|
||||
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v1, v0 )
|
||||
#define v128_unpackhi64( v1, v0 ) vzip2q_u64( v1, v0 )
|
||||
#define v128_unpacklo32( v1, v0 ) vzip1q_u32( v1, v0 )
|
||||
#define v128_unpackhi32( v1, v0 ) vzip2q_u32( v1, v0 )
|
||||
#define v128_unpacklo16( v1, v0 ) vzip1q_u16( v1, v0 )
|
||||
#define v128_unpackhi16( v1, v0 ) vzip2q_u16( v1, v0 )
|
||||
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
|
||||
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
|
||||
|
||||
|
||||
// AES
|
||||
@@ -184,19 +185,19 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
|
||||
#define v128_aesenclast( v, k ) \
|
||||
v128_xor( k, vaeseq_u8( v, v128_zero ) )
|
||||
|
||||
#define v128_aesenclast_nokey( v, k ) \
|
||||
#define v128_aesenclast_nokey( v ) \
|
||||
vaeseq_u8( v, v128_zero )
|
||||
|
||||
#define v128_aesdec( v, k ) \
|
||||
v128_xor( k, vaesimcq_u8( vaesdq_u8( v, v128_zero ) ) )
|
||||
|
||||
#define v128_aesdec_nokey( v, k ) \
|
||||
#define v128_aesdec_nokey( v ) \
|
||||
vaesimcq_u8( vaesdq_u8( v, v128_zero ) )
|
||||
|
||||
#define v128_aesdeclast( v, k ) \
|
||||
v128_xor( k, vaesdq_u8( v, v128_zero ) )
|
||||
|
||||
#define v128_aesdeclast_nokey( v, k ) \
|
||||
#define v128_aesdeclast_nokey( v ) \
|
||||
vaesdq_u8( v, v128_zero )
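Why the zero key and trailing XOR: x86 AESENC/AESENCLAST apply AddRoundKey after the round body, whereas the Arm AESE/AESD instructions XOR the key into the state before SubBytes. Passing v128_zero turns the Arm instruction into a pure round body, and the key is applied afterwards to match the x86 ordering. A sketch of the last-round case, assuming AArch64 with the AES extension and a hypothetical demo_ name:

#include <arm_neon.h>   /* AArch64 with the AES crypto extension */

/* x86 AESENCLAST( v, k ) = SubBytes( ShiftRows( v ) ) ^ k.
   vaeseq_u8( v, key ) computes SubBytes( ShiftRows( v ^ key ) ),
   so pass a zero key and XOR k in afterwards. */
static inline uint8x16_t demo_aesenclast( uint8x16_t v, uint8x16_t k )
{
   return veorq_u8( k, vaeseq_u8( v, vdupq_n_u8( 0 ) ) );
}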
|
||||
|
||||
|
||||
@@ -336,27 +337,27 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
|
||||
|
||||
// Bit rotation
|
||||
#define v128_ror64( v, c ) \
|
||||
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
|
||||
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
|
||||
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
|
||||
|
||||
#define v128_rol64( v, c ) \
|
||||
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
|
||||
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
|
||||
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
|
||||
|
||||
#define v128_ror32( v, c ) \
|
||||
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
|
||||
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
|
||||
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
|
||||
|
||||
#define v128_rol32( v, c ) \
|
||||
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
|
||||
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
|
||||
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
|
||||
|
||||
#define v128_ror16( v, c ) \
|
||||
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
|
||||
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
|
||||
: vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
|
||||
|
||||
#define v128_rol16( v, c ) \
|
||||
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
|
||||
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
|
||||
: vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
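In the general case the rotate is built from a shift-left plus vsri (shift right and insert): the right-shifted copy is inserted into the low bits of the left-shifted copy, fusing the usual two-shifts-plus-OR idiom into two instructions. A scalar reference of what the 64-bit form computes, for 0 < c < 64:

#include <stdint.h>

/* Reference for v128_ror64( v, c ) on one lane, 0 < c < 64. */
static inline uint64_t ror64_ref( uint64_t x, unsigned c )
{  return ( x >> c ) | ( x << ( 64 - c ) );  }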
|
||||
|
||||
#define v128_ror8( v, c ) \
|
||||
@@ -433,6 +434,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
|
||||
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \
|
||||
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] )
|
||||
|
||||
|
||||
// Sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive;
// the shuffle form is preferred where both apply.