Compare commits


4 Commits

Author     SHA1        Message  Date
---------  ----------  -------  --------------------------
Jay D Dee  fc696dbbe5  v23.12   2023-11-20 11:51:57 -05:00
Jay D Dee  f3fde95f27  v23.10   2023-11-15 11:05:41 -05:00
Jay D Dee  0a78013cbe  v23.9    2023-11-12 18:48:50 -05:00
Jay D Dee  26b9429589  v23.8    2023-11-11 16:48:57 -05:00
95 changed files with 4410 additions and 15099 deletions

View File

@@ -79,11 +79,6 @@ cpuminer_SOURCES = \
   algo/hamsi/hamsi-hash-4way.c \
   algo/haval/haval.c \
   algo/haval/haval-hash-4way.c \
-  algo/hodl/aes.c \
-  algo/hodl/hodl-gate.c \
-  algo/hodl/hodl-wolf.c \
-  algo/hodl/sha512_avx.c \
-  algo/hodl/sha512_avx2.c \
   algo/jh/sph_jh.c \
   algo/jh/jh-hash-4way.c \
   algo/jh/jha-gate.c \
@@ -148,6 +143,8 @@ cpuminer_SOURCES = \
   algo/scrypt/scrypt.c \
   algo/scrypt/scrypt-core-4way.c \
   algo/scrypt/neoscrypt.c \
+  algo/sha/sha1.c \
+  algo/sha/sha1-hash.c \
   algo/sha/sha256-hash.c \
   algo/sha/sph_sha2.c \
   algo/sha/sph_sha2big.c \
@@ -278,20 +275,10 @@ cpuminer_SOURCES = \
   algo/yespower/yespower-ref.c \
   algo/yespower/yespower-blake2b-ref.c

 disable_flags =

 if USE_ASM
   cpuminer_SOURCES += asm/neoscrypt_asm.S
-if ARCH_x86
-  cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S
-endif
-if ARCH_x86_64
-  cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S
-endif
-if ARCH_ARM
-  cpuminer_SOURCES += asm/sha2-arm.S asm/scrypt-arm.S
-endif
 else
   disable_flags += -DNOASM
 endif

@@ -301,7 +288,7 @@ if HAVE_WINDOWS
 endif
 cpuminer_LDFLAGS = @LDFLAGS@
-cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lssl -lcrypto -lgmp
+cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
 cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
 cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)

View File

@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
 Requirements
 ------------

-Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
-supported.
+- A x86_64 architecture CPU with a minimum of SSE2 support. This includes
+  Intel Core2 and newer and AMD equivalents.
+- Arm CPU supporting AArch64 and NEON.

 64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
 are not supported. FreeBSD YMMV.

 32 bit CPUs are not supported.

-ARM requirements (Beta):
-
-CPU: Armv8 and NEON, SHA2 & AES are optional
-OS: Linux distribution built for AArch64.
-Packages: source code only.
+Older CPUs are supported by open source cpuminer-multi by TPruvot but at
+reduced performance.
+
+Mining on mobile devices that meet the requirements is not recommended due
+to the risk of overheating and damaging the battery. Mining has unlimited
+demand; it will push any device to or beyond its limits. There is also a
+fire risk with overheated lithium batteries.
+Beware of apps claiming "mobile only mining". There is no such thing; they
+aren't miners. If a mobile CPU can mine it, any CPU can.
+See wiki for details.

@@ -73,6 +75,36 @@ If not what makes it happen or not happen?

 Change Log
 ----------

+v23.12
+
+Several bug fixes and speed improvements for the x16r family for all CPU
+architectures.
+
+v23.11
+
+This is a release candidate for full AArch64 support, marking the end of
+the Beta phase.
+Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
+Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.
+
+v23.10
+
+x86_64: Fixed scrypt, scryptn2 algos SSE2.
+Fixed sha512256d algo AVX2, SSE2, NEON.
+Fixed a bug in Skein N-way that reduced performance.
+ARM: Skein optimized for NEON, SHA2 & SSE2.
+Skein2 algo 2-way optimized for NEON & SSE2.
+
+v23.9
+
+x86_64: fixed minotaurx crash, broken in 23.7.
+ARM: #407 fix compile error due to incorrect type casting for vrev
+instruction argument.
+
+v23.8
+
+Cpuminer-opt is no longer dependent on OpenSSL.
+Removed Hodl algo.
+Removed legacy Sha256 & Scrypt ASM code.
+ARM: Echo AES is working and enabled for x17.

 v23.7

 Fixed blakes2s, broken in v3.23.4.
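
Note on the common thread of these commits: most hunks below replace x86-only _mm_*/mm128_* intrinsics with v128_* wrappers that compile to SSE2 on x86_64 and to NEON on AArch64. A minimal sketch of how such a portability layer can be defined follows; it is illustrative only, and cpuminer-opt's real definitions (simd-utils.h) may differ in detail.

#include <stdint.h>

#if defined(__SSE2__)
  #include <emmintrin.h>
  typedef __m128i v128_t;
  #define v128_xor( a, b )     _mm_xor_si128( a, b )
  #define v128_add32( a, b )   _mm_add_epi32( a, b )
  #define v128_sl32( a, n )    _mm_slli_epi32( a, n )
  #define v128_sr32( a, n )    _mm_srli_epi32( a, n )
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  typedef uint32x4_t v128_t;
  #define v128_xor( a, b )     veorq_u32( a, b )
  #define v128_add32( a, b )   vaddq_u32( a, b )
  #define v128_sl32( a, n )    vshlq_n_u32( a, n )
  #define v128_sr32( a, n )    vshrq_n_u32( a, n )
#endif

// Rotate built from two shifts, as used by the ss0..ss3 macros further down;
// real implementations may use a native rotate where one exists.
#define v128_rol32( a, n ) \
   v128_xor( v128_sl32( a, n ), v128_sr32( a, 32 - (n) ) )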

View File

@@ -310,7 +310,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
   case ALGO_GROESTL:       rc = register_groestl_algo       ( gate ); break;
   case ALGO_HEX:           rc = register_hex_algo           ( gate ); break;
   case ALGO_HMQ1725:       rc = register_hmq1725_algo       ( gate ); break;
-  case ALGO_HODL:          rc = register_hodl_algo          ( gate ); break;
   case ALGO_JHA:           rc = register_jha_algo           ( gate ); break;
   case ALGO_KECCAK:        rc = register_keccak_algo        ( gate ); break;
   case ALGO_KECCAKC:       rc = register_keccakc_algo       ( gate ); break;

View File

@@ -99,7 +99,7 @@ typedef uint32_t set_t;
 #define AES_OPT     1 << 7   // Intel Westmere, AArch64
 #define VAES_OPT    1 << 8   // Icelake, Zen3
 #define SHA_OPT     1 << 9   // Zen1, Icelake, AArch64
-#define SHA512_OPT  1 << 10  // AArch64
+#define SHA512_OPT  1 << 10  // Intel Arrow Lake, AArch64
 #define NEON_OPT    1 << 11  // AArch64
 // AVX10 does not have explicit algo features:
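
These set_t bits advertise optional CPU features an algorithm can exploit. A minimal, hypothetical illustration of how such flags are combined and tested is shown below; the variables algo_features and cpu_features are assumptions for the example, not cpuminer-opt's actual API (the parenthesized redefinitions exist only to make the snippet self-contained).

#include <stdint.h>
#include <stdio.h>

#define AES_OPT    (1 << 7)
#define SHA_OPT    (1 << 9)
#define SHA512_OPT (1 << 10)

int main(void)
{
   uint32_t algo_features = AES_OPT | SHA_OPT;  // what the algo can use
   uint32_t cpu_features  = AES_OPT;            // what the CPU provides

   if ( algo_features & cpu_features & AES_OPT )
      puts( "AES code path usable" );
   if ( !( cpu_features & SHA512_OPT ) )
      puts( "SHA512 extensions absent, taking generic path" );
   return 0;
}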

View File

@@ -429,7 +429,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
 #define BLAKE256_4X32_BLOCK_BSWAP32 \
 { \
    v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
-                                     0x0405060700010203 ); \
+                                     0x0405060700010203 ); \
    M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
    M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
    M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
@@ -931,14 +931,14 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
    const v128_t shuf_bswap32 =
          v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
-   H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
-   H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
-   H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
-   H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
-   H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
-   H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
-   H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
-   H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
+   H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
+   H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
+   H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
+   H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
+   H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
+   H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
+   H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
+   H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );

 #else

View File

@@ -475,11 +475,12 @@ void blake512_update(blake512_context *sc, const void *data, size_t len)
 void blake512_close( blake512_context *sc, void *dst )
 {
    unsigned char buf[128] __attribute__((aligned(32)));
-   size_t ptr;
+   size_t ptr, k;
    unsigned bit_len;
    uint64_t th, tl;

    ptr = sc->ptr;
    memcpy( buf, sc->buf, ptr );
    bit_len = ((unsigned)ptr << 3);
    buf[ptr] = 0x80;
    tl = sc->T0 + bit_len;
@@ -519,7 +520,8 @@ void blake512_close( blake512_context *sc, void *dst )
       blake512_update( sc, buf, 128 );
    }
-   v128_block_bswap64_512( dst, sc->H );
+   for ( k = 0; k < 8; k ++ )
+      ((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
 }
void blake512_full( blake512_context *sc, void *dst, const void *data,
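
The new close routine above byte-swaps each 64-bit state word individually with bswap_64 instead of calling the vector helper v128_block_bswap64_512. For reference, a hedged sketch of a typical bswap_64 definition; cpuminer-opt's actual definition lives in its compat headers and may differ:

#include <stdint.h>

static inline uint64_t bswap64_sketch( uint64_t x )
{
#if defined(__GNUC__) || defined(__clang__)
   return __builtin_bswap64( x );
#else
   x = ( (x & 0x00000000FFFFFFFFULL) << 32 ) |   ( x >> 32 );
   x = ( (x & 0x0000FFFF0000FFFFULL) << 16 ) | ( (x >> 16) & 0x0000FFFF0000FFFFULL );
   return ( (x & 0x00FF00FF00FF00FFULL) << 8 ) | ( (x >>  8) & 0x00FF00FF00FF00FFULL );
#endif
}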

View File

@@ -131,47 +131,7 @@
    V[7] = v128_alignr64( V6, V7, 1 ); \
 }

-/*
-#elif defined(__SSE2__)
-// always true
-
-#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
-{ \
-   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
-                _mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
-   Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
-   Vc = _mm_add_epi64( Vc, Vd ); \
-   Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
-\
-   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
-                _mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
-   Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
-   Vc = _mm_add_epi64( Vc, Vd ); \
-   Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
-}
-
-#define BLAKE2B_ROUND( R ) \
-{ \
-   v128_t *V = (v128_t*)v; \
-   v128_t V2, V3, V6, V7; \
-   const uint8_t *sigmaR = sigma[R]; \
-   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
-   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
-   V2 = mm128_alignr_64( V[3], V[2], 1 ); \
-   V3 = mm128_alignr_64( V[2], V[3], 1 ); \
-   V6 = mm128_alignr_64( V[6], V[7], 1 ); \
-   V7 = mm128_alignr_64( V[7], V[6], 1 ); \
-   BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
-   BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
-   V[2] = mm128_alignr_64( V2, V3, 1 ); \
-   V[3] = mm128_alignr_64( V3, V2, 1 ); \
-   V[6] = mm128_alignr_64( V7, V6, 1 ); \
-   V[7] = mm128_alignr_64( V6, V7, 1 ); \
-}
-*/

 #else
 // never used, SSE2 is always available

 #ifndef ROTR64
 #define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))

View File

@@ -62,78 +62,78 @@ static const uint32_t IV256[] = {
*/
#define ss0(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 3) ), \
_mm_xor_si128( mm128_rol_32( (x), 4), \
mm128_rol_32( (x), 19) ) )
v128_xor( v128_xor( v128_sr32( (x), 1), \
v128_sl32( (x), 3) ), \
v128_xor( v128_rol32( (x), 4), \
v128_rol32( (x), 19) ) )
#define ss1(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm128_rol_32( (x), 8), \
mm128_rol_32( (x), 23) ) )
v128_xor( v128_xor( v128_sr32( (x), 1), \
v128_sl32( (x), 2) ), \
v128_xor( v128_rol32( (x), 8), \
v128_rol32( (x), 23) ) )
#define ss2(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 1) ), \
_mm_xor_si128( mm128_rol_32( (x), 12), \
mm128_rol_32( (x), 25) ) )
v128_xor( v128_xor( v128_sr32( (x), 2), \
v128_sl32( (x), 1) ), \
v128_xor( v128_rol32( (x), 12), \
v128_rol32( (x), 25) ) )
#define ss3(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm128_rol_32( (x), 15), \
mm128_rol_32( (x), 29) ) )
v128_xor( v128_xor( v128_sr32( (x), 2), \
v128_sl32( (x), 2) ), \
v128_xor( v128_rol32( (x), 15), \
v128_rol32( (x), 29) ) )
#define ss4(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
v128_xor( (x), v128_sr32( (x), 1 ) )
#define ss5(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
v128_xor( (x), v128_sr32( (x), 2 ) )
#define rs1(x) mm128_rol_32( x, 3 )
#define rs2(x) mm128_rol_32( x, 7 )
#define rs3(x) mm128_rol_32( x, 13 )
#define rs4(x) mm128_rol_32( x, 16 )
#define rs5(x) mm128_rol_32( x, 19 )
#define rs6(x) mm128_rol_32( x, 23 )
#define rs7(x) mm128_rol_32( x, 27 )
#define rs1(x) v128_rol32( x, 3 )
#define rs2(x) v128_rol32( x, 7 )
#define rs3(x) v128_rol32( x, 13 )
#define rs4(x) v128_rol32( x, 16 )
#define rs5(x) v128_rol32( x, 19 )
#define rs6(x) v128_rol32( x, 23 )
#define rs7(x) v128_rol32( x, 27 )
#define rol_off_32( M, j, off ) \
mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
v128_rol32( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_s( M, H, j ) \
_mm_xor_si128( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
v128_xor( \
v128_add32( \
v128_sub32( v128_add32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
v128_32( ( (j)+16 ) * 0x05555555UL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1s( qt, M, H, i ) \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
v128_add32( v128_add4_32( \
v128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
v128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
v128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
v128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
#define expand2s( qt, M, H, i) \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
v128_add32( v128_add4_32( \
v128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
v128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
v128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
v128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
@@ -141,169 +141,169 @@ static const uint32_t IV256[] = {
// resulting in some sign changes compared to the reference code.
#define Ws0 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_add32( \
v128_add32( \
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_xor( M[10], H[10] ) ), \
v128_add32( v128_xor( M[13], H[13] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws1 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_add32( \
v128_add32( \
v128_sub32( v128_xor( M[ 6], H[ 6] ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_xor( M[11], H[11] ) ), \
v128_sub32( v128_xor( M[14], H[14] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws2 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_add32( \
v128_add32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_xor( M[ 9], H[ 9] ) ), \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws3 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 1], H[ 1] ) ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_sub32( v128_xor( M[10], H[10] ), \
v128_xor( M[13], H[13] ) ) )
#define Ws4 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_add32( \
v128_add32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_xor( M[ 9], H[ 9] ) ), \
v128_add32( v128_xor( M[11], H[11] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws5 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_xor( M[10], H[10] ) ), \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws6 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 4], H[ 4] ), \
v128_xor( M[ 0], H[ 0] ) ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_sub32( v128_xor( M[11], H[11] ), \
v128_xor( M[13], H[13] ) ) )
#define Ws7 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_add32( v128_xor( M[12], H[12] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws8 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_add32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 2], H[ 2] ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[13], H[13] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws9 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws10 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
v128_xor( M[ 1], H[ 1] ) ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws11 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 9], H[ 9] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
v128_xor( M[ 0], H[ 0] ) ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
v128_xor( M[ 9], H[ 9] ) ) )
#define Ws12 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[10], H[10] ) ) )
v128_sub32( \
v128_sub32( \
v128_add32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
v128_xor( M[10], H[10] ) ) )
#define Ws13 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[11], H[11] ) ) )
v128_add32( \
v128_add32( \
v128_add32( v128_xor( M[ 2], H[ 2] ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_add32( v128_xor( M[10], H[10] ), \
v128_xor( M[11], H[11] ) ) )
#define Ws14 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[12], H[12] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_add32( v128_xor( M[11], H[11] ), \
v128_xor( M[12], H[12] ) ) )
#define Ws15 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[ 4], H[4] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[ 4], H[4] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
v128_xor( M[13], H[13] ) ) )
-void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
+void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
 {
-   __m128i qt[32], xl, xh; \
+   v128u64_t qt[32], xl, xh; \
qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
qt[ 2] = v128_add32( ss2( Ws2 ), H[ 3] );
qt[ 3] = v128_add32( ss3( Ws3 ), H[ 4] );
qt[ 4] = v128_add32( ss4( Ws4 ), H[ 5] );
qt[ 5] = v128_add32( ss0( Ws5 ), H[ 6] );
qt[ 6] = v128_add32( ss1( Ws6 ), H[ 7] );
qt[ 7] = v128_add32( ss2( Ws7 ), H[ 8] );
qt[ 8] = v128_add32( ss3( Ws8 ), H[ 9] );
qt[ 9] = v128_add32( ss4( Ws9 ), H[10] );
qt[10] = v128_add32( ss0( Ws10), H[11] );
qt[11] = v128_add32( ss1( Ws11), H[12] );
qt[12] = v128_add32( ss2( Ws12), H[13] );
qt[13] = v128_add32( ss3( Ws13), H[14] );
qt[14] = v128_add32( ss4( Ws14), H[15] );
qt[15] = v128_add32( ss0( Ws15), H[ 0] );
qt[16] = expand1s( qt, M, H, 16 );
qt[17] = expand1s( qt, M, H, 17 );
qt[18] = expand2s( qt, M, H, 18 );
@@ -321,92 +321,92 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
qt[30] = expand2s( qt, M, H, 30 );
qt[31] = expand2s( qt, M, H, 31 );
xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm_xor_si128( xl, _mm_xor_si128(
mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
xl = v128_xor( v128_xor4( qt[16], qt[17], qt[18], qt[19] ),
v128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = v128_xor( xl, v128_xor(
v128_xor4( qt[24], qt[25], qt[26], qt[27] ),
v128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
dH[ 0] = _mm_add_epi32(
_mm_xor_si128( M[0],
_mm_xor_si128( _mm_slli_epi32( xh, 5 ),
_mm_srli_epi32( qt[16], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
dH[ 1] = _mm_add_epi32(
_mm_xor_si128( M[1],
_mm_xor_si128( _mm_srli_epi32( xh, 7 ),
_mm_slli_epi32( qt[17], 8 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
dH[ 2] = _mm_add_epi32(
_mm_xor_si128( M[2],
_mm_xor_si128( _mm_srli_epi32( xh, 5 ),
_mm_slli_epi32( qt[18], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
dH[ 3] = _mm_add_epi32(
_mm_xor_si128( M[3],
_mm_xor_si128( _mm_srli_epi32( xh, 1 ),
_mm_slli_epi32( qt[19], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
dH[ 4] = _mm_add_epi32(
_mm_xor_si128( M[4],
_mm_xor_si128( _mm_srli_epi32( xh, 3 ),
_mm_slli_epi32( qt[20], 0 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
dH[ 5] = _mm_add_epi32(
_mm_xor_si128( M[5],
_mm_xor_si128( _mm_slli_epi32( xh, 6 ),
_mm_srli_epi32( qt[21], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
dH[ 6] = _mm_add_epi32(
_mm_xor_si128( M[6],
_mm_xor_si128( _mm_srli_epi32( xh, 4 ),
_mm_slli_epi32( qt[22], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
dH[ 7] = _mm_add_epi32(
_mm_xor_si128( M[7],
_mm_xor_si128( _mm_srli_epi32( xh, 11 ),
_mm_slli_epi32( qt[23], 2 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[4], 9 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
_mm_xor_si128( _mm_slli_epi32( xl, 8 ),
_mm_xor_si128( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[5], 10 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
_mm_xor_si128( _mm_srli_epi32( xl, 6 ),
_mm_xor_si128( qt[16], qt[ 9] ) ) );
dH[10] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[6], 11 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
_mm_xor_si128( _mm_slli_epi32( xl, 6 ),
_mm_xor_si128( qt[17], qt[10] ) ) );
dH[11] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[7], 12 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
_mm_xor_si128( _mm_slli_epi32( xl, 4 ),
_mm_xor_si128( qt[18], qt[11] ) ) );
dH[12] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[0], 13 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
_mm_xor_si128( _mm_srli_epi32( xl, 3 ),
_mm_xor_si128( qt[19], qt[12] ) ) );
dH[13] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[1], 14 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
_mm_xor_si128( _mm_srli_epi32( xl, 4 ),
_mm_xor_si128( qt[20], qt[13] ) ) );
dH[14] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[2], 15 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
_mm_xor_si128( _mm_srli_epi32( xl, 7 ),
_mm_xor_si128( qt[21], qt[14] ) ) );
dH[15] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[3], 16 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
_mm_xor_si128( _mm_srli_epi32( xl, 2 ),
_mm_xor_si128( qt[22], qt[15] ) ) );
dH[ 0] = v128_add32(
v128_xor( M[0],
v128_xor( v128_sl32( xh, 5 ),
v128_sr32( qt[16], 5 ) ) ),
v128_xor( v128_xor( xl, qt[24] ), qt[ 0] ));
dH[ 1] = v128_add32(
v128_xor( M[1],
v128_xor( v128_sr32( xh, 7 ),
v128_sl32( qt[17], 8 ) ) ),
v128_xor( v128_xor( xl, qt[25] ), qt[ 1] ));
dH[ 2] = v128_add32(
v128_xor( M[2],
v128_xor( v128_sr32( xh, 5 ),
v128_sl32( qt[18], 5 ) ) ),
v128_xor( v128_xor( xl, qt[26] ), qt[ 2] ));
dH[ 3] = v128_add32(
v128_xor( M[3],
v128_xor( v128_sr32( xh, 1 ),
v128_sl32( qt[19], 5 ) ) ),
v128_xor( v128_xor( xl, qt[27] ), qt[ 3] ));
dH[ 4] = v128_add32(
v128_xor( M[4],
v128_xor( v128_sr32( xh, 3 ),
v128_sl32( qt[20], 0 ) ) ),
v128_xor( v128_xor( xl, qt[28] ), qt[ 4] ));
dH[ 5] = v128_add32(
v128_xor( M[5],
v128_xor( v128_sl32( xh, 6 ),
v128_sr32( qt[21], 6 ) ) ),
v128_xor( v128_xor( xl, qt[29] ), qt[ 5] ));
dH[ 6] = v128_add32(
v128_xor( M[6],
v128_xor( v128_sr32( xh, 4 ),
v128_sl32( qt[22], 6 ) ) ),
v128_xor( v128_xor( xl, qt[30] ), qt[ 6] ));
dH[ 7] = v128_add32(
v128_xor( M[7],
v128_xor( v128_sr32( xh, 11 ),
v128_sl32( qt[23], 2 ) ) ),
v128_xor( v128_xor( xl, qt[31] ), qt[ 7] ));
dH[ 8] = v128_add32( v128_add32(
v128_rol32( dH[4], 9 ),
v128_xor( v128_xor( xh, qt[24] ), M[ 8] )),
v128_xor( v128_sl32( xl, 8 ),
v128_xor( qt[23], qt[ 8] ) ) );
dH[ 9] = v128_add32( v128_add32(
v128_rol32( dH[5], 10 ),
v128_xor( v128_xor( xh, qt[25] ), M[ 9] )),
v128_xor( v128_sr32( xl, 6 ),
v128_xor( qt[16], qt[ 9] ) ) );
dH[10] = v128_add32( v128_add32(
v128_rol32( dH[6], 11 ),
v128_xor( v128_xor( xh, qt[26] ), M[10] )),
v128_xor( v128_sl32( xl, 6 ),
v128_xor( qt[17], qt[10] ) ) );
dH[11] = v128_add32( v128_add32(
v128_rol32( dH[7], 12 ),
v128_xor( v128_xor( xh, qt[27] ), M[11] )),
v128_xor( v128_sl32( xl, 4 ),
v128_xor( qt[18], qt[11] ) ) );
dH[12] = v128_add32( v128_add32(
v128_rol32( dH[0], 13 ),
v128_xor( v128_xor( xh, qt[28] ), M[12] )),
v128_xor( v128_sr32( xl, 3 ),
v128_xor( qt[19], qt[12] ) ) );
dH[13] = v128_add32( v128_add32(
v128_rol32( dH[1], 14 ),
v128_xor( v128_xor( xh, qt[29] ), M[13] )),
v128_xor( v128_sr32( xl, 4 ),
v128_xor( qt[20], qt[13] ) ) );
dH[14] = v128_add32( v128_add32(
v128_rol32( dH[2], 15 ),
v128_xor( v128_xor( xh, qt[30] ), M[14] )),
v128_xor( v128_sr32( xl, 7 ),
v128_xor( qt[21], qt[14] ) ) );
dH[15] = v128_add32( v128_add32(
v128_rol32( dH[3], 16 ),
v128_xor( v128_xor( xh, qt[31] ), M[15] )),
v128_xor( v128_sr32( xl, 2 ),
v128_xor( qt[22], qt[15] ) ) );
}
static const uint32_t final_s[16][4] =
@@ -429,7 +429,7 @@ static const uint32_t final_s[16][4] =
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
/*
-static const __m128i final_s[16] =
+static const v128u64_t final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
@@ -451,26 +451,26 @@ static const __m128i final_s[16] =
*/
void bmw256_4way_init( bmw256_4way_context *ctx )
{
ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
ctx->H[ 0] = v128_64( 0x4041424340414243 );
ctx->H[ 1] = v128_64( 0x4445464744454647 );
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = v128_64( 0x5051525350515253 );
ctx->H[ 5] = v128_64( 0x5455565754555657 );
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = v128_64( 0x6061626360616263 );
ctx->H[ 9] = v128_64( 0x6465666764656667 );
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = v128_64( 0x7071727370717273 );
ctx->H[13] = v128_64( 0x7475767774757677 );
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
// for ( int i = 0; i < 16; i++ )
// sc->H[i] = _mm_set1_epi32( iv[i] );
// sc->H[i] = v128_32( iv[i] );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -478,10 +478,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
__m128i *vdata = (__m128i*)data;
__m128i *buf;
__m128i htmp[16];
__m128i *h1, *h2;
v128u64_t *vdata = (v128u64_t*)data;
v128u64_t *buf;
v128u64_t htmp[16];
v128u64_t *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
@@ -497,13 +497,13 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
v128_memcpy( buf + (ptr>>2), vdata, clen >> 2 );
vdata += ( clen >> 2 );
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m128i *ht;
v128u64_t *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
@@ -513,46 +513,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
}
sc->ptr = ptr;
if ( h1 != sc->H )
memcpy_128( sc->H, h1, 16 );
v128_memcpy( sc->H, h1, 16 );
}
static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
__m128i *buf;
__m128i h1[16], h2[16], *h;
v128u64_t *buf;
v128u64_t h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
ptr += 4;
h = sc->H;
// assume bit_count fits in 32 bits
if ( ptr > buf_size - 4 )
{
memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
v128_memset_zero( buf + (ptr>>2), (buf_size - ptr) >> 2 );
compress_small( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = m128_zero;
v128_memset_zero( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = v128_32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = v128_zero;
compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, (__m128i*)final_s, h1 );
compress_small( buf, (v128u64_t*)final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_m128i( dst, u ) = h1[v];
casti_v128( dst, u ) = h1[v];
}
/*
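
For context: the 4-way code above runs four independent BMW-256 streams, one per 32-bit lane of each 128-bit vector, which is why lengths and pointers are scaled per lane ("bytes of one lane"). A self-contained sketch of the lane-interleaving idea, with an illustrative helper name (the repo's own interleave utilities differ):

#include <emmintrin.h>  // SSE2
#include <stddef.h>
#include <stdint.h>

// Pack word i of four scalar streams into the four 32-bit lanes of dst[i].
static void intrlv_4x32_sketch( __m128i *dst,
                                const uint32_t *s0, const uint32_t *s1,
                                const uint32_t *s2, const uint32_t *s3,
                                size_t nwords )
{
   for ( size_t i = 0; i < nwords; i++ )
      dst[i] = _mm_set_epi32( s3[i], s2[i], s1[i], s0[i] );
}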

View File

@@ -2,12 +2,11 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
-//#include "sph_keccak.h"
 #include "bmw-hash-4way.h"

 #if defined(BMW512_8WAY)

-void bmw512hash_8way(void *state, const void *input)
+void bmw512hash_8way( void *state, const void *input )
 {
    bmw512_8way_context ctx;
    bmw512_8way_init( &ctx );
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
-   __m512i *noncev = (__m512i*)vdata + 9;   // aligned
+   __m512i *noncev = (__m512i*)vdata + 9;
    const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;
+   const int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
-         if ( fulltest( lane_hash, ptarget ) )
+         if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
 #elif defined(BMW512_4WAY)

-//#ifdef BMW512_4WAY
-
-void bmw512hash_4way(void *state, const void *input)
+void bmw512hash_4way( void *state, const void *input )
 {
    bmw512_4way_context ctx;
    bmw512_4way_init( &ctx );
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 4;
-   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
+   const uint32_t last_nonce = max_nonce - 4;
+   __m256i *noncev = (__m256i*)vdata + 9;
    const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
-         if ( fulltest( lane_hash, ptarget ) )
+         if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
return 0;
}
+#elif defined(BMW512_2WAY)
+
+void bmw512hash_2x64( void *state, const void *input )
+{
+   bmw512_2x64_context ctx;
+   bmw512_2x64_init( &ctx );
+   bmw512_2x64_update( &ctx, input, 80 );
+   bmw512_2x64_close( &ctx, state );
+}
+
+int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
+   uint32_t hash[16*2] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[13]);   // 3*4+1
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 2;
+   v128_t *noncev = (v128_t*)vdata + 9;
+   const uint32_t Htarg = ptarget[7];
+   const int thr_id = mythr->id;
+
+   v128_bswap32_intrlv80_2x64( vdata, pdata );
+   do {
+      *noncev = v128_intrlv_blend_32( v128_bswap32(
+                                      v128_set32( n+1, 0, n, 0 ) ), *noncev );
+      bmw512hash_2x64( hash, vdata );
+      for ( int lane = 0; lane < 2; lane++ )
+         if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
+         {
+            extr_lane_2x64( lane_hash, hash, lane, 256 );
+            if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
+            {
+               pdata[19] = n + lane;
+               submit_solution( work, lane_hash, mythr );
+            }
+         }
+      n += 2;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
 #endif

View File

@@ -2,7 +2,7 @@
 bool register_bmw512_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
   opt_target_factor = 256.0;
 #if defined (BMW512_8WAY)
   gate->scanhash = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
 #elif defined (BMW512_4WAY)
   gate->scanhash = (void*)&scanhash_bmw512_4way;
   gate->hash = (void*)&bmw512hash_4way;
+#elif defined (BMW512_2WAY)
+  gate->scanhash = (void*)&scanhash_bmw512_2x64;
+  gate->hash = (void*)&bmw512hash_2x64;
 #else
   gate->scanhash = (void*)&scanhash_bmw512;
   gate->hash = (void*)&bmw512hash;

View File

@@ -8,19 +8,27 @@
   #define BMW512_8WAY 1
 #elif defined(__AVX2__)
   #define BMW512_4WAY 1
+#elif defined(__SSE2__) || defined(__ARM_NEON)
+  #define BMW512_2WAY 1
 #endif

 #if defined(BMW512_8WAY)

 void bmw512hash_8way( void *state, const void *input );
 int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done, struct thr_info *mythr );
+                          uint64_t *hashes_done, struct thr_info *mythr );

 #elif defined(BMW512_4WAY)

 void bmw512hash_4way( void *state, const void *input );
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done, struct thr_info *mythr );
+                          uint64_t *hashes_done, struct thr_info *mythr );

+#elif defined(BMW512_2WAY)
+
+void bmw512hash_2x64( void *state, const void *input );
+int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+
 #else
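
The header above selects the widest implementation the build targets. Restated compactly for orientation (illustrative only; the real code keys off the same predefined compiler macros shown in the hunk):

#if defined(__AVX512F__)            // stand-in for the repo's AVX-512 test
  #define BMW512_WAYS 8
#elif defined(__AVX2__)
  #define BMW512_WAYS 4
#elif defined(__SSE2__) || defined(__ARM_NEON)
  #define BMW512_WAYS 2
#else
  #define BMW512_WAYS 1             // scalar fallback
#endif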

View File

@@ -21,112 +21,92 @@
#include "hash_api.h"
#include "simd-utils.h"
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
const uint32_t const1[] __attribute__ ((aligned (32))) =
{ 0x00000001, 0x00000000, 0x00000000, 0x00000000 };
const uint32_t mul2mask[] __attribute__ ((aligned (16))) =
{ 0x00001b00, 0x00000000, 0x00000000, 0x00000000 };
const uint32_t lsbmask[] __attribute__ ((aligned (16))) =
{ 0x01010101, 0x01010101, 0x01010101, 0x01010101 };
const uint32_t invshiftrows[] __attribute__ ((aligned (16))) =
{ 0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c };
#define ECHO_SUBBYTES4( state, j ) \
state[0][j] = v128_aesenc( state[0][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[1][j] = v128_aesenc( state[1][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[2][j] = v128_aesenc( state[2][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[3][j] = v128_aesenc( state[3][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[0][j] = v128_aesenc_nokey( state[0][j] ); \
state[1][j] = v128_aesenc_nokey( state[1][j] ); \
state[2][j] = v128_aesenc_nokey( state[2][j] ); \
state[3][j] = v128_aesenc_nokey( state[3][j] )
MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101};
MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
#define ECHO_SUBBYTES( state, i, j ) \
state[i][j] = v128_aesenc( state[i][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[i][j] = v128_aesenc_nokey( state[i][j] )
#define ECHO_SUBBYTES4(state, j) \
state[0][j] = v128_aesenc(state[0][j], k1);\
k1 = v128_add32(k1, cast_v128(const1));\
state[1][j] = v128_aesenc(state[1][j], k1);\
k1 = v128_add32(k1, cast_v128(const1));\
state[2][j] = v128_aesenc(state[2][j], k1);\
k1 = v128_add32(k1, cast_v128(const1));\
state[3][j] = v128_aesenc(state[3][j], k1);\
k1 = v128_add32(k1, cast_v128(const1));\
state[0][j] = v128_aesenc(state[0][j], v128_zero ); \
state[1][j] = v128_aesenc(state[1][j], v128_zero ); \
state[2][j] = v128_aesenc(state[2][j], v128_zero ); \
state[3][j] = v128_aesenc(state[3][j], v128_zero )
#define ECHO_SUBBYTES(state, i, j) \
state[i][j] = v128_aesenc(state[i][j], k1);\
k1 = v128_add32(k1, cast_v128(const1));\
state[i][j] = v128_aesenc(state[i][j], cast_v128(zero))
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
s2 = v128_add8(state1[0][j], state1[0][j]);\
t1 = v128_sr16(state1[0][j], 7);\
t1 = v128_and(t1, cast_v128(lsbmask));\
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
s2 = v128_xor(s2, t2);\
state2[0][j] = s2;\
state2[1][j] = state1[0][j];\
state2[2][j] = state1[0][j];\
state2[3][j] = v128_xor(s2, state1[0][j]);\
s2 = v128_add8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
t1 = v128_sr16(state1[1][(j + 1) & 3], 7);\
t1 = v128_and(t1, cast_v128(lsbmask));\
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
s2 = v128_xor(s2, t2);\
state2[0][j] = v128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
state2[1][j] = v128_xor(state2[1][j], s2);\
state2[2][j] = v128_xor(state2[2][j], state1[1][(j + 1) & 3]);\
state2[3][j] = v128_xor(state2[3][j], state1[1][(j + 1) & 3]);\
s2 = v128_add8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
t1 = v128_sr16(state1[2][(j + 2) & 3], 7);\
t1 = v128_and(t1, cast_v128(lsbmask));\
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
s2 = v128_xor(s2, t2);\
state2[0][j] = v128_xor(state2[0][j], state1[2][(j + 2) & 3]);\
state2[1][j] = v128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
state2[2][j] = v128_xor(state2[2][j], s2);\
state2[3][j] = v128_xor(state2[3][j], state1[2][(j + 2) & 3]);\
s2 = v128_add8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
t1 = v128_sr16(state1[3][(j + 3) & 3], 7);\
t1 = v128_and(t1, cast_v128(lsbmask));\
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
s2 = v128_xor(s2, t2);\
state2[0][j] = v128_xor(state2[0][j], state1[3][(j + 3) & 3]);\
state2[1][j] = v128_xor(state2[1][j], state1[3][(j + 3) & 3]);\
state2[2][j] = v128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
state2[3][j] = v128_xor(state2[3][j], s2)
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) \
s2 = v128_add8( state1[0][j], state1[0][j] ); \
t1 = v128_sr16( state1[0][j], 7 ); \
t1 = v128_and( t1, cast_v128(lsbmask) ); \
t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
s2 = v128_xor( s2, t2 ); \
state2[0][j] = s2; \
state2[1][j] = state1[0][j]; \
state2[2][j] = state1[0][j]; \
state2[3][j] = v128_xor(s2, state1[0][j] ); \
s2 = v128_add8( state1[1][(j + 1) & 3], state1[1][(j + 1) & 3] ); \
t1 = v128_sr16( state1[1][(j + 1) & 3], 7 ); \
t1 = v128_and( t1, cast_v128(lsbmask) ); \
t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
s2 = v128_xor( s2, t2 ); \
state2[0][j] = v128_xor3( state2[0][j], s2, state1[1][(j + 1) & 3] );\
state2[1][j] = v128_xor( state2[1][j], s2 ); \
state2[2][j] = v128_xor( state2[2][j], state1[1][(j + 1) & 3] ); \
state2[3][j] = v128_xor( state2[3][j], state1[1][(j + 1) & 3] ); \
s2 = v128_add8( state1[2][(j + 2) & 3], state1[2][(j + 2) & 3] ); \
t1 = v128_sr16( state1[2][(j + 2) & 3], 7 ); \
t1 = v128_and( t1, cast_v128(lsbmask) ); \
t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
s2 = v128_xor( s2, t2 ); \
state2[0][j] = v128_xor( state2[0][j], state1[2][(j + 2) & 3] ); \
state2[1][j] = v128_xor3( state2[1][j], s2, state1[2][(j + 2) & 3] ); \
state2[2][j] = v128_xor( state2[2][j], s2 ); \
state2[3][j] = v128_xor( state2[3][j], state1[2][(j + 2) & 3] ); \
s2 = v128_add8( state1[3][(j + 3) & 3], state1[3][(j + 3) & 3] ); \
t1 = v128_sr16( state1[3][(j + 3) & 3], 7 ); \
t1 = v128_and( t1, cast_v128(lsbmask) ); \
t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
s2 = v128_xor( s2, t2 ); \
state2[0][j] = v128_xor( state2[0][j], state1[3][(j + 3) & 3] ); \
state2[1][j] = v128_xor( state2[1][j], state1[3][(j + 3) & 3] ); \
state2[2][j] = v128_xor3( state2[2][j], s2, state1[3][(j + 3) & 3] ); \
state2[3][j] = v128_xor( state2[3][j], s2 )
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES4(_state, 0);\
ECHO_SUBBYTES4(_state, 1);\
ECHO_SUBBYTES4(_state, 2);\
ECHO_SUBBYTES4(_state, 3);\
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES4(_state2, 0);\
ECHO_SUBBYTES4(_state2, 1);\
ECHO_SUBBYTES4(_state2, 2);\
ECHO_SUBBYTES4(_state2, 3);\
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
{ \
ECHO_SUBBYTES4( _state, 0 ); \
ECHO_SUBBYTES4( _state, 1 ); \
ECHO_SUBBYTES4( _state, 2 ); \
ECHO_SUBBYTES4( _state, 3 ); \
ECHO_MIXBYTES( _state, _state2, 0, t1, t2, s2 ); \
ECHO_MIXBYTES( _state, _state2, 1, t1, t2, s2 ); \
ECHO_MIXBYTES( _state, _state2, 2, t1, t2, s2 ); \
ECHO_MIXBYTES( _state, _state2, 3, t1, t2, s2 ); \
ECHO_SUBBYTES4( _state2, 0 ); \
ECHO_SUBBYTES4( _state2, 1 ); \
ECHO_SUBBYTES4( _state2, 2 ); \
ECHO_SUBBYTES4( _state2, 3 ); \
ECHO_MIXBYTES( _state2, _state, 0, t1, t2, s2 ); \
ECHO_MIXBYTES( _state2, _state, 1, t1, t2, s2 ); \
ECHO_MIXBYTES( _state2, _state, 2, t1, t2, s2 ); \
ECHO_MIXBYTES( _state2, _state, 3, t1, t2, s2 ); \
}
/*
#define ECHO_ROUND_UNROLL2 \
@@ -256,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
}
-HashReturn init_echo(hashState_echo *ctx, int nHashSize)
+HashReturn init_echo( hashState_echo *ctx, int nHashSize )
{
int i, j;
@@ -300,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
return SUCCESS;
}
-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
+HashReturn update_echo( hashState_echo *state, const void *data,
+                        uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -350,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
return SUCCESS;
}
-HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
+HashReturn final_echo( hashState_echo *state, void *hashval)
{
v128_t remainingbits;
@@ -427,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
return SUCCESS;
}
-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
-                              const BitSequence *data, DataLength databitlen )
+HashReturn update_final_echo( hashState_echo *state, void *hashval,
+                              const void *data, uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -550,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
return SUCCESS;
}
-HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
-                      int nHashSize, const BitSequence *data, DataLength datalen )
+HashReturn echo_full( hashState_echo *state, void *hashval,
+                      int nHashSize, const void *data, uint32_t datalen )
{
int i, j;
@@ -598,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
{
// Fill the buffer
       memcpy( state->buffer + state->uBufferBytes,
-              (void*)data, state->uBlockLength - state->uBufferBytes );
+              data, state->uBlockLength - state->uBufferBytes );
// Process buffer
Compress( state, state->buffer, 1 );
@@ -621,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
if( uRemainingBytes > 0 )
-      memcpy(state->buffer, (void*)data, uRemainingBytes);
+      memcpy(state->buffer, data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
@@ -709,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
 }

+#if 0
 HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
 {
    HashReturn hRet;
@@ -766,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
    return SUCCESS;
 }
+#endif

 #endif
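
One detail worth noting in this file: the rewritten ECHO_SUBBYTES4 replaces v128_aesenc( x, v128_zero ) with v128_aesenc_nokey( x ). On x86 an unkeyed AES round can be expressed as an AESENC against an all-zero key, as in this sketch; this is an assumption about the helper's shape, not the repo's definition, and the NEON version necessarily differs because vaeseq_u8 XORs the key before the S-box:

#include <wmmintrin.h>  // x86 AES-NI; compile with -maes

static inline __m128i aesenc_nokey_sketch( __m128i x )
{
   return _mm_aesenc_si128( x, _mm_setzero_si128() );
}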

View File

@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);
 HashReturn reinit_echo(hashState_echo *state);

-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
+HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);

-HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
+HashReturn final_echo(hashState_echo *state, void *hashval);

-HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
+HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);

-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
-                              const BitSequence *data, DataLength databitlen );
-HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
-                      int nHashSize, const BitSequence *data, DataLength databitlen );
+HashReturn update_final_echo( hashState_echo *state, void *hashval,
+                              const void *data, uint32_t databitlen );
+HashReturn echo_full( hashState_echo *state, void *hashval,
+                      int nHashSize, const void *data, uint32_t databitlen );

 #endif // HASH_API_H

View File

@@ -36,7 +36,6 @@
 #include "sph_echo.h"

-#if !defined(__AES__)

 #ifdef __cplusplus
 extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #ifdef __cplusplus
 }
 #endif
-#endif // !AES

View File

@@ -36,8 +36,6 @@
 #ifndef SPH_ECHO_H__
 #define SPH_ECHO_H__

-#if !defined(__AES__)

 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
 #ifdef __cplusplus
 }
 #endif
-#endif // !AES
 #endif

View File

@@ -146,7 +146,7 @@ MYALIGN const unsigned int _IV512[] = {

 #define SUBSTITUTE(r0, _t2 )\
    _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
-   _t2 = _mm_aesenclast_si128( _t2, m128_zero )
+   _t2 = _mm_aesenclast_si128( _t2, v128_zero )

 #define SUPERMIX(t0, t1, t2, t3, t4)\
    t2 = t0;\
@@ -162,16 +162,16 @@ MYALIGN const unsigned int _IV512[] = {
    t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
    t4 = _mm_xor_si128(t4, t1);\
    t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
-   t2 = mm128_xor3(t2, t3, t0 );\
+   t2 = v128_xor3(t2, t3, t0 );\
    t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
-   t4 = mm128_xor3( t4, t1, t2 ); \
+   t4 = v128_xor3( t4, t1, t2 ); \
    t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
    t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
    t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
    t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
-   t4 = mm128_xor3( t4, t2, t1 ); \
+   t4 = v128_xor3( t4, t2, t1 ); \
    t0 = _mm_xor_si128(t0, t3);\
-   t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
+   t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));

 /*
 #define SUPERMIX(t0, t1, t2, t3, t4)\
@@ -188,7 +188,7 @@ MYALIGN const unsigned int _IV512[] = {
    t4 = _mm_xor_si128(t4, t1);\
    t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
    t4 = _mm_xor_si128(t4, t1);\
-   t2 = mm128_xor3(t2, t3, t0 );\
+   t2 = v128_xor3(t2, t3, t0 );\
    t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
    t4 = _mm_xor_si128(t4, t2);\
    t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
@@ -485,7 +485,7 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
    ctx->uBlockLength = 4;

    for(i = 0; i < 6; i++)
-      ctx->state[i] = m128_zero;
+      ctx->state[i] = v128_zero;

    ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
    ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);

View File

@@ -61,9 +61,45 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
 #if defined(__ARM_NEON)

 // No fast shuffle on NEON
-static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
+//static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
+static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };

-#define gr_shuffle32( v )   v128_shufflev32( v, vmask_d8 )
+#define gr_shuffle32( v )   v128_blendv( v128_qrev32( v ), v, BLEND_MASK )

+/*
+#define TRANSP_MASK \
+   0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2
+#define SUBSH_MASK0 \
+   0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8
+#define SUBSH_MASK1 \
+   0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9
+#define SUBSH_MASK2 \
+   0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa
+#define SUBSH_MASK3 \
+   0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb
+#define SUBSH_MASK4 \
+   0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc
+#define SUBSH_MASK5 \
+   0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd
+#define SUBSH_MASK6 \
+   0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe
+#define SUBSH_MASK7 \
+   0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3
+
+//#define gr_shuffle8( v, c )   v128_shullfev8( v, c )
+
+#define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \
+                     c07, c06, c05, c04, c03, c02, c01, c00 ) \
+   v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
+   v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
+   v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
+   v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
+   v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \
+      11, v, c11 ), 10, v, c10 ),  9, v, c09 ),  8, v, c08 ), \
+       7, v, c07 ),  6, v, c06 ),  5, v, c05 ),  4, v, c04 ), \
+       3, v, c03 ),  2, v, c02 ),  1, v, c01 ),  0, v, c00 )
+*/

 #else

View File

@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 #define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
    /* AddRoundConstant */\
-   b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
+   b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \
    a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
    a1 = _mm256_xor_si256( a1, b1 );\
    a2 = _mm256_xor_si256( a2, b1 );\

View File

@@ -35,8 +35,6 @@
 #include "sph_groestl.h"

-#if !defined(__AES__)

 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #ifdef __cplusplus
 }
-#endif // !AES
 #endif

View File

@@ -42,7 +42,6 @@ extern "C"{
 #include <stddef.h>
 #include "compat/sph_types.h"

-#if !defined(__AES__)

 /**
  * Output size (in bits) for Groestl-224.
  */
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
 }
 #endif

-#endif // !AES
 #endif

View File

@@ -35,7 +35,7 @@
 #include <stdio.h>
 #include "hamsi-hash-4way.h"

-static const uint32_t HAMSI_IV512[] =
+static const uint32_t HAMSI_IV512[] __attribute__ ((aligned (32))) =
 {
    0x73746565, 0x6c706172, 0x6b204172, 0x656e6265,
    0x72672031, 0x302c2062, 0x75732032, 0x3434362c,
@@ -43,7 +43,8 @@ static const uint32_t HAMSI_IV512[] =
    0x65766572, 0x6c65652c, 0x2042656c, 0x6769756d
 };

-static const uint32_t alpha_n[] = {
+static const uint32_t alpha_n[] __attribute__ ((aligned (32))) =
+{
    0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa,
    0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00,
    0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0,
@@ -54,7 +55,8 @@ static const uint32_t alpha_n[] = {
    0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0
 };

-static const uint32_t alpha_f[] = {
+static const uint32_t alpha_f[] __attribute__ ((aligned (32))) =
+{
    0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0,
    0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9,
    0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c,
@@ -69,7 +71,8 @@ static const uint32_t alpha_f[] = {

 /* Note: this table lists bits within each byte from least
    significant to most significant. */
-static const uint32_t T512[64][16] = {
+static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
+{
    { 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000,
      0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a,
      0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000,
@@ -2260,4 +2263,4 @@ void hamsi512_2x64( void *dst, const void *data, size_t len )
    hamsi512_2x64_close( &sc, dst );
 }

-#endif // SSE4.1 or NEON
+#endif // SSE4.2 or NEON

View File

@@ -38,7 +38,7 @@
#include <stddef.h>
#include "simd-utils.h"
// SSE4.2 or NEON Hamsi-512 2x64
#if defined(__SSE4_2__) || defined(__ARM_NEON)
typedef struct
{
@@ -57,6 +57,8 @@ void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
size_t len );
void hamsi512_2x64( void *dst, const void *data, size_t len );
#endif
#if defined (__AVX2__)
// Hamsi-512 4x64

View File

@@ -1,183 +0,0 @@
#include <stdint.h>
#include "miner.h"
#if defined(__AES__)
#include <x86intrin.h>
#include "wolf-aes.h"
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
__m128i tmp4;
*tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
tmp4 = _mm_slli_si128(*tmp1, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
*tmp1 = _mm_xor_si128(*tmp1, *tmp2);
}
static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
{
__m128i tmp2, tmp4;
tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
tmp4 = _mm_slli_si128(*tmp3, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
*tmp3 = _mm_xor_si128(*tmp3, tmp2);
}
// Special thanks to Intel for helping me
// with ExpandAESKey256() and its subroutines
void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
{
__m128i tmp1, tmp2, tmp3;
tmp1 = keys[0] = KeyBuf[0];
tmp3 = keys[1] = KeyBuf[1];
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[2] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[3] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[4] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[5] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[6] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[7] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[8] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[9] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[10] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[11] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[12] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[13] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[14] = tmp1;
}
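/* Usage sketch (hypothetical buffers): a 256-bit key, held as two
   __m128i words, expands into the 15 round keys AES-256 needs:
   __m128i key[2];   // e.g. the last 32 bytes of a cache slice
   __m128i rk[15];
   ExpandAESKey256( rk, key );
*/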
#if defined(__SSE4_2__)
//#ifdef __AVX__
#define AESENC(i,j) \
State[j] = _mm_aesenc_si128(State[j], ExpandedKey[j][i]);
#define AESENC_N(i) \
AESENC(i,0) \
AESENC(i,1) \
AESENC(i,2) \
AESENC(i,3) \
AESENC(i,4) \
AESENC(i,5) \
AESENC(i,6) \
AESENC(i,7)
static inline void AES256Core(__m128i* State, __m128i ExpandedKey[][16])
{
const uint32_t N = AES_PARALLEL_N;
for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128(State[j], ExpandedKey[j][0]);
}
AESENC_N(1)
AESENC_N(2)
AESENC_N(3)
AESENC_N(4)
AESENC_N(5)
AESENC_N(6)
AESENC_N(7)
AESENC_N(8)
AESENC_N(9)
AESENC_N(10)
AESENC_N(11)
AESENC_N(12)
AESENC_N(13)
for(int j=0; j<N; ++j) {
State[j] = _mm_aesenclast_si128(State[j], ExpandedKey[j][14]);
}
}
void AES256CBC(__m128i** data, const __m128i** next, __m128i ExpandedKey[][16], __m128i* IV)
{
const uint32_t N = AES_PARALLEL_N;
__m128i State[N];
for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128( _mm_xor_si128(data[j][0], next[j][0]), IV[j]);
}
AES256Core(State, ExpandedKey);
for(int j=0; j<N; ++j) {
data[j][0] = State[j];
}
for(int i = 1; i < BLOCK_COUNT; ++i) {
for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128( _mm_xor_si128(data[j][i], next[j][i]), data[j][i - 1]);
}
AES256Core(State, ExpandedKey);
for(int j=0; j<N; ++j) {
data[j][i] = State[j];
}
}
}
#else // NO AVX
static inline __m128i AES256Core(__m128i State, const __m128i *ExpandedKey)
{
State = _mm_xor_si128(State, ExpandedKey[0]);
for(int i = 1; i < 14; ++i) State = _mm_aesenc_si128(State, ExpandedKey[i]);
return(_mm_aesenclast_si128(State, ExpandedKey[14]));
}
void AES256CBC(__m128i *Ciphertext, const __m128i *Plaintext, const __m128i *ExpandedKey, __m128i IV, uint32_t BlockCount)
{
__m128i State = _mm_xor_si128(Plaintext[0], IV);
State = AES256Core(State, ExpandedKey);
Ciphertext[0] = State;
for(int i = 1; i < BlockCount; ++i)
{
State = _mm_xor_si128(Plaintext[i], Ciphertext[i - 1]);
State = AES256Core(State, ExpandedKey);
Ciphertext[i] = State;
}
}
#endif
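/* Both variants implement standard CBC: block i is encrypted as
   E( P[i] ^ C[i-1] ) with C[-1] = IV; the parallel variant additionally
   XORs in the matching block of the "next" slice before encryption. */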
#endif

View File

@@ -1,75 +0,0 @@
#ifndef HODL_BYTESWAP_H
#define HODL_BYTESWAP_H 1
#define __bswap_constant_16(x) \
((unsigned short int) ((((x) >> 8) & 0xff) | (((x) & 0xff) << 8)))
static __inline unsigned short int
__bswap_16 (unsigned short int __bsx)
{
return __bswap_constant_16 (__bsx);
}
// LE
# define htobe16(x) __bswap_16 (x)
# define htole16(x) (x)
# define be16toh(x) __bswap_16 (x)
# define le16toh(x) (x)
// BE
//# define htole16(x) __bswap_16 (x)
//# define htobe16(x) (x)
//# define le16toh(x) __bswap_16 (x)
//# define be16toh(x) (x)
#define __bswap_constant_32(x) \
((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \
(((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24))
static __inline unsigned int
__bswap_32 (unsigned int __bsx)
{
return __builtin_bswap32 (__bsx);
}
// LE
# define htobe32(x) __bswap_32 (x)
# define htole32(x) (x)
# define be32toh(x) __bswap_32 (x)
# define le32toh(x) (x)
// BE
//# define htole32(x) __bswap_32 (x)
//# define htobe32(x) (x)
//# define le32toh(x) __bswap_32 (x)
//# define be32toh(x) (x)
# define __bswap_constant_64(x) \
((((x) & 0xff00000000000000ull) >> 56) \
| (((x) & 0x00ff000000000000ull) >> 40) \
| (((x) & 0x0000ff0000000000ull) >> 24) \
| (((x) & 0x000000ff00000000ull) >> 8) \
| (((x) & 0x00000000ff000000ull) << 8) \
| (((x) & 0x0000000000ff0000ull) << 24) \
| (((x) & 0x000000000000ff00ull) << 40) \
| (((x) & 0x00000000000000ffull) << 56))
static __inline uint64_t
__bswap_64 (uint64_t __bsx)
{
return __bswap_constant_64 (__bsx);
}
// LE
# define htobe64(x) __bswap_64 (x)
# define htole64(x) (x)
# define be64toh(x) __bswap_64 (x)
# define le64toh(x) (x)
// BE
//# define htole64(x) __bswap_64 (x)
//# define htobe64(x) (x)
//# define le64toh(x) __bswap_64 (x)
//# define be64toh(x) (x)
#endif
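/* Usage sketch: these macros mirror <endian.h> on a little-endian host.
   uint32_t wire = htobe32( 0x11223344 );  // stored as bytes 11 22 33 44
   uint32_t host = be32toh( wire );        // back to 0x11223344
*/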

View File

@@ -1,185 +0,0 @@
#include <memory.h>
//#include <mm_malloc.h>
#include <stdlib.h>
#include "hodl-gate.h"
#include "hodl-wolf.h"
#define HODL_NSTARTLOC_INDEX 20
#define HODL_NFINALCALC_INDEX 21
static struct work hodl_work;
pthread_barrier_t hodl_barrier;
// All references to this buffer are local to this file, so no args
// need to be passed.
unsigned char *hodl_scratchbuf = NULL;
void hodl_le_build_stratum_request( char* req, struct work* work,
struct stratum_ctx *sctx )
{
uint32_t ntime, nonce, nstartloc, nfinalcalc;
char ntimestr[9], noncestr[9], nstartlocstr[9], nfinalcalcstr[9];
unsigned char *xnonce2str;
le32enc( &ntime, work->data[ algo_gate.ntime_index ] );
le32enc( &nonce, work->data[ algo_gate.nonce_index ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len );
le32enc( &nstartloc, work->data[ HODL_NSTARTLOC_INDEX ] );
le32enc( &nfinalcalc, work->data[ HODL_NFINALCALC_INDEX ] );
bin2hex( nstartlocstr, (char*)(&nstartloc), sizeof(uint32_t) );
bin2hex( nfinalcalcstr, (char*)(&nfinalcalc), sizeof(uint32_t) );
sprintf( req, "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr,
nstartlocstr, nfinalcalcstr );
free( xnonce2str );
}
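/* Example of the request this builds (all values hypothetical):
   {"method": "mining.submit", "params": ["user", "job1", "02000000",
   "5e9a1c60", "0000270f", "00001388", "000003e8"], "id":4}
   i.e. worker, job id, extranonce2, then ntime, nonce, nstartloc and
   nfinalcalc as hex-encoded little-endian 32-bit words. */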
char* hodl_malloc_txs_request( struct work *work )
{
char* req;
json_t *val;
char data_str[2 * sizeof(work->data) + 1];
int i;
for ( i = 0; i < ARRAY_SIZE(work->data); i++ )
be32enc( work->data + i, work->data[i] );
bin2hex( data_str, (unsigned char *)work->data, 88 );
if ( work->workid )
{
char *params;
val = json_object();
json_object_set_new( val, "workid", json_string( work->workid ) );
params = json_dumps( val, 0 );
json_decref( val );
req = malloc( 128 + 2*88 + strlen( work->txs ) + strlen( params ) );
sprintf( req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":1}\r\n",
data_str, work->txs, params);
free( params );
}
else
{
req = malloc( 128 + 2*88 + strlen(work->txs));
sprintf( req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":1}\r\n",
data_str, work->txs);
}
return req;
}
void hodl_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_tree,
uint32_t ntime, uint32_t nbits )
{
int i;
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = version;
if ( have_stratum )
for ( i = 0; i < 8; i++ )
g_work->data[ 1+i ] = le32dec( prevhash + i );
else
for (i = 0; i < 8; i++)
g_work->data[ 8-i ] = le32dec( prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[ 9+i ] = be32dec( merkle_tree + i );
g_work->data[ algo_gate.ntime_index ] = ntime;
g_work->data[ algo_gate.nbits_index ] = nbits;
g_work->data[22] = 0x80000000;
g_work->data[31] = 0x00000280;
}
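/* Resulting layout (word indices into g_work->data): [0] version,
   [1..8] previous block hash, [9..16] merkle root, ntime and nbits at
   the gate's indices, HODL's nstartloc/nfinalcalc in words 20/21 (see
   the defines above), and padding constants in words 22 and 31. */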
// called only by thread 0, saves a backup of g_work
void hodl_get_new_work( struct work* work, struct work* g_work)
{
// pthread_rwlock_rdlock( &g_work_lock );
work_free( &hodl_work );
work_copy( &hodl_work, g_work );
hodl_work.data[ algo_gate.nonce_index ] = ( clock() + rand() ) % 9999;
// pthread_rwlock_unlock( &g_work_lock );
}
json_t *hodl_longpoll_rpc_call( CURL *curl, int *err, char* lp_url )
{
json_t *val;
char *req = NULL;
if ( have_gbt )
{
req = malloc( strlen( gbt_lp_req ) + strlen( lp_id ) + 1 );
sprintf( req, gbt_lp_req, lp_id );
}
val = json_rpc_call( curl, lp_url, rpc_userpass,
req ? req : getwork_req, err, JSON_RPC_LONGPOLL );
free( req );
return val;
}
// called by every thread, copies the backup to each thread's work.
void hodl_resync_threads( int thr_id, struct work* work )
{
int nonce_index = algo_gate.nonce_index;
pthread_barrier_wait( &hodl_barrier );
if ( memcmp( work->data, hodl_work.data, algo_gate.work_cmp_size ) )
{
work_free( work );
work_copy( work, &hodl_work );
}
work->data[ nonce_index ] = swab32( hodl_work.data[ nonce_index ] );
work_restart[thr_id].restart = 0;
}
bool hodl_do_this_thread( int thr_id )
{
return ( thr_id == 0 );
}
int hodl_scanhash( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
#if defined(__AES__)
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, mythr->id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( work, max_nonce, hashes_done, mythr );
#endif
return false;
}
bool register_hodl_algo( algo_gate_t* gate )
{
#if !defined(__AES__)
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif
if ( GARBAGE_SIZE % opt_n_threads )
applog( LOG_WARNING,"WARNING: Thread count must be power of 2. Miner may crash or produce invalid hash!" );
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = SSE42_OPT | AES_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;
gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
gate->malloc_txs_request = (void*)&hodl_malloc_txs_request;
gate->build_block_header = (void*)&hodl_build_block_header;
gate->resync_threads = (void*)&hodl_resync_threads;
gate->do_this_thread = (void*)&hodl_do_this_thread;
gate->work_cmp_size = 76;
hodl_scratchbuf = (unsigned char*)mm_malloc( 1 << 30, 64 );
allow_getwork = false;
opt_target_factor = 8388608.0;
return ( hodl_scratchbuf != NULL );
}

View File

@@ -1,6 +0,0 @@
#include "algo-gate-api.h"
extern unsigned char *hodl_scratchbuf;
bool register_hodl_algo ( algo_gate_t* gate );

View File

@@ -1,225 +0,0 @@
#include <string.h>
#include <openssl/evp.h>
#include <openssl/sha.h>
#include "simd-utils.h"
#include "sha512-avx.h"
#include "wolf-aes.h"
#include "hodl-gate.h"
#include "hodl-wolf.h"
#include "miner.h"
#include "algo/sha/sha256d.h"
#if defined(__AES__)
void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
void *MidHash )
{
const int Chunk = TOTAL_CHUNKS / ThreadCount;
const uint32_t StartChunk = ThreadID * Chunk;
const uint32_t EndChunk = StartChunk + Chunk;
#if defined(__SSE4_2__)
//#ifdef __AVX__
uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
uint64_t* destination[ SHA512_PARALLEL_N ];
for ( int i=0; i < SHA512_PARALLEL_N; ++i )
{
TempBufs[i] = (uint64_t*)malloc( 32 );
memcpy( TempBufs[i], MidHash, 32 );
}
for ( uint32_t i = StartChunk; i < EndChunk; i += SHA512_PARALLEL_N )
{
for ( int j = 0; j < SHA512_PARALLEL_N; ++j )
{
( (uint32_t*)TempBufs[j] )[0] = i + j;
destination[j] = (uint64_t*)( (uint8_t *)Garbage + ( (i+j)
* GARBAGE_CHUNK_SIZE ) );
}
sha512Compute32b_parallel( TempBufs, destination );
}
for ( int i = 0; i < SHA512_PARALLEL_N; ++i )
free( TempBufs[i] );
#else
uint32_t TempBuf[8];
memcpy( TempBuf, MidHash, 32 );
for ( uint32_t i = StartChunk; i < EndChunk; ++i )
{
TempBuf[0] = i;
SHA512( ( uint8_t *)TempBuf, 32,
( (uint8_t *)Garbage ) + ( i * GARBAGE_CHUNK_SIZE ) );
}
#endif
}
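/* Net effect: every 64-byte chunk i of the 1 GiB scratch buffer is
   SHA-512( MidHash with its first 32-bit word replaced by i ), with the
   TOTAL_CHUNKS indices split evenly across the worker threads. */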
/*
void Rev256(uint32_t *Dest, const uint32_t *Src)
{
for(int i = 0; i < 8; ++i) Dest[i] = swab32(Src[i]);
}
*/
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
#if defined(__SSE4_2__)
//#ifdef __AVX__
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int threadNumber = mythr->id;
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
CacheEntry Cache[AES_PARALLEL_N] __attribute__ ((aligned (64)));
__m128i* data[AES_PARALLEL_N];
const __m128i* next[AES_PARALLEL_N];
uint32_t CollisionCount = 0;
for ( int n=0; n<AES_PARALLEL_N; ++n )
{
data[n] = Cache[n].dqwords;
}
// Search for pattern in pseudorandom data
int searchNumber = COMPARE_SIZE / opt_n_threads;
int startLoc = threadNumber * searchNumber;
for ( int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k += AES_PARALLEL_N )
{
// copy data to first l2 cache
for ( int n=0; n<AES_PARALLEL_N; ++n )
{
memcpy(Cache[n].dwords, Garbage + k + n, GARBAGE_SLICE_SIZE);
}
for(int j = 0; j < AES_ITERATIONS; ++j)
{
__m128i ExpKey[AES_PARALLEL_N][16];
__m128i ivs[AES_PARALLEL_N];
// use last 4 bytes of first cache as next location
for(int n=0; n<AES_PARALLEL_N; ++n) {
uint32_t nextLocation = Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 1] & (COMPARE_SIZE - 1); //% COMPARE_SIZE;
next[n] = Garbage[nextLocation].dqwords;
__m128i last[2];
last[0] = _mm_xor_si128(Cache[n].dqwords[254], next[n][254]);
last[1] = _mm_xor_si128(Cache[n].dqwords[255], next[n][255]);
// Key is last 32b of Cache
// IV is last 16b of Cache
ExpandAESKey256(ExpKey[n], last);
ivs[n] = last[1];
}
AES256CBC(data, next, ExpKey, ivs);
}
for(int n=0; n<AES_PARALLEL_N; ++n)
if((Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 1] & (COMPARE_SIZE - 1)) < 1000)
{
uint32_t BlockHdr[22], FinalPoW[8];
swab32_array( BlockHdr, pdata, 20 );
BlockHdr[20] = k + n;
BlockHdr[21] = Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 2];
sha256d( (uint8_t *)FinalPoW, (uint8_t *)BlockHdr, 88 );
CollisionCount++;
if( FinalPoW[7] <= ptarget[7] )
{
pdata[20] = swab32( BlockHdr[20] );
pdata[21] = swab32( BlockHdr[21] );
*hashes_done = CollisionCount;
submit_solution( work, FinalPoW, mythr );
return(0);
}
}
}
*hashes_done = CollisionCount;
return(0);
#else // no AVX
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t BlockHdr[22], FinalPoW[8];
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
CacheEntry Cache;
uint32_t CollisionCount = 0;
int threadNumber = mythr->id;
swab32_array( BlockHdr, pdata, 20 );
// Search for pattern in pseudorandom data
int searchNumber = COMPARE_SIZE / opt_n_threads;
int startLoc = threadNumber * searchNumber;
if ( opt_debug )
applog( LOG_DEBUG,"Hash target= %08lx", ptarget[7] );
for(int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k++)
{
// copy data to first l2 cache
memcpy(Cache.dwords, Garbage + k, GARBAGE_SLICE_SIZE);
for(int j = 0; j < AES_ITERATIONS; j++)
{
CacheEntry TmpXOR;
__m128i ExpKey[16];
// use last 4 bytes of first cache as next location
uint32_t nextLocation = Cache.dwords[(GARBAGE_SLICE_SIZE >> 2)
- 1] & (COMPARE_SIZE - 1); //% COMPARE_SIZE;
// Copy data from indicated location to second l2 cache -
memcpy(&TmpXOR, Garbage + nextLocation, GARBAGE_SLICE_SIZE);
//XOR location data into second cache
for( int i = 0; i < (GARBAGE_SLICE_SIZE >> 4); ++i )
TmpXOR.dqwords[i] = _mm_xor_si128( Cache.dqwords[i],
TmpXOR.dqwords[i] );
// Key is last 32b of TmpXOR
// IV is last 16b of TmpXOR
ExpandAESKey256( ExpKey, TmpXOR.dqwords +
(GARBAGE_SLICE_SIZE / sizeof(__m128i)) - 2 );
AES256CBC( Cache.dqwords, TmpXOR.dqwords, ExpKey,
TmpXOR.dqwords[ (GARBAGE_SLICE_SIZE / sizeof(__m128i))
- 1 ], 256 );
}
// use last X bits as solution
if( ( Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 1 ]
& (COMPARE_SIZE - 1) ) < 1000 )
{
BlockHdr[20] = k;
BlockHdr[21] = Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 2 ];
sha256d( (uint8_t *)FinalPoW, (uint8_t *)BlockHdr, 88 );
CollisionCount++;
if( FinalPoW[7] <= ptarget[7] )
{
pdata[20] = swab32( BlockHdr[20] );
pdata[21] = swab32( BlockHdr[21] );
*hashes_done = CollisionCount;
submit_solution( work, FinalPoW, mythr );
return(0);
}
}
}
*hashes_done = CollisionCount;
return(0);
#endif // AVX else
}
void GenRandomGarbage(CacheEntry *Garbage, uint32_t *pdata, int thr_id)
{
uint32_t BlockHdr[20], MidHash[8];
swab32_array( BlockHdr, pdata, 20 );
sha256d((uint8_t *)MidHash, (uint8_t *)BlockHdr, 80);
GenerateGarbageCore(Garbage, thr_id, opt_n_threads, MidHash);
}
#endif // AES

View File

@@ -1,27 +0,0 @@
#ifndef __HODL_H
#define __HODL_H
#include <stdint.h>
#include "simd-utils.h"
#include "miner.h"
#define AES_ITERATIONS 15
#define GARBAGE_SIZE (1 << 30)
#define GARBAGE_CHUNK_SIZE (1 << 6)
#define GARBAGE_SLICE_SIZE (1 << 12)
#define TOTAL_CHUNKS (1 << 24) // GARBAGE_SIZE / GARBAGE_CHUNK_SIZE
#define COMPARE_SIZE (1 << 18) // GARBAGE_SIZE / GARBAGE_SLICE_SIZE
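/* Worked check of the derived sizes: TOTAL_CHUNKS = (1<<30) / (1<<6) =
   1<<24 chunks and COMPARE_SIZE = (1<<30) / (1<<12) = 1<<18 slices,
   matching the comments above. */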
typedef union _CacheEntry
{
uint32_t dwords[GARBAGE_SLICE_SIZE >> 2] __attribute__((aligned(16)));
v128_t dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16)));
} CacheEntry;
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void GenRandomGarbage( CacheEntry *Garbage, uint32_t *pdata, int thr_id);
#endif // __HODL_H

View File

@@ -1,208 +0,0 @@
.TH MINERD 1 "March 2016" "cpuminer 2.4.3"
.SH NAME
hodlminer \- CPU miner for Hodlcoin
.SH SYNOPSIS
.B hodlminer
[\fIOPTION\fR]...
.SH DESCRIPTION
.B hodlminer
is a multi-threaded CPU miner for Hodlcoin.
It supports the getwork and getblocktemplate (BIP 22) methods,
as well as the Stratum mining protocol.
.PP
In its normal mode of operation, \fBhodlminer\fR connects to a mining server
(specified with the \fB\-o\fR option), receives work from it and starts hashing.
As soon as a solution is found, it is submitted to the same mining server,
which can accept or reject it.
When using getwork or getblocktemplate,
\fBhodlminer\fR can take advantage of long polling, if the server supports it;
in any case, fresh work is fetched as needed.
When using the Stratum protocol this is not possible,
and the server is responsible for sending fresh work at least every minute;
if it fails to do so,
\fBhodlminer\fR may drop the connection and try reconnecting again.
.PP
By default, \fBhodlminer\fR writes all its messages to standard error.
On systems that have a syslog, the \fB\-\-syslog\fR option can be used
to write to it instead.
.PP
On start, the nice value of all miner threads is set to 19.
On Linux, the scheduling policy is also changed to SCHED_IDLE,
or to SCHED_BATCH if that fails.
On multiprocessor systems, \fBhodlminer\fR
automatically sets the CPU affinity of miner threads
if the number of threads is a multiple of the number of processors.
.SH EXAMPLES
To connect to the Hodlcoin mining pool that provides a Stratum server
at hodl.blockquarry.com on port 8332, authenticating as worker "user.worker" with password "x":
.PP
.nf
.RS
hodlminer \-o stratum+tcp://hodl.blockquarry.com:8332 \-u user.worker -p x -q
.RE
.fi
.PP
To mine to a local Hodlcoin instance running on port 18332,
authenticating with username "rpcuser" and password "rpcpass":
.PP
.nf
.RS
hodlminer \-a hodl \-o http://localhost:18332 \-O rpcuser:rpcpass \\
\-\-coinbase\-addr=mpXwg4jMtRhuSpVq4xS3HFHmCmWp9NyGKt
.RE
.fi
.PP
.SH OPTIONS
.TP
\fB\-a\fR, \fB\-\-algo\fR=\fIALGORITHM\fR
Set the hashing algorithm to use.
Default is hodl.
Possible values are:
.RS 11
.TP 10
.B hodl
.TP
\fB\-\-benchmark\fR
Run in offline benchmark mode.
.TP
\fB\-B\fR, \fB\-\-background\fR
Run in the background as a daemon.
.TP
\fB\-\-cert\fR=\fIFILE\fR
Set an SSL certificate to use with the mining server.
Only supported when using the HTTPS protocol.
.TP
\fB\-\-coinbase\-addr\fR=\fIADDRESS\fR
Set a payout address for solo mining.
This is only used in getblocktemplate mode,
and only if the server does not provide a coinbase transaction.
.TP
\fB\-\-coinbase\-sig\fR=\fITEXT\fR
Set a string to be included in the coinbase (if allowed by the server).
This is only used in getblocktemplate mode.
.TP
\fB\-c\fR, \fB\-\-config\fR=\fIFILE\fR
Load options from a configuration file.
\fIFILE\fR must contain a JSON object
mapping long options to their arguments (as strings),
or to \fBtrue\fR if no argument is required.
Sample configuration file:
.nf
{
"url": "stratum+tcp://hodl.blockquarry.com:8332",
"userpass": "foo:bar",
"retry-pause": "10",
"quiet": true
}
.fi
.TP
\fB\-D\fR, \fB\-\-debug\fR
Enable debug output.
.TP
\fB\-h\fR, \fB\-\-help\fR
Print a help message and exit.
.TP
\fB\-\-no\-gbt\fR
Do not use the getblocktemplate RPC method.
.TP
\fB\-\-no\-getwork\fR
Do not use the getwork RPC method.
.TP
\fB\-\-no\-longpoll\fR
Do not use long polling.
.TP
\fB\-\-no\-redirect\fR
Ignore requests from the server to switch to a different URL.
.TP
\fB\-\-no\-stratum\fR
Do not switch to Stratum, even if the server advertises support for it.
.TP
\fB\-o\fR, \fB\-\-url\fR=[\fISCHEME\fR://][\fIUSERNAME\fR[:\fIPASSWORD\fR]@]\fIHOST\fR:\fIPORT\fR[/\fIPATH\fR]
Set the URL of the mining server to connect to.
Supported schemes are \fBhttp\fR, \fBhttps\fR, \fBstratum+tcp\fR
and \fBstratum+tcps\fR.
If no scheme is specified, http is assumed.
Specifying a \fIPATH\fR is only supported for HTTP and HTTPS.
Specifying credentials has the same effect as using the \fB\-O\fR option.
By default, on HTTP and HTTPS,
the miner tries to use the getblocktemplate RPC method,
and falls back to using getwork if getblocktemplate is unavailable.
This behavior can be modified by using the \fB\-\-no\-gbt\fR
and \fB\-\-no\-getwork\fR options.
.TP
\fB\-O\fR, \fB\-\-userpass\fR=\fIUSERNAME\fR:\fIPASSWORD\fR
Set the credentials to use for connecting to the mining server.
Any value previously set with \fB\-u\fR or \fB\-p\fR is discarded.
.TP
\fB\-p\fR, \fB\-\-pass\fR=\fIPASSWORD\fR
Set the password to use for connecting to the mining server.
Any password previously set with \fB\-O\fR is discarded.
.TP
\fB\-P\fR, \fB\-\-protocol\-dump\fR
Enable output of all protocol-level activities.
.TP
\fB\-q\fR, \fB\-\-quiet\fR
Disable per-thread hashmeter output.
.TP
\fB\-r\fR, \fB\-\-retries\fR=\fIN\fR
Set the maximum number of times to retry if a network call fails.
If not specified, the miner will retry indefinitely.
.TP
\fB\-R\fR, \fB\-\-retry\-pause\fR=\fISECONDS\fR
Set how long to wait between retries. Default is 30 seconds.
.TP
\fB\-s\fR, \fB\-\-scantime\fR=\fISECONDS\fR
Set an upper bound on the time the miner can go without fetching fresh work.
This setting has no effect in Stratum mode or when long polling is activated.
Default is 5 seconds.
.TP
\fB\-S\fR, \fB\-\-syslog\fR
Log to the syslog facility instead of standard error.
.TP
\fB\-t\fR, \fB\-\-threads\fR=\fIN\fR
Set the number of miner threads.
If not specified, the miner will try to detect the number of available processors
and use that.
.TP
\fB\-T\fR, \fB\-\-timeout\fR=\fISECONDS\fR
Set a timeout for long polling.
.TP
\fB\-u\fR, \fB\-\-user\fR=\fIUSERNAME\fR
Set the username to use for connecting to the mining server.
Any username previously set with \fB\-O\fR is discarded.
.TP
\fB\-V\fR, \fB\-\-version\fR
Display version information and quit.
.TP
\fB\-x\fR, \fB\-\-proxy\fR=[\fISCHEME\fR://][\fIUSERNAME\fR:\fIPASSWORD\fR@]\fIHOST\fR:\fIPORT\fR
Connect to the mining server through a proxy.
Supported schemes are: \fBhttp\fR, \fBsocks4\fR, \fBsocks5\fR.
Since libcurl 7.18.0, the following are also supported:
\fBsocks4a\fR, \fBsocks5h\fR (SOCKS5 with remote name resolving).
If no scheme is specified, the proxy is treated as an HTTP proxy.
.SH ENVIRONMENT
The following environment variables can be specified in lower case or upper case;
the lower-case version has precedence. \fBhttp_proxy\fR is an exception
as it is only available in lower case.
.PP
.RS
.TP
\fBhttp_proxy\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use for HTTP.
.TP
\fBHTTPS_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use for HTTPS.
.TP
\fBALL_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use if no protocol-specific proxy is set.
.RE
.PP
Using an environment variable to set the proxy has the same effect as
using the \fB\-x\fR option.
.SH AUTHOR
Most of the code in the current version of minerd was written by
Pooler <pooler@litecoinpool.org> with contributions from others.
The original minerd was written by Jeff Garzik <jeff@garzik.org>.

View File

@@ -1,50 +0,0 @@
#ifndef _SHA512_H
#define _SHA512_H
#include <stdint.h>
#include "simd-utils.h"
//SHA-512 block size
#define SHA512_BLOCK_SIZE 128
//SHA-512 digest size
#define SHA512_DIGEST_SIZE 64
/*
#ifndef __AVX2__
#ifndef __AVX__
#error "Either AVX or AVX2 supported needed"
#endif // __AVX__
#endif // __AVX2__
*/
typedef struct
{
#ifdef __AVX2__
__m256i h[8];
__m256i w[80];
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
v128_t h[8];
v128_t w[80];
#else
int dummy;
#endif
} Sha512Context;
#ifdef __AVX2__
#define SHA512_PARALLEL_N 8
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
#define SHA512_PARALLEL_N 4
#else
#define SHA512_PARALLEL_N 1 // dummy value
#endif
//SHA-512 related functions
void sha512Compute32b_parallel(
uint64_t *data[SHA512_PARALLEL_N],
uint64_t *digest[SHA512_PARALLEL_N]);
void sha512ProcessBlock( Sha512Context context[2] );
#endif

View File

@@ -1,235 +0,0 @@
#ifndef __AVX2__
#if defined(__SSE4_2__)
//#ifdef __AVX__
//Dependencies
#include <string.h>
#include <stdlib.h>
#ifdef __FreeBSD__
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"
#include "sha512-avx.h"
#if ((defined(_WIN64) || defined(__WINDOWS__)))
#include "hodl-endian.h"
#endif
//SHA-512 auxiliary functions
#define Ch(x, y, z) (((x) & (y)) | (~(x) & (z)))
#define Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define SIGMA1(x) (ROR64(x, 28) ^ ROR64(x, 34) ^ ROR64(x, 39))
#define SIGMA2(x) (ROR64(x, 14) ^ ROR64(x, 18) ^ ROR64(x, 41))
#define SIGMA3(x) (ROR64(x, 1) ^ ROR64(x, 8) ^ SHR64(x, 7))
#define SIGMA4(x) (ROR64(x, 19) ^ ROR64(x, 61) ^ SHR64(x, 6))
//Rotate right operation
#define ROR64(a, n) _mm_or_si128(_mm_srli_epi64(a, n), _mm_slli_epi64(a, 64 - n))
//Shift right operation
#define SHR64(a, n) _mm_srli_epi64(a, n)
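/* Scalar model of ROR64 (illustrative): a 64-bit rotate right is the
   right shift OR'd with the bits wrapped around from the left:
   uint64_t ror64( uint64_t x, unsigned n )   // 0 < n < 64
   { return ( x >> n ) | ( x << ( 64 - n ) ); }
*/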
__m128i mm_htobe_epi64(__m128i a) {
__m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
return _mm_shuffle_epi8(a, mask);
}
__m128i mm_betoh_epi64(__m128i a) {
return mm_htobe_epi64(a);
}
//SHA-512 padding
static const uint8_t padding[128] =
{
0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
//SHA-512 constants
static const uint64_t k[80] =
{
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
void sha512Compute32b_parallel(uint64_t *data[SHA512_PARALLEL_N], uint64_t *digest[SHA512_PARALLEL_N]) {
Sha512Context context[2];
context[0].h[0] = _mm_set1_epi64x(0x6A09E667F3BCC908);
context[0].h[1] = _mm_set1_epi64x(0xBB67AE8584CAA73B);
context[0].h[2] = _mm_set1_epi64x(0x3C6EF372FE94F82B);
context[0].h[3] = _mm_set1_epi64x(0xA54FF53A5F1D36F1);
context[0].h[4] = _mm_set1_epi64x(0x510E527FADE682D1);
context[0].h[5] = _mm_set1_epi64x(0x9B05688C2B3E6C1F);
context[0].h[6] = _mm_set1_epi64x(0x1F83D9ABFB41BD6B);
context[0].h[7] = _mm_set1_epi64x(0x5BE0CD19137E2179);
context[1].h[0] = _mm_set1_epi64x(0x6A09E667F3BCC908);
context[1].h[1] = _mm_set1_epi64x(0xBB67AE8584CAA73B);
context[1].h[2] = _mm_set1_epi64x(0x3C6EF372FE94F82B);
context[1].h[3] = _mm_set1_epi64x(0xA54FF53A5F1D36F1);
context[1].h[4] = _mm_set1_epi64x(0x510E527FADE682D1);
context[1].h[5] = _mm_set1_epi64x(0x9B05688C2B3E6C1F);
context[1].h[6] = _mm_set1_epi64x(0x1F83D9ABFB41BD6B);
context[1].h[7] = _mm_set1_epi64x(0x5BE0CD19137E2179);
for(int i=0; i<4; ++i) {
context[0].w[i] = _mm_set_epi64x ( data[1][i], data[0][i] );
context[1].w[i] = _mm_set_epi64x ( data[3][i], data[2][i] );
}
for(int i=0; i<10; ++i) {
context[0].w[i+4] = _mm_set1_epi64x( ((uint64_t*)padding)[i] );
context[1].w[i+4] = _mm_set1_epi64x( ((uint64_t*)padding)[i] );
}
//Length of the original message (before padding)
uint64_t totalSize = 32 * 8;
//Append the length of the original message
context[0].w[14] = _mm_set1_epi64x(0);
context[0].w[15] = _mm_set1_epi64x(htobe64(totalSize));
context[1].w[14] = _mm_set1_epi64x(0);
context[1].w[15] = _mm_set1_epi64x(htobe64(totalSize));
//Calculate the message digest
sha512ProcessBlock(context);
//Convert from host byte order to big-endian byte order
for (int i = 0; i < 8; i++) {
context[0].h[i] = mm_htobe_epi64(context[0].h[i]);
context[1].h[i] = mm_htobe_epi64(context[1].h[i]);
}
//Copy the resulting digest
for(int i=0; i<8; ++i) {
digest[0][i] = _mm_extract_epi64(context[0].h[i], 0);
digest[1][i] = _mm_extract_epi64(context[0].h[i], 1);
digest[2][i] = _mm_extract_epi64(context[1].h[i], 0);
digest[3][i] = _mm_extract_epi64(context[1].h[i], 1);
}
}
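/* Lane layout: each context packs two messages per 128-bit word, so the
   two contexts together hash SHA512_PARALLEL_N = 4 independent 32-byte
   inputs; the digests are unpacked above with _mm_extract_epi64. */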
#define blk0(n, i) (block[n][i] = mm_betoh_epi64(block[n][i]))
#define blk(n, i) (block[n][i] = block[n][i - 16] + SIGMA3(block[n][i - 15]) + \
SIGMA4(block[n][i - 2]) + block[n][i - 7])
#define ROUND512(a,b,c,d,e,f,g,h) \
T0 += (h[0]) + SIGMA2(e[0]) + Ch((e[0]), (f[0]), (g[0])) + k[i]; \
T1 += (h[1]) + SIGMA2(e[1]) + Ch((e[1]), (f[1]), (g[1])) + k[i]; \
(d[0]) += T0; \
(d[1]) += T1; \
(h[0]) = T0 + SIGMA1(a[0]) + Maj((a[0]), (b[0]), (c[0])); \
(h[1]) = T1 + SIGMA1(a[1]) + Maj((a[1]), (b[1]), (c[1])); \
i++
#define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \
T0 = blk0(0, i); \
T1 = blk0(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define ROUND512_16_TO_80(a,b,c,d,e,f,g,h) \
T0 = blk(0, i); \
T1 = blk(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define R512_0 \
ROUND512_0_TO_15(a, b, c, d, e, f, g, h); \
ROUND512_0_TO_15(h, a, b, c, d, e, f, g); \
ROUND512_0_TO_15(g, h, a, b, c, d, e, f); \
ROUND512_0_TO_15(f, g, h, a, b, c, d, e); \
ROUND512_0_TO_15(e, f, g, h, a, b, c, d); \
ROUND512_0_TO_15(d, e, f, g, h, a, b, c); \
ROUND512_0_TO_15(c, d, e, f, g, h, a, b); \
ROUND512_0_TO_15(b, c, d, e, f, g, h, a)
#define R512_16 \
ROUND512_16_TO_80(a, b, c, d, e, f, g, h); \
ROUND512_16_TO_80(h, a, b, c, d, e, f, g); \
ROUND512_16_TO_80(g, h, a, b, c, d, e, f); \
ROUND512_16_TO_80(f, g, h, a, b, c, d, e); \
ROUND512_16_TO_80(e, f, g, h, a, b, c, d); \
ROUND512_16_TO_80(d, e, f, g, h, a, b, c); \
ROUND512_16_TO_80(c, d, e, f, g, h, a, b); \
ROUND512_16_TO_80(b, c, d, e, f, g, h, a)
#define INIT(x,n) \
x[0] = context[0].h[n]; \
x[1] = context[1].h[n];
void sha512ProcessBlock(Sha512Context context[2])
{
__m128i* block[2];
block[0] = context[0].w;
block[1] = context[1].w;
__m128i T0, T1;
__m128i a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2];
INIT(a, 0)
INIT(b, 1)
INIT(c, 2)
INIT(d, 3)
INIT(e, 4)
INIT(f, 5)
INIT(g, 6)
INIT(h, 7)
int i = 0;
R512_0; R512_0;
for(int j=0; j<8; ++j) {
R512_16;
}
context[0].h[0] += a[0];
context[0].h[1] += b[0];
context[0].h[2] += c[0];
context[0].h[3] += d[0];
context[0].h[4] += e[0];
context[0].h[5] += f[0];
context[0].h[6] += g[0];
context[0].h[7] += h[0];
context[1].h[0] += a[1];
context[1].h[1] += b[1];
context[1].h[2] += c[1];
context[1].h[3] += d[1];
context[1].h[4] += e[1];
context[1].h[5] += f[1];
context[1].h[6] += g[1];
context[1].h[7] += h[1];
}
#endif // __SSE4_2__
#endif // __AVX2__

View File

@@ -1,241 +0,0 @@
#ifdef __AVX2__
//Dependencies
#include <string.h>
#include <stdlib.h>
#ifdef __FreeBSD__
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"
#include "immintrin.h"
#include "sha512-avx.h"
#if ((defined(_WIN64) || defined(__WINDOWS__)))
#include "hodl-endian.h"
#endif
//SHA-512 auxiliary functions
#define Ch(x, y, z) (((x) & (y)) | (~(x) & (z)))
#define Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define SIGMA1(x) (ROR64(x, 28) ^ ROR64(x, 34) ^ ROR64(x, 39))
#define SIGMA2(x) (ROR64(x, 14) ^ ROR64(x, 18) ^ ROR64(x, 41))
#define SIGMA3(x) (ROR64(x, 1) ^ ROR64(x, 8) ^ SHR64(x, 7))
#define SIGMA4(x) (ROR64(x, 19) ^ ROR64(x, 61) ^ SHR64(x, 6))
//Rotate right operation
#define ROR64(a, n) _mm256_or_si256(_mm256_srli_epi64(a, n), _mm256_slli_epi64(a, 64 - n))
//Shift right operation
#define SHR64(a, n) _mm256_srli_epi64(a, n)
__m256i mm256_htobe_epi64(__m256i a) {
__m256i mask = _mm256_set_epi8(
24,25,26,27,28,29,30,31,
16,17,18,19,20,21,22,23,
8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7);
return _mm256_shuffle_epi8(a, mask);
}
__m256i mm256_betoh_epi64(__m256i a) {
return mm256_htobe_epi64(a);
}
//SHA-512 padding
static const uint8_t padding[128] =
{
0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
//SHA-512 constants
static const uint64_t k[80] =
{
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
void sha512Compute32b_parallel(uint64_t *data[SHA512_PARALLEL_N], uint64_t *digest[SHA512_PARALLEL_N]) {
Sha512Context context[2];
context[0].h[0] = _mm256_set1_epi64x(0x6A09E667F3BCC908);
context[0].h[1] = _mm256_set1_epi64x(0xBB67AE8584CAA73B);
context[0].h[2] = _mm256_set1_epi64x(0x3C6EF372FE94F82B);
context[0].h[3] = _mm256_set1_epi64x(0xA54FF53A5F1D36F1);
context[0].h[4] = _mm256_set1_epi64x(0x510E527FADE682D1);
context[0].h[5] = _mm256_set1_epi64x(0x9B05688C2B3E6C1F);
context[0].h[6] = _mm256_set1_epi64x(0x1F83D9ABFB41BD6B);
context[0].h[7] = _mm256_set1_epi64x(0x5BE0CD19137E2179);
context[1].h[0] = _mm256_set1_epi64x(0x6A09E667F3BCC908);
context[1].h[1] = _mm256_set1_epi64x(0xBB67AE8584CAA73B);
context[1].h[2] = _mm256_set1_epi64x(0x3C6EF372FE94F82B);
context[1].h[3] = _mm256_set1_epi64x(0xA54FF53A5F1D36F1);
context[1].h[4] = _mm256_set1_epi64x(0x510E527FADE682D1);
context[1].h[5] = _mm256_set1_epi64x(0x9B05688C2B3E6C1F);
context[1].h[6] = _mm256_set1_epi64x(0x1F83D9ABFB41BD6B);
context[1].h[7] = _mm256_set1_epi64x(0x5BE0CD19137E2179);
for(int i=0; i<4; ++i) {
context[0].w[i] = _mm256_set_epi64x ( data[3][i], data[2][i], data[1][i], data[0][i] );
context[1].w[i] = _mm256_set_epi64x ( data[7][i], data[6][i], data[5][i], data[4][i] );
}
for(int i=0; i<10; ++i) {
context[0].w[i+4] = _mm256_set1_epi64x( ((uint64_t*)padding)[i] );
context[1].w[i+4] = _mm256_set1_epi64x( ((uint64_t*)padding)[i] );
}
//Length of the original message (before padding)
uint64_t totalSize = 32 * 8;
//Append the length of the original message
context[0].w[14] = _mm256_set1_epi64x(0);
context[0].w[15] = _mm256_set1_epi64x(htobe64(totalSize));
context[1].w[14] = _mm256_set1_epi64x(0);
context[1].w[15] = _mm256_set1_epi64x(htobe64(totalSize));
//Calculate the message digest
sha512ProcessBlock(context);
//Convert from host byte order to big-endian byte order
for (int i = 0; i < 8; i++) {
context[0].h[i] = mm256_htobe_epi64(context[0].h[i]);
context[1].h[i] = mm256_htobe_epi64(context[1].h[i]);
}
//Copy the resulting digest
for(int i=0; i<8; ++i) {
digest[0][i] = _mm256_extract_epi64(context[0].h[i], 0);
digest[1][i] = _mm256_extract_epi64(context[0].h[i], 1);
digest[2][i] = _mm256_extract_epi64(context[0].h[i], 2);
digest[3][i] = _mm256_extract_epi64(context[0].h[i], 3);
digest[4][i] = _mm256_extract_epi64(context[1].h[i], 0);
digest[5][i] = _mm256_extract_epi64(context[1].h[i], 1);
digest[6][i] = _mm256_extract_epi64(context[1].h[i], 2);
digest[7][i] = _mm256_extract_epi64(context[1].h[i], 3);
}
}
#define blk0(n, i) (block[n][i] = mm256_betoh_epi64(block[n][i]))
#define blk(n, i) (block[n][i] = block[n][i - 16] + SIGMA3(block[n][i - 15]) + \
SIGMA4(block[n][i - 2]) + block[n][i - 7])
#define ROUND512(a,b,c,d,e,f,g,h) \
T0 += (h[0]) + SIGMA2(e[0]) + Ch((e[0]), (f[0]), (g[0])) + k[i]; \
T1 += (h[1]) + SIGMA2(e[1]) + Ch((e[1]), (f[1]), (g[1])) + k[i]; \
(d[0]) += T0; \
(d[1]) += T1; \
(h[0]) = T0 + SIGMA1(a[0]) + Maj((a[0]), (b[0]), (c[0])); \
(h[1]) = T1 + SIGMA1(a[1]) + Maj((a[1]), (b[1]), (c[1])); \
i++
#define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \
T0 = blk0(0, i); \
T1 = blk0(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define ROUND512_16_TO_80(a,b,c,d,e,f,g,h) \
T0 = blk(0, i); \
T1 = blk(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define R512_0 \
ROUND512_0_TO_15(a, b, c, d, e, f, g, h); \
ROUND512_0_TO_15(h, a, b, c, d, e, f, g); \
ROUND512_0_TO_15(g, h, a, b, c, d, e, f); \
ROUND512_0_TO_15(f, g, h, a, b, c, d, e); \
ROUND512_0_TO_15(e, f, g, h, a, b, c, d); \
ROUND512_0_TO_15(d, e, f, g, h, a, b, c); \
ROUND512_0_TO_15(c, d, e, f, g, h, a, b); \
ROUND512_0_TO_15(b, c, d, e, f, g, h, a)
#define R512_16 \
ROUND512_16_TO_80(a, b, c, d, e, f, g, h); \
ROUND512_16_TO_80(h, a, b, c, d, e, f, g); \
ROUND512_16_TO_80(g, h, a, b, c, d, e, f); \
ROUND512_16_TO_80(f, g, h, a, b, c, d, e); \
ROUND512_16_TO_80(e, f, g, h, a, b, c, d); \
ROUND512_16_TO_80(d, e, f, g, h, a, b, c); \
ROUND512_16_TO_80(c, d, e, f, g, h, a, b); \
ROUND512_16_TO_80(b, c, d, e, f, g, h, a)
#define INIT(x,n) \
x[0] = context[0].h[n]; \
x[1] = context[1].h[n];
void sha512ProcessBlock(Sha512Context context[2])
{
__m256i* block[2];
block[0] = context[0].w;
block[1] = context[1].w;
__m256i T0, T1;
__m256i a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2];
INIT(a, 0)
INIT(b, 1)
INIT(c, 2)
INIT(d, 3)
INIT(e, 4)
INIT(f, 5)
INIT(g, 6)
INIT(h, 7)
int i = 0;
R512_0; R512_0;
for(int j=0; j<8; ++j) {
R512_16;
}
context[0].h[0] += a[0];
context[0].h[1] += b[0];
context[0].h[2] += c[0];
context[0].h[3] += d[0];
context[0].h[4] += e[0];
context[0].h[5] += f[0];
context[0].h[6] += g[0];
context[0].h[7] += h[0];
context[1].h[0] += a[1];
context[1].h[1] += b[1];
context[1].h[2] += c[1];
context[1].h[3] += d[1];
context[1].h[4] += e[1];
context[1].h[5] += f[1];
context[1].h[6] += g[1];
context[1].h[7] += h[1];
}
#endif // __AVX2__

View File

@@ -1,25 +0,0 @@
#ifndef __WOLF_AES_H
#define __WOLF_AES_H
#include <stdint.h>
#include "simd-utils.h"
void ExpandAESKey256(v128_t *keys, const v128_t *KeyBuf);
#if defined(__SSE4_2__)
//#ifdef __AVX__
#define AES_PARALLEL_N 8
#define BLOCK_COUNT 256
void AES256CBC( v128_t** data, const v128_t** next, v128_t ExpandedKey[][16],
v128_t* IV );
#else
void AES256CBC( v128_t *Ciphertext, const v128_t *Plaintext,
const v128_t *ExpandedKey, v128_t IV, uint32_t BlockCount );
#endif
#endif // __WOLF_AES_H

View File

@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_2WAY)
void keccakhash_2x64(void *state, const void *input)
{
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
keccakhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ))
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 0;
}
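/* Nonce handling: header word 19 of each interleaved lane maps to the
   high half of qword 9, so v128_set32( n+1, 0, n, 0 ) seeds the lanes
   with n and n+1, and adding 0x0000000200000000 steps both by 2. */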
#endif

View File

@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
bool register_sha3d_algo( algo_gate_t* gate )
{
hard_coded_eb = 6;
// opt_extranonce = false;
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
#if defined (SHA3D_8WAY)
gate->scanhash = (void*)&scanhash_sha3d_8way;
gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
#elif defined (SHA3D_4WAY)
gate->scanhash = (void*)&scanhash_sha3d_4way;
gate->hash = (void*)&sha3d_hash_4way;
#elif defined (SHA3D_2WAY)
gate->scanhash = (void*)&scanhash_sha3d_2x64;
gate->hash = (void*)&sha3d_hash_2x64;
#else
gate->scanhash = (void*)&scanhash_sha3d;
gate->hash = (void*)&sha3d_hash;

View File

@@ -8,6 +8,16 @@
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define KECCAK_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA3D_2WAY 1
#endif
extern int hard_coded_eb;
@@ -16,27 +26,47 @@ extern int hard_coded_eb;
void keccakhash_8way( void *state, const void *input );
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_4WAY)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_2WAY)
void keccakhash_2x64( void *state, const void *input );
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void keccakhash( void *state, const void *input );
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA3D_8WAY)
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64( void *state, const void *input );
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha3d_hash( void *state, const void *input );
int scanhash_sha3d( struct work *work, uint32_t max_nonce,

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)
#if defined(SHA3D_8WAY)
void sha3d_hash_8way(void *state, const void *input)
{
@@ -64,7 +64,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_4WAY)
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way(void *state, const void *input)
{
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, buffer );
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, buffer, 32 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
sha3d_hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -75,16 +75,16 @@
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
v128_t t = a0; \
a0 = mm128_xoror( a3, a0, a1 ); \
a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm128_xorand( a2, a3, t ); \
a2 = mm128_xorand( a1, a2, a0 ); \
a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = mm128_xnor( a1, a0 ); \
a1 = v128_xnor( a1, a0 ); \
a0 = t; \
}
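/* The composite helpers above, assuming the simd-utils conventions:
   v128_xoror( a, b, c )  == a ^ ( b | c )
   v128_xorand( a, b, c ) == a ^ ( b & c )
   v128_xnor( a, b )      == ~( a ^ b )
   The 0x87 ternary-logic immediate likewise computes
   a1 = ~( a1 ^ ( a3 & t ) ), as noted inline. */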

View File

@@ -1,6 +1,8 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#if !defined(__APPLE__)
#include <gmp.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -296,8 +298,14 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
return 0;
}
#endif // not apple
bool register_m7m_algo( algo_gate_t *gate )
{
#if defined(__APPLE__)
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
@@ -307,6 +315,6 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;
#endif
}

View File

@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
init_hmq1725_ctx();
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
void hmq1725hash( void *state, const void *input );
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_hmq1725_ctx();
#endif

View File

@@ -4,346 +4,267 @@
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
#endif
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
typedef struct {
sph_blake512_context blake1, blake2;
sph_bmw512_context bmw1, bmw2, bmw3;
sph_skein512_context skein1, skein2;
sph_jh512_context jh1, jh2;
sph_keccak512_context keccak1, keccak2;
hashState_luffa luffa1, luffa2;
cubehashParam cube;
sph_shavite512_context shavite1, shavite2;
#if defined(__aarch64__)
sph_simd512_context simd1, simd2;
#else
hashState_sd simd1, simd2;
#endif
sph_hamsi512_context hamsi1;
sph_shabal512_context shabal1;
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
sph_sha512_context sha1, sha2;
sph_haval256_5_context haval1, haval2;
#if defined(__AES__)
hashState_echo echo1, echo2;
hashState_groestl groestl1, groestl2;
hashState_fugue fugue1, fugue2;
#else
sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2;
sph_fugue512_context fugue1, fugue2;
#endif
} hmq1725_ctx_holder;
static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
void init_hmq1725_ctx()
union _hmq1725_ctx_holder
{
sph_blake512_init(&hmq1725_ctx.blake1);
sph_blake512_init(&hmq1725_ctx.blake2);
sph_bmw512_init(&hmq1725_ctx.bmw1);
sph_bmw512_init(&hmq1725_ctx.bmw2);
sph_bmw512_init(&hmq1725_ctx.bmw3);
sph_skein512_init(&hmq1725_ctx.skein1);
sph_skein512_init(&hmq1725_ctx.skein2);
sph_jh512_init(&hmq1725_ctx.jh1);
sph_jh512_init(&hmq1725_ctx.jh2);
sph_keccak512_init(&hmq1725_ctx.keccak1);
sph_keccak512_init(&hmq1725_ctx.keccak2);
init_luffa( &hmq1725_ctx.luffa1, 512 );
init_luffa( &hmq1725_ctx.luffa2, 512 );
cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
sph_shavite512_init(&hmq1725_ctx.shavite1);
sph_shavite512_init(&hmq1725_ctx.shavite2);
#if defined(__aarch64__)
sph_simd512_init(&hmq1725_ctx.simd1);
sph_simd512_init(&hmq1725_ctx.simd2);
#else
init_sd( &hmq1725_ctx.simd1, 512 );
init_sd( &hmq1725_ctx.simd2, 512 );
#endif
sph_hamsi512_init(&hmq1725_ctx.hamsi1);
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
fugue512_Init( &hmq1725_ctx.fugue1, 512 );
fugue512_Init( &hmq1725_ctx.fugue2, 512 );
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_fugue512_init(&hmq1725_ctx.fugue1);
sph_fugue512_init(&hmq1725_ctx.fugue2);
sph_groestl512_context groestl;
sph_fugue512_context fugue;
#endif
sph_shabal512_init(&hmq1725_ctx.shabal1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
sph_sha512_init( &hmq1725_ctx.sha1 );
sph_sha512_init( &hmq1725_ctx.sha2 );
sph_haval256_5_init(&hmq1725_ctx.haval1);
sph_haval256_5_init(&hmq1725_ctx.haval2);
#if defined(__AES__)
init_echo( &hmq1725_ctx.echo1, 512 );
init_echo( &hmq1725_ctx.echo2, 512 );
init_groestl( &hmq1725_ctx.groestl1, 64 );
init_groestl( &hmq1725_ctx.groestl2, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_groestl512_init( &hmq1725_ctx.groestl1 );
sph_groestl512_init( &hmq1725_ctx.groestl2 );
sph_echo512_init( &hmq1725_ctx.echo1 );
sph_echo512_init( &hmq1725_ctx.echo2 );
sph_echo512_context echo;
#endif
}
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha;
sph_haval256_5_context haval;
};
typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
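/* Rationale (inferred from the rewrite, not stated in this diff):
 * hmq1725hash only ever uses one context at a time, so a union lets them
 * all share storage, and a small per-call stack copy replaces the old
 * static struct of ~30 contexts that had to be pre-initialized and
 * memcpy'd for every hash. */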
void hmq_bmw512_midstate( const void* input )
{
memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
sph_bmw512( &hmq_bmw_mid, input, 64 );
}
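/* Midstate note: the first 64 of the 80 header bytes are constant for a
 * given job, so the old code ran BMW512 over them once here and each
 * nonce finished only the 16-byte tail:
 *
 *   memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
 *   sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
 *
 * The rewrite drops this midstate in favour of fresh per-call contexts. */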
__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
//static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
//static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
uint32_t hashA[32] __attribute__((aligned(64)));
uint32_t hashB[32] __attribute__((aligned(64)));
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
uint32_t hashA[32] __attribute__((aligned(32)));
uint32_t hashB[32] __attribute__((aligned(32)));
hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, input, 80 );
sph_bmw512_close( &ctx.bmw, hashA ); //1
memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
sph_bmw512_close(&h_ctx.bmw1, hashA); //1
sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0
sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //0
sph_whirlpool_close( &ctx.whirlpool, hashB ); //1
if ( hashB[0] & mask ) //1
{
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
(const char*)hashB, 512 );
groestl512_full( &ctx.groestl, hashA, hashB, 512 );
#else
sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
sph_groestl512_close(&h_ctx.groestl1, hashA); //2
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashB, 64 ); //1
sph_groestl512_close( &ctx.groestl, hashA ); //2
#endif
}
else
{
sph_skein512 (&h_ctx.skein1, hashB, 64); //1
sph_skein512_close(&h_ctx.skein1, hashA); //2
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, hashB, 64 ); //1
sph_skein512_close( &ctx.skein, hashA ); //2
}
sph_jh512 (&h_ctx.jh1, hashA, 64); //3
sph_jh512_close(&h_ctx.jh1, hashB); //4
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashA, 64 ); //3
sph_jh512_close( &ctx.jh, hashB ); //4
sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2
sph_keccak512_close(&h_ctx.keccak1, hashA); //3
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //2
sph_keccak512_close( &ctx.keccak, hashA ); //3
if ( hashA[0] & mask ) //4
{
sph_blake512 (&h_ctx.blake1, hashA, 64); //
sph_blake512_close(&h_ctx.blake1, hashB); //5
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
}
else
{
sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4
sph_bmw512_close(&h_ctx.bmw2, hashB); //5
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashA, 64 ); //4
sph_bmw512_close( &ctx.bmw, hashB ); //5
}
update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );
if ( hashB[0] & mask ) //7
{
sph_keccak512 (&h_ctx.keccak2, hashB, 64); //
sph_keccak512_close(&h_ctx.keccak2, hashA); //8
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //
sph_keccak512_close( &ctx.keccak, hashA ); //8
}
else
{
sph_jh512 (&h_ctx.jh2, hashB, 64); //7
sph_jh512_close(&h_ctx.jh2, hashA); //8
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashB, 64 ); //7
sph_jh512_close( &ctx.jh, hashA ); //8
}
sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3
sph_shavite512_close(&h_ctx.shavite1, hashB); //4
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashA, 64 ); //3
sph_shavite512_close( &ctx.shavite, hashB ); //4
#if defined(__aarch64__)
sph_simd512 (&h_ctx.simd1, hashB, 64); //3
sph_simd512_close(&h_ctx.simd1, hashA); //4
#else
update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#endif
simd512_ctx( &ctx.simd, hashA, hashB, 64 );
if ( hashA[0] & mask ) //4
{
sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); //
sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
else
{
sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4
sph_haval256_5_close(&h_ctx.haval1, hashB); //5
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //4
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset( &hashB[8], 0, 32 );
}
#if defined(__AES__)
update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashA, 512, hashB, 64 );
#else
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
sph_echo512_close(&h_ctx.echo1, hashA); //6
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashB, 64 ); //5
sph_echo512_close( &ctx.echo, hashA ); //6
#endif
sph_blake512 (&h_ctx.blake2, hashA, 64); //6
sph_blake512_close(&h_ctx.blake2, hashB); //7
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
if ( hashB[0] & mask ) //7
{
sph_shavite512 (&h_ctx.shavite2, hashB, 64); //
sph_shavite512_close(&h_ctx.shavite2, hashA); //8
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashB, 64 ); //
sph_shavite512_close( &ctx.shavite, hashA ); //8
}
else
{
update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
}
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3
sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
sph_hamsi512_close( &ctx.hamsi, hashB ); //4
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue1, hashB, 512 ); //2 ////
fugue512_Final( &h_ctx.fugue1, hashA ); //3
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 ////
sph_fugue512_close(&h_ctx.fugue1, hashA); //3
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //2 ////
sph_fugue512_close( &ctx.fugue, hashA ); //3
#endif
if ( hashA[0] & mask ) //4
{
#if defined(__AES__)
update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashB, 512, hashA, 64 );
#else
sph_echo512 (&h_ctx.echo2, hashA, 64); //
sph_echo512_close(&h_ctx.echo2, hashB); //5
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashA, 64 ); //
sph_echo512_close( &ctx.echo, hashB ); //5
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&h_ctx.simd2, hashA, 64); //6
sph_simd512_close(&h_ctx.simd2, hashB); //7
#else
update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, 64 );
sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5
sph_shabal512_close(&h_ctx.shabal1, hashA); //6
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hashB, 64 ); //5
sph_shabal512_close( &ctx.shabal, hashA ); //6
sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6
sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //6
sph_whirlpool_close( &ctx.whirlpool, hashB ); //7
if ( hashB[0] & mask ) //7
{
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue2, hashB, 512 ); //
fugue512_Final( &h_ctx.fugue2, hashA ); //8
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue2, hashB, 64); //
sph_fugue512_close(&h_ctx.fugue2, hashA); //8
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //
sph_fugue512_close( &ctx.fugue, hashA ); //8
#endif
}
else
{
sph_sha512( &h_ctx.sha1, hashB, 64 );
sph_sha512_close( &h_ctx.sha1, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
}
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
(const char*)hashA, 512 );
groestl512_full( &ctx.groestl, hashB, hashA, 512 );
#else
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashA, 64 ); //3
sph_groestl512_close( &ctx.groestl, hashB ); //4
#endif
sph_sha512( &h_ctx.sha2, hashB, 64 );
sph_sha512_close( &h_ctx.sha2, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
if ( hashA[0] & mask ) //4
{
sph_haval256_5 (&h_ctx.haval2, hashA, 64); //
sph_haval256_5_close(&h_ctx.haval2, hashB); //5
memset(&hashB[8], 0, 32);
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset( &hashB[8], 0, 32 );
}
else
{
sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4
sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //4
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5
sph_bmw512_close(&h_ctx.bmw3, hashA); //6
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashB, 64 ); //5
sph_bmw512_close( &ctx.bmw, hashA ); //6
memcpy(state, hashA, 32);
memcpy( state, hashA, 32 );
}
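/* Note: mask == 24 (0x18), so every "& mask" test reads two bits of the
 * previous digest to choose the next function, which is what makes the
 * hmq1725 chain data-dependent. */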
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
@@ -356,7 +277,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
hmq_bmw512_midstate( endiandata );
// hmq_bmw512_midstate( endiandata );
// if (opt_debug)
// {

View File

@@ -35,13 +35,13 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, mm128_not( y ) ), z )
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, mm128_not( z ) ) )
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \

390
algo/sha/sha1-hash.c Normal file
View File

@@ -0,0 +1,390 @@
#include "simd-utils.h"
#include <stdint.h>
#include "sha1-hash.h"
#if defined(__x86_64__) && defined(__SHA__)
#define sha1_opt_rounds( state_out, data, state_in ) \
{ \
__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; \
__m128i MSG0, MSG1, MSG2, MSG3; \
\
ABCD = _mm_load_si128( (const __m128i*) state_in ); \
E0 = _mm_set_epi32( state_in[4], 0, 0, 0 ); \
ABCD = _mm_shuffle_epi32( ABCD, 0x1B ); \
\
ABCD_SAVE = ABCD; \
E0_SAVE = E0; \
\
/* Rounds 0-3 */ \
MSG0 = load_msg( data, 0 ); \
E0 = _mm_add_epi32( E0, MSG0 ); \
E1 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 0 ); \
\
/* Rounds 4-7 */ \
MSG1 = load_msg( data, 1 ); \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 0 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
\
/* Rounds 8-11 */ \
MSG2 = load_msg( data, 2 ); \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 0 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 12-15 */ \
MSG3 = load_msg( data, 3 ); \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 0 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 16-19 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 0 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 20-23 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 1 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 24-27 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 1 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 28-31 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 1 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 32-35 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 1 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 36-39 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 1 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 40-43 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 2 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 44-47 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 2 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 48-51 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 2 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 52-55 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 2 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 56-59 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 2 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 60-63 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 3 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 64-67 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 3 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 68-71 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 3 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 72-75 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 3 ); \
\
/* Rounds 76-79 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 3 ); \
\
/* Combine state */ \
E0 = _mm_sha1nexte_epu32( E0, E0_SAVE ); \
ABCD = _mm_add_epi32( ABCD, ABCD_SAVE ); \
\
/* Save state */ \
ABCD = _mm_shuffle_epi32( ABCD, 0x1B ); \
_mm_store_si128( (__m128i*) state_out, ABCD ); \
state_out[4] = _mm_extract_epi32( E0, 3 ); \
}
void sha1_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) casti_v128( m, i )
sha1_opt_rounds( state_out, input, state_in );
#undef load_msg
}
void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
const __m128i MASK = _mm_set_epi64x( 0x0001020304050607ULL,
0x08090a0b0c0d0e0fULL );
#define load_msg( m, i ) _mm_shuffle_epi8( casti_v128( m, i ), MASK )
sha1_opt_rounds( state_out, input, state_in );
#undef load_msg
}
#endif
#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA2)
#define sha1_neon_rounds( state_out, data, state_in ) \
{ \
uint32x4_t ABCD, ABCD_SAVED; \
uint32x4_t TMP0, TMP1; \
uint32x4_t MSG0, MSG1, MSG2, MSG3; \
uint32_t E0, E0_SAVED, E1; \
\
/* Load state */ \
ABCD = vld1q_u32( &state_in[0] ); \
E0 = state_in[4]; \
\
/* Save state */ \
ABCD_SAVED = ABCD; \
E0_SAVED = E0; \
\
MSG0 = load_msg( data, 0 ); \
MSG1 = load_msg( data, 1 ); \
MSG2 = load_msg( data, 2 ); \
MSG3 = load_msg( data, 3 ); \
\
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x5A827999 ) ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x5A827999 ) ); \
\
/* Rounds 0-3 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x5A827999 ) ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 4-7 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0x5A827999 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 8-11 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x5A827999 ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 12-15 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 16-19 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 20-23 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 24-27 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 28-31 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 32-35 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 36-39 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 40-43 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 44-47 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 48-51 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 52-55 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 56-59 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 60-63 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 64-67 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 68-71 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
\
/* Rounds 72-75 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
\
/* Rounds 76-79 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
\
/* Combine state */ \
E0 += E0_SAVED; \
ABCD = vaddq_u32( ABCD_SAVED, ABCD ); \
\
/* Save state */ \
vst1q_u32( &state_out[0], ABCD ); \
state_out[4] = E0; \
}
void sha1_neon_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) )
sha1_neon_rounds( state_out, input, state_in );
#undef load_msg
}
void sha1_neon_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) casti_v128( m, i )
sha1_neon_rounds( state_out, input, state_in );
#undef load_msg
}
#endif

40
algo/sha/sha1-hash.h Normal file
View File

@@ -0,0 +1,40 @@
#ifndef SHA1_HASH_H__
#define SHA1_HASH_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#include "cpuminer-config.h"
#include "sph_sha1.h"
// SHA hooks for SHA-1, automatically substituted in SPH
#if defined(__x86_64__) && defined(__SHA__)
void sha1_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
#define sha1_transform_le sha1_x86_sha_transform_le
#define sha1_transform_be sha1_x86_sha_transform_be
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
void sha1_neon_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha1_neon_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
#define sha1_transform_le sha1_neon_sha_transform_le
#define sha1_transform_be sha1_neon_sha_transform_be
#else
#define sha1_transform_le sph_sha1_transform_le
#define sha1_transform_be sph_sha1_transform_be
#endif
#define sha1_full sph_sha1_full
#endif
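A minimal usage sketch for the dispatch macro above (illustrative only, not
part of this diff; sha1_one_block is a hypothetical helper and the IV values
are the standard SHA-1 initial state):

#include <stdint.h>
#include "sha1-hash.h"

// Compress one 64-byte big-endian block starting from the SHA-1 IV.
// sha1_transform_be resolves to the SHA-NI, NEON or sph implementation
// depending on the build target.
static void sha1_one_block( uint32_t state[5], const void *block )
{
   static const uint32_t iv[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE,
                                   0x10325476, 0xC3D2E1F0 };
   sha1_transform_be( state, block, iv );
}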

400
algo/sha/sha1.c Normal file
View File

@@ -0,0 +1,400 @@
/* $Id: sha1.c 216 2010-06-08 09:46:57Z tp $ */
/*
* SHA-1 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "simd-utils.h"
#include "sha1-hash.h"
#define F(B, C, D) ((((C) ^ (D)) & (B)) ^ (D))
#define G(B, C, D) ((B) ^ (C) ^ (D))
#define H(B, C, D) (((D) & (C)) | (((D) | (C)) & (B)))
#define I(B, C, D) G(B, C, D)
#define ROTL rol32
//#define ROTL SPH_ROTL32
#define K1 SPH_C32(0x5A827999)
#define K2 SPH_C32(0x6ED9EBA1)
#define K3 SPH_C32(0x8F1BBCDC)
#define K4 SPH_C32(0xCA62C1D6)
static const sph_u32 IV[5] = {
SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
SPH_C32(0x98BADCFE), SPH_C32(0x10325476),
SPH_C32(0xC3D2E1F0)
};
/*
* This macro defines the body for a SHA-1 compression function
* implementation. The "in" parameter should evaluate, when applied to a
* numerical input parameter from 0 to 15, to an expression which yields
* the corresponding input block. The "r" parameter should evaluate to
* an array or pointer expression designating the array of 5 words which
* contains the input and output of the compression function.
*/
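/* For example, as instantiated by sha1_round() below, "in" can be a
 * macro that decodes big-endian words from a byte buffer:
 *
 *   #define SHA1_IN(x) sph_dec32be_aligned(data + (4 * (x)))
 *   SHA1_ROUND_BODY(SHA1_IN, r);
 *   #undef SHA1_IN
 */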
#define SHA1_ROUND_BODY(in, r) do { \
sph_u32 A, B, C, D, E; \
sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
\
A = (r)[0]; \
B = (r)[1]; \
C = (r)[2]; \
D = (r)[3]; \
E = (r)[4]; \
\
W00 = in(0); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W00 + K1); \
B = ROTL(B, 30); \
W01 = in(1); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W01 + K1); \
A = ROTL(A, 30); \
W02 = in(2); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W02 + K1); \
E = ROTL(E, 30); \
W03 = in(3); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W03 + K1); \
D = ROTL(D, 30); \
W04 = in(4); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W04 + K1); \
C = ROTL(C, 30); \
W05 = in(5); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W05 + K1); \
B = ROTL(B, 30); \
W06 = in(6); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W06 + K1); \
A = ROTL(A, 30); \
W07 = in(7); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W07 + K1); \
E = ROTL(E, 30); \
W08 = in(8); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W08 + K1); \
D = ROTL(D, 30); \
W09 = in(9); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W09 + K1); \
C = ROTL(C, 30); \
W10 = in(10); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W10 + K1); \
B = ROTL(B, 30); \
W11 = in(11); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W11 + K1); \
A = ROTL(A, 30); \
W12 = in(12); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W12 + K1); \
E = ROTL(E, 30); \
W13 = in(13); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W13 + K1); \
D = ROTL(D, 30); \
W14 = in(14); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W14 + K1); \
C = ROTL(C, 30); \
W15 = in(15); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W15 + K1); \
B = ROTL(B, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W00 + K1); \
A = ROTL(A, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W01 + K1); \
E = ROTL(E, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W02 + K1); \
D = ROTL(D, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W03 + K1); \
C = ROTL(C, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W04 + K2); \
B = ROTL(B, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W05 + K2); \
A = ROTL(A, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W06 + K2); \
E = ROTL(E, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W07 + K2); \
D = ROTL(D, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W08 + K2); \
C = ROTL(C, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W09 + K2); \
B = ROTL(B, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W10 + K2); \
A = ROTL(A, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W11 + K2); \
E = ROTL(E, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W12 + K2); \
D = ROTL(D, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W13 + K2); \
C = ROTL(C, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W14 + K2); \
B = ROTL(B, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W15 + K2); \
A = ROTL(A, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W00 + K2); \
E = ROTL(E, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W01 + K2); \
D = ROTL(D, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W02 + K2); \
C = ROTL(C, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W03 + K2); \
B = ROTL(B, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W04 + K2); \
A = ROTL(A, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W05 + K2); \
E = ROTL(E, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W06 + K2); \
D = ROTL(D, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W07 + K2); \
C = ROTL(C, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W08 + K3); \
B = ROTL(B, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W09 + K3); \
A = ROTL(A, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W10 + K3); \
E = ROTL(E, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W11 + K3); \
D = ROTL(D, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W12 + K3); \
C = ROTL(C, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W13 + K3); \
B = ROTL(B, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W14 + K3); \
A = ROTL(A, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W15 + K3); \
E = ROTL(E, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W00 + K3); \
D = ROTL(D, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W01 + K3); \
C = ROTL(C, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W02 + K3); \
B = ROTL(B, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W03 + K3); \
A = ROTL(A, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W04 + K3); \
E = ROTL(E, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W05 + K3); \
D = ROTL(D, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W06 + K3); \
C = ROTL(C, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W07 + K3); \
B = ROTL(B, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W08 + K3); \
A = ROTL(A, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W09 + K3); \
E = ROTL(E, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W10 + K3); \
D = ROTL(D, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W11 + K3); \
C = ROTL(C, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W12 + K4); \
B = ROTL(B, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W13 + K4); \
A = ROTL(A, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W14 + K4); \
E = ROTL(E, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W15 + K4); \
D = ROTL(D, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W00 + K4); \
C = ROTL(C, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W01 + K4); \
B = ROTL(B, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W02 + K4); \
A = ROTL(A, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W03 + K4); \
E = ROTL(E, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W04 + K4); \
D = ROTL(D, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W05 + K4); \
C = ROTL(C, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W06 + K4); \
B = ROTL(B, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W07 + K4); \
A = ROTL(A, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W08 + K4); \
E = ROTL(E, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W09 + K4); \
D = ROTL(D, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W10 + K4); \
C = ROTL(C, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W11 + K4); \
B = ROTL(B, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W12 + K4); \
A = ROTL(A, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W13 + K4); \
E = ROTL(E, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W14 + K4); \
D = ROTL(D, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W15 + K4); \
C = ROTL(C, 30); \
\
(r)[0] = SPH_T32(r[0] + A); \
(r)[1] = SPH_T32(r[1] + B); \
(r)[2] = SPH_T32(r[2] + C); \
(r)[3] = SPH_T32(r[3] + D); \
(r)[4] = SPH_T32(r[4] + E); \
} while (0)
/*
* One round of SHA-1. The data must be aligned for 32-bit access.
*/
#if ( defined(__x86_64__) && defined(__SHA__) ) || ( defined(__aarch64__) && defined(__ARM_FEATURE_SHA2) )
static void
sha1_round( const unsigned char *data, sph_u32 r[5] )
{
sha1_transform_be( (uint32_t*)r, (uint32_t*)data, (const uint32_t*)r );
}
#else
static void
sha1_round( const unsigned char *data, sph_u32 r[5] )
{
#define SHA1_IN(x) sph_dec32be_aligned(data + (4 * (x)))
SHA1_ROUND_BODY(SHA1_IN, r);
#undef SHA1_IN
}
#endif
/* see sph_sha1.h */
void
sph_sha1_init(void *cc)
{
sph_sha1_context *sc;
sc = cc;
memcpy(sc->val, IV, sizeof IV);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
#define RFUN sha1_round
#define HASH sha1
#define BE32 1
#include "md_helper.c"
/* see sph_sha1.h */
void
sph_sha1_close(void *cc, void *dst)
{
sha1_close(cc, dst, 5);
sph_sha1_init(cc);
}
/* see sph_sha1.h */
void
sph_sha1_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
sha1_addbits_and_close(cc, ub, n, dst, 5);
sph_sha1_init(cc);
}
/* see sph_sha1.h */
void
sph_sha1_comp(const sph_u32 msg[16], sph_u32 val[5])
{
#define SHA1_IN(x) msg[x]
SHA1_ROUND_BODY(SHA1_IN, val);
#undef SHA1_IN
}
void sph_sha1_full( void *hash, const void *msg, size_t len )
{
sph_sha1_context cc;
sph_sha1_init( &cc );
sph_sha1( &cc, msg, len );
sph_sha1_close( &cc, hash );
}
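// Illustrative one-shot usage of the helper above (not part of this diff):
//
//   unsigned char digest[20];
//   sph_sha1_full( digest, "abc", 3 );   // digest receives the 20-byte hash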

View File

@@ -319,7 +319,7 @@ int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
v128_t W[16]; memcpy_128( W, data, 16 );
v128_t W[16]; v128_memcpy( W, data, 16 );
A = v128_load( state_in );
B = v128_load( state_in+1 );

View File

@@ -5,11 +5,11 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA512256D_8WAY 1
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1
#define SHA512256D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA512256D_2WAY 1
#define SHA512256D_2WAY 1
#endif
#if defined(SHA512256D_8WAY)
@@ -110,14 +110,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = v256_64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
casti_m256i( vdata,9 ) = mm256_intrlv_blend_32( _mm256_set_epi32(
n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
do
{
sha512256d_4way_init( &ctx );
@@ -138,7 +137,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, four );
casti_m256i( vdata,9 ) = _mm256_add_epi32( casti_m256i( vdata,9 ), four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
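/* Note: casti_m256i replaces the removed noncev pointer. Presumably
 * (it is defined in simd-utils, not shown in this diff) it simply indexes
 * a buffer as an array of __m256i:
 *   #define casti_m256i( p, i ) ( ((__m256i*)(p))[(i)] )
 * Element 9 of the 4x64-interleaved data holds the four lane nonces. */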
@@ -180,11 +179,10 @@ int scanhash_sha512256d_2x64( struct work *work, uint32_t max_nonce,
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u64_t two = v128_64( 0x0000000200000000 );
const v128_t two = v128_64( 0x0000000200000000 );
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_add32( v128_set32( 1, 0, 0, 0 ), *noncev );
// *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
@@ -279,7 +277,7 @@ int scanhash_sha512256d( struct work *work, uint32_t max_nonce,
bool register_sha512256d_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA512256D_8WAY)
gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)

133
algo/sha/sph_sha1.h Normal file
View File

@@ -0,0 +1,133 @@
/* $Id: sph_sha1.h 216 2010-06-08 09:46:57Z tp $ */
/**
* SHA-1 interface.
*
* SHA-1 is described in FIPS 180-1 (now superseded by FIPS 180-2, but the
* description of SHA-1 is still included and has not changed). FIPS
* standards can be found at: http://csrc.nist.gov/publications/fips/
*
* @warning A theoretical collision attack against SHA-1, with work
* factor 2^63, has been published. SHA-1 should not be used in new
* protocol designs.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_sha1.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_SHA1_H__
#define SPH_SHA1_H__
#include <stddef.h>
#include "compat/sph_types.h"
/**
* Output size (in bits) for SHA-1.
*/
#define SPH_SIZE_sha1 160
/**
* This structure is a context for SHA-1 computations: it contains the
* intermediate values and some data from the last entered block. Once
* a SHA-1 computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running SHA-1 computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
sph_u32 val[5];
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_sha1_context;
/**
* Initialize a SHA-1 context. This process performs no memory allocation.
*
* @param cc the SHA-1 context (pointer to a <code>sph_sha1_context</code>)
*/
void sph_sha1_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHA-1 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_sha1(void *cc, const void *data, size_t len);
/**
* Terminate the current SHA-1 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accommodate the result (20 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHA-1 context
* @param dst the destination buffer
*/
void sph_sha1_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (20 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHA-1 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_sha1_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
/**
* Apply the SHA-1 compression function on the provided data. The
* <code>msg</code> parameter contains the 16 32-bit input blocks,
* as numerical values (hence after the big-endian decoding). The
* <code>val</code> parameter contains the 5 32-bit input blocks for
* the compression function; the output is written in place in this
* array.
*
* @param msg the message block (16 values)
* @param val the function 160-bit input and output
*/
void sph_sha1_comp(const sph_u32 msg[16], sph_u32 val[5]);
void sph_sha1_full( void *hash, const void *msg, size_t len );
#endif
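A sketch of the context-clone idiom the comments above permit (illustrative,
not part of this diff; header is a hypothetical const unsigned char[80] work
buffer): hash the constant 64-byte prefix once, then value-copy the context
per nonce and finish only the tail.

sph_sha1_context base, work;
unsigned char out[20];
sph_sha1_init( &base );
sph_sha1( &base, header, 64 );       // constant prefix, hashed once
work = base;                         // clone the running state by value
sph_sha1( &work, header + 64, 16 );  // per-nonce tail
sph_sha1_close( &work, out );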

View File

@@ -71,7 +71,7 @@ static const uint32_t IV512[] =
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
{
const __m128i zero = _mm_setzero_si128();
const v128_t zero = v128_zero;
__m256i p0, p1, p2, p3, x;
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
__m256i *m = (__m256i*)msg;
@@ -278,7 +278,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
void shavite512_2way_init( shavite512_2way_context *ctx )
{
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
v128_t *iv = (v128_t*)IV512;
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
@@ -358,7 +358,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
count.u32[3] = ctx->count3;
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -434,7 +434,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
}
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -451,7 +451,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len )
{
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
v128_t *iv = (v128_t*)IV512;
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
@@ -524,7 +524,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
}
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

View File

@@ -303,7 +303,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
count.u32[3] = ctx->count3;
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -379,7 +379,7 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
}
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -470,7 +470,7 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
}
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

View File

@@ -60,7 +60,6 @@ static const sph_u32 IV512[] = {
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
const v128_t zero = v128_zero;
v128_t p0, p1, p2, p3, x;
v128_t k00, k01, k02, k03, k10, k11, k12, k13;
v128_t *m = (v128_t*)msg;
@@ -76,39 +75,39 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = m[0];
x = v128_xor( p1, k00 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k01 = m[1];
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k02 = m[2];
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k03 = m[3];
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x );
k10 = m[4];
x = v128_xor( p3, k10 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k11 = m[5];
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k12 = m[6];
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k13 = m[7];
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
if ( r == 0 )
@@ -116,8 +115,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = v128_xor( p0, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
if ( r == 1 )
@@ -125,32 +124,32 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
x = v128_aesenc_nokey( x );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p2, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
x = v128_aesenc_nokey( x );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
x = v128_aesenc_nokey( x );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
if ( r == 2 )
@@ -158,78 +157,78 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
// round 2, 6, 10
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p3, k00 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p1, k10 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x );
// round 3, 7, 11
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p2, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
x = v128_aesenc_nokey( x );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p0, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
x = v128_aesenc_nokey( x );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
x = v128_aesenc_nokey( x );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
@@ -237,73 +236,73 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p1, k00 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p3, k10 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x );
}
// round 13
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p0, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
x = v128_aesenc_nokey( x );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p2, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
x = v128_aesenc_nokey( x );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, v128_xor( k11, v128_set32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
x = v128_aesenc_nokey( x );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
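The hunks above replace every v128_aesenc( x, zero ) pair with v128_aesenc_nokey( x ), dropping the explicit zero-key operand. A minimal sketch of how such a wrapper could be defined — the project's actual macro may differ — exploits the fact that both ISAs make a zero round key free:

// Sketch only; assumes the v128 type maps to __m128i / uint32x4_t.
#if defined(__x86_64__)
  #include <wmmintrin.h>            // AES-NI
  // aesenc XORs the round key last, so a zero key is a no-op.
  #define v128_aesenc_nokey(x)  _mm_aesenc_si128( x, _mm_setzero_si128() )
#elif defined(__aarch64__)
  #include <arm_neon.h>             // compile with +crypto
  // AESE XORs the key first (also free for zero); AESMC does MixColumns.
  #define v128_aesenc_nokey(x)                        \
     vreinterpretq_u32_u8( vaesmcq_u8( vaeseq_u8(     \
        vreinterpretq_u8_u32( x ), vdupq_n_u8( 0 ) ) ) )
#endif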

View File

@@ -12,23 +12,8 @@ uint32_t SIMD_IV_512[] __attribute__((aligned(64))) =
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8,
0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4,
0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 };
#if defined(__x86_64__)
#define SHUFXOR_1 0xb1 // rev64_32
#define SHUFXOR_2 0x4e // rev64
#define SHUFXOR_3 0x1b // rev32
#elif defined(__aarch64__)
#define SHUFXOR_1(x) vrev64q_u32(x)
#define SHUFXOR_2(x) v128_rev64(x)
#define SHUFXOR_3(x) v128_rev64( v128_qrev32(x) )
#else
#endif
0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
};
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
@@ -89,8 +74,8 @@ uint32_t SIMD_IV_512[] __attribute__((aligned(64))) =
#define SUM7_65 4
#define SUM7_66 5
#define PERM( z, d, a, shufxor ) \
XCAT( PERM_, XCAT( SUM7_ ## z, PERM_START ) )( d, a, shufxor )
#define PERM( p, z, d, a, shufxor ) \
XCAT( PERM_, XCAT( SUM7_ ## z, p ) )( d, a, shufxor )
#define PERM_0( d, a, shufxor ) /* XOR 1 */ \
do { \
@@ -188,16 +173,22 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
#if defined(__x86_64__)
#define shufxor(x,s) _mm_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
#define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1)
#define SHUFXOR_2(x) _mm_shuffle_epi32(x,0x4e)
#define SHUFXOR_3(x) _mm_shuffle_epi32(x,0x1b)
#elif defined(__aarch64__)
#define shufxor(x,s) XCAT( SHUFXOR_, s )(x)
#define SHUFXOR_1(x) vrev64q_u32(x)
#define SHUFXOR_2(x) v128_rev64(x)
#define SHUFXOR_3(x) v128_rev64(v128_qrev32(x))
#else
//#warning __FILE__ "Unknown or unsupported CPU architecture"
//unknown or unsupported architecture
#endif
#define shufxor(x,s) XCAT(SHUFXOR_,s)(x)
#define REDUCE(x) \
v128_sub16( v128_and( x, v128_64( \
0x00ff00ff00ff00ff ) ), v128_sra16( x, 8 ) )
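The SHUFXOR_n names above stop being raw _mm_shuffle_epi32 immediates and become function-like macros, so shufxor(x,s) can expand to an architecture-specific shuffle instead of hard-wiring the x86 intrinsic. The three permutations, element-wise:

// Effect of the three shuffles on a vector {a,b,c,d}:
//   SHUFXOR_1: {a,b,c,d} -> {b,a,d,c}   // 0xb1: swap words in each half
//   SHUFXOR_2: {a,b,c,d} -> {c,d,a,b}   // 0x4e: swap the 64-bit halves
//   SHUFXOR_3: {a,b,c,d} -> {d,c,b,a}   // 0x1b: full 32-bit reversal
// shufxor(x,2) now pastes to SHUFXOR_2(x) on every target, which the x86
// branch maps to _mm_shuffle_epi32(x,0x4e) and the NEON branch to
// v128_rev64(x).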
@@ -513,7 +504,7 @@ static void ROUNDS512( uint32_t *state, const uint8_t *msg, uint16_t *fft )
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
#define STEP_1_( a,b,c,d,w,fun,r,s,z,p ) \
do { \
TTl = Fl( a,b,c,fun ); \
TTh = Fh( a,b,c,fun ); \
@@ -525,10 +516,10 @@ do { \
TTh = v128_add32( TTh, w##h ); \
TTl = v128_rol32( TTl, s ); \
TTh = v128_rol32( TTh, s ); \
PERM( z,d,a, shufxor ); \
PERM( p, z,d,a, shufxor ); \
} while(0)
#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z )
#define STEP_1( a,b,c,d,w,fun,r,s,z,p ) STEP_1_( a,b,c,d,w,fun,r,s,z,p )
#define STEP_2_( a,b,c,d,w,fun,r,s ) \
do { \
@@ -538,10 +529,10 @@ do { \
#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s )
#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \
#define STEP( a,b,c,d,w1,w2,fun,r,s,z,p ) \
do { \
register v128u32_t TTl, TTh, Wl=w1, Wh=w2; \
STEP_1( a,b,c,d,W,fun,r,s,z ); \
STEP_1( a,b,c,d,W,fun,r,s,z,p ); \
STEP_2( a,b,c,d,W,fun,r,s ); \
} while(0);
@@ -558,63 +549,45 @@ do { \
w##h = v128_mul16( w##h, code[z].v128 ); \
} while(0)
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z,p ) \
do { \
register v128u32_t W0l, W1l, W2l, W3l, TTl; \
register v128u32_t W0h, W1h, W2h, W3h, TTh; \
MSG( W0, h0, l0, u0, z ); \
STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \
STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0, p ); \
MSG( W1, h1, l1, u1, z ); \
STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \
STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1, p ); \
MSG( W2,h2,l2,u2,z ); \
STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \
STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2, p ); \
MSG( W3,h3,l3,u3,z ); \
STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \
STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3, p ); \
STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
} while(0)
// 4 rounds with code 185
#define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0, 0);
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0, 4);
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0, 1);
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0, 5);
// 4 rounds with code 233
#define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1, 2);
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1, 6);
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1, 3);
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1, 0);
// 1 round as feed-forward
#define PERM_START 4
STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0 );
STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 );
STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 );
STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3 );
STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0, 4 );
STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1, 4 );
STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2, 4 );
STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3, 4 );
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef PERM_START
#undef STEP_1
#undef STEP_1_
#undef STEP_2
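The p argument threaded through STEP/ROUND replaces the old #define PERM_START n / #undef PERM_START bracketing around every ROUND call. The two-stage token paste in PERM resolves to the same permutation as before; an illustrative expansion:

// Expansion of PERM( 4, 2, d, a, shufxor ) under the new definition:
//   SUM7_ ## z          -> SUM7_2
//   XCAT( SUM7_2, 4 )   -> SUM7_24      (token paste)
//   SUM7_24             -> 6            (table lookup: (2+4) mod 7)
//   XCAT( PERM_, 6 )    -> PERM_6( d, a, shufxor )
// i.e. PERM(p,z,...) dispatches to PERM_{(z+p) mod 7}, exactly what the
// removed PERM_START machinery computed via #define/#undef pairs.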
@@ -732,6 +705,9 @@ int simd512( void *hashval, const void *data, int datalen )
#undef REDUCE_FULL_S
#undef DO_REDUCE_FULL_S
#undef c1_16
#undef SHUFXOR_1
#undef SHUFXOR_2
#undef SHUFXOR_3
#endif
@@ -820,118 +796,12 @@ static const m256_v16 FFT256_Twiddle[] =
-30, 55, -58, -65, -95, -40, -98, 94 }}
};
#if 0
// generic
#define SHUFXOR_1 0xb1 // 0b10110001
#define SHUFXOR_2 0x4e // 0b01001110
#define SHUFXOR_3 0x1b // 0b00011011
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
#define SHUFXOR_1(x) _mm256_shuffle_epi32(x,0xb1)
#define SHUFXOR_2(x) _mm256_shuffle_epi32(x,0x4e)
#define SHUFXOR_3(x) _mm256_shuffle_epi32(x,0x1b)
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a,shufxor) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a,shufxor)
#define PERM_0(d,a,shufxor) /* XOR 1 */ \
do { \
d##l = shufxor( a##l, 1 ); \
d##h = shufxor( a##h, 1 ); \
} while(0)
#define PERM_1(d,a,shufxor) /* XOR 6 */ \
do { \
d##l = shufxor( a##h, 2 ); \
d##h = shufxor( a##l, 2 ); \
} while(0)
#define PERM_2(d,a,shufxor) /* XOR 2 */ \
do { \
d##l = shufxor( a##l, 2 ); \
d##h = shufxor( a##h, 2 ); \
} while(0)
#define PERM_3(d,a,shufxor) /* XOR 3 */ \
do { \
d##l = shufxor( a##l, 3 ); \
d##h = shufxor( a##h, 3 ); \
} while(0)
#define PERM_4(d,a,shufxor) /* XOR 5 */ \
do { \
d##l = shufxor( a##h, 1 ); \
d##h = shufxor( a##l, 1 ); \
} while(0)
#define PERM_5(d,a,shufxor) /* XOR 7 */ \
do { \
d##l = shufxor( a##h, 3 ); \
d##h = shufxor( a##l, 3 ); \
} while(0)
#define PERM_6(d,a,shufxor) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#endif
#define shufxor2w(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
#define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x)
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
@@ -1262,7 +1132,7 @@ static void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
#define STEP_1_(a,b,c,d,w,fun,r,s,z,p ) \
do { \
TTl = Fl( a,b,c,fun ); \
TTh = Fh( a,b,c,fun ); \
@@ -1274,10 +1144,10 @@ do { \
TTh = _mm256_add_epi32( TTh, w##h ); \
TTl = mm256_rol_32( TTl, s ); \
TTh = mm256_rol_32( TTh, s ); \
PERM( z,d,a, shufxor2w ); \
PERM( p,z,d,a, shufxor2w ); \
} while(0)
#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z )
#define STEP_1( a,b,c,d,w,fun,r,s,z,p ) STEP_1_( a,b,c,d,w,fun,r,s,z,p )
#define STEP_2_( a,b,c,d,w,fun,r,s ) \
do { \
@@ -1287,10 +1157,10 @@ do { \
#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s )
#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \
#define STEP( a,b,c,d,w1,w2,fun,r,s,z, p ) \
do { \
register __m256i TTl, TTh, Wl=w1, Wh=w2; \
STEP_1( a,b,c,d,W,fun,r,s,z ); \
STEP_1( a,b,c,d,W,fun,r,s,z,p ); \
STEP_2( a,b,c,d,W,fun,r,s ); \
} while(0);
@@ -1307,63 +1177,45 @@ do { \
w##h = _mm256_mullo_epi16( w##h, code[z].v256 ); \
} while(0)
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z,p ) \
do { \
register __m256i W0l, W1l, W2l, W3l, TTl; \
register __m256i W0h, W1h, W2h, W3h, TTh; \
MSG( W0, h0, l0, u0, z ); \
STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \
STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0, p ); \
MSG( W1, h1, l1, u1, z ); \
STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \
STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1, p ); \
MSG( W2,h2,l2,u2,z ); \
STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \
STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2, p ); \
MSG( W3,h3,l3,u3,z ); \
STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \
STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3, p ); \
STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
} while(0)
// 4 rounds with code 185
#define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0, 0);
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0, 4);
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0, 1);
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0, 5);
// 4 rounds with code 233
#define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1, 2);
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1, 6);
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1, 3);
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1, 0);
// 1 round as feed-forward
#define PERM_START 4
STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0 );
STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 );
STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 );
STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3 );
STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0, 4 );
STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1, 4 );
STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2, 4 );
STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3, 4 );
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef PERM_START
#undef STEP_1
#undef STEP_1_
#undef STEP_2
@@ -1642,6 +1494,10 @@ int simd512_2way( void *hashval, const void *data, int datalen )
return 0;
}
#undef SHUFXOR_1
#undef SHUFXOR_2
#undef SHUFXOR_3
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -1792,7 +1648,11 @@ static const m512_v16 FFT256_Twiddle4w[] =
-30, 55, -58, -65, -95, -40, -98, 94 }}
};
#define shufxor4w(x,s) _mm512_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
#define SHUFXOR_1(x) _mm512_shuffle_epi32(x,0xb1)
#define SHUFXOR_2(x) _mm512_shuffle_epi32(x,0x4e)
#define SHUFXOR_3(x) _mm512_shuffle_epi32(x,0x1b)
#define shufxor4w(x,s) XCAT(SHUFXOR_,s)(x)
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
@@ -2114,7 +1974,7 @@ static void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
// targeted
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
#define STEP_1_( a,b,c,d,w,fun,r,s,z,p ) \
do { \
TTl = Fl( a,b,c,fun ); \
TTh = Fh( a,b,c,fun ); \
@@ -2126,10 +1986,10 @@ do { \
TTh = _mm512_add_epi32( TTh, w##h ); \
TTl = mm512_rol_32( TTl, s ); \
TTh = mm512_rol_32( TTh, s ); \
PERM( z,d,a, shufxor4w ); \
PERM( p,z,d,a, shufxor4w ); \
} while(0)
#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z )
#define STEP_1( a,b,c,d,w,fun,r,s,z,p ) STEP_1_( a,b,c,d,w,fun,r,s,z,p )
#define STEP_2_( a,b,c,d,w,fun,r,s ) \
do { \
@@ -2139,10 +1999,10 @@ do { \
#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s )
#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \
#define STEP( a,b,c,d,w1,w2,fun,r,s,z,p ) \
do { \
register __m512i TTl, TTh, Wl=w1, Wh=w2; \
STEP_1( a,b,c,d,W,fun,r,s,z ); \
STEP_1( a,b,c,d,W,fun,r,s,z,p ); \
STEP_2( a,b,c,d,W,fun,r,s ); \
} while(0);
@@ -2159,63 +2019,45 @@ do { \
w##h = _mm512_mullo_epi16( w##h, code[z].v512 ); \
} while(0)
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z,p ) \
do { \
register __m512i W0l, W1l, W2l, W3l, TTl; \
register __m512i W0h, W1h, W2h, W3h, TTh; \
MSG( W0, h0, l0, u0, z ); \
STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \
STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0,p ); \
MSG( W1, h1, l1, u1, z ); \
STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \
STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1,p ); \
MSG( W2,h2,l2,u2,z ); \
STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \
STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2,p ); \
MSG( W3,h3,l3,u3,z ); \
STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \
STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3,p ); \
STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
} while(0)
// 4 rounds with code 185
#define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0, 0);
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0, 4);
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0, 1);
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0, 5);
// 4 rounds with code 233
#define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1, 2);
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1, 6);
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1, 3);
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1, 0);
// 1 round as feed-forward
#define PERM_START 4
STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0 );
STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 );
STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 );
STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3 );
STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0, 4 );
STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1, 4 );
STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2, 4 );
STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3, 4 );
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef PERM_START
#undef STEP_1
#undef STEP_1_
#undef STEP_2

View File

@@ -159,4 +159,69 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SKEIN_2WAY)
static __thread skein512_2x64_context skein512_2x64_ctx
__attribute__ ((aligned (64)));
void skeinhash_2x64( void *state, const void *input )
{
uint64_t vhash64[8*2] __attribute__ ((aligned (32)));
uint32_t hash0[16] __attribute__ ((aligned (32)));
uint32_t hash1[16] __attribute__ ((aligned (32)));
skein512_2x64_context ctx_skein;
memcpy( &ctx_skein, &skein512_2x64_ctx, sizeof( ctx_skein ) );
skein512_2x64_final16( &ctx_skein, vhash64, input + (64*2) );
dintrlv_2x64( hash0, hash1, vhash64, 512 );
sha256_full( hash0, hash0, 64 );
sha256_full( hash1, hash1, 64 );
intrlv_2x32( state, hash0, hash1, 256 );
}
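skeinhash_2x64 works on two lanes interleaved 64 bits at a time: each v128 register holds word i of lane 0 in its low half and word i of lane 1 in its high half. A minimal scalar sketch of the de-interleave used above (hypothetical helper name; the real dintrlv_2x64 is vectorized):

#include <stdint.h>

// Sketch of 2x64 de-interleaving: word i of lane 0 lives at src[2*i],
// word i of lane 1 at src[2*i+1]. bit_len is the per-lane hash size.
static inline void dintrlv_2x64_sketch( void *d0, void *d1,
                                        const void *src, int bit_len )
{
   const uint64_t *s = (const uint64_t*)src;
   uint64_t *h0 = (uint64_t*)d0, *h1 = (uint64_t*)d1;
   for ( int i = 0; i < bit_len / 64; i++ )
   {
      h0[i] = s[2*i];      // lane 0
      h1[i] = s[2*i + 1];  // lane 1
   }
}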
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*2] __attribute__ ((aligned (32)));
uint32_t hash[8*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash_d7 = &(hash[7<<1]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128u32_t *noncev = (v128u32_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &skein512_2x64_ctx, vdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
skeinhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( ( hash_d7[ lane ] <= targ_d7 ) && !bench ) )
{
extr_lane_2x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
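The nonce handling above relies on that same layout: after v128_bswap32_intrlv80_2x64 the header word holding the nonce is vector word 9, with the nonce in the upper 32 bits of each 64-bit lane. A standalone illustration of the seed-and-increment arithmetic (SSE2 intrinsics used here only for demonstration):

#include <stdint.h>
#include <emmintrin.h>

int main(void)
{
   uint32_t n = 100;
   // _mm_set_epi32 takes arguments high-to-low, so the lanes become
   // { 0, n, 0, n+1 }: lane 0 seeded with n, lane 1 with n+1.
   __m128i v = _mm_set_epi32( n + 1, 0, n, 0 );
   // Adding 2 to the upper dword of each 64-bit lane advances both
   // nonces by 2 per iteration, matching n += 2 in the scan loop.
   v = _mm_add_epi32( v, _mm_set1_epi64x( 0x0000000200000000ULL ) );
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, v );
   return !( out[1] == 102 && out[3] == 103 );   // 0 on success
}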

View File

@@ -3,16 +3,20 @@
bool register_skein_algo( algo_gate_t* gate )
{
#if defined (SKEIN_8WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_skein_8way;
gate->hash = (void*)&skeinhash_8way;
#elif defined (SKEIN_4WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#elif defined(SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#elif defined(SKEIN_2WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_2x64;
gate->hash = (void*)&skeinhash_2x64;
#else
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif
@@ -21,16 +25,15 @@ bool register_skein_algo( algo_gate_t* gate )
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined (SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SKEIN_8WAY)
gate->scanhash = (void*)&scanhash_skein2_8way;
gate->hash = (void*)&skein2hash_8way;
#elif defined (SKEIN_4WAY)
#elif defined(SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
#elif defined(SKEIN_2WAY)
gate->scanhash = (void*)&scanhash_skein2_2x64;
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif
return true;
};

View File

@@ -7,6 +7,8 @@
#define SKEIN_8WAY 1
#elif defined(__AVX2__)
#define SKEIN_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SKEIN_2WAY 1
#endif
#if defined(SKEIN_8WAY)
@@ -29,6 +31,16 @@ void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#elif defined(SKEIN_2WAY)
void skeinhash_2x64( void *output, const void *input );
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void skein2hash_2x64( void *output, const void *input );
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#else
void skeinhash( void *output, const void *input );

View File

@@ -675,11 +675,13 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
// Close
unsigned et;
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
}
memset_zero_512( buf, buf_size >> 3 );
bcount = 0;
@@ -970,11 +972,13 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
// Close
unsigned et;
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
}
memset_zero_256( buf, buf_size >> 3 );
bcount = 0;
@@ -1364,11 +1368,13 @@ skein512_2x64_full( skein512_2x64_context *sc, void *out, const void *data,
// Close
unsigned et;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
if ( ptr )
{
unsigned et;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
}
v128_memset_zero( buf, buf_size >> 3 );
bcount = 0;
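All three widths (8-way, 4-way, 2x64) gain the same guard: the zero-padding and final UBI pass now run only when residual message bytes are actually buffered. The shared pattern, restated (names abstracted from the three hunks above):

// Common shape of the fix across skein512_{8way,4way,2x64}_full:
if ( ptr )                               // residual bytes buffered?
{
   unsigned et;
   memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );  // zero-pad tail
   et = 352 + ((bcount == 0) << 7);      // tweak flags for the last block
   UBI_BIG( et, ptr );                   // compress only the real tail
}
// With an empty buffer the padding pass is skipped instead of
// compressing an all-zero block as the old code did.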

View File

@@ -5,19 +5,6 @@
#if defined(SKEIN_8WAY)
static __thread skein512_8way_context skein512_8way_ctx
__attribute__ ((aligned (64)));
void skein2hash_8way( void *output, const void *input )
{
uint64_t hash[16*8] __attribute__ ((aligned (128)));
skein512_8way_context ctx;
memcpy( &ctx, &skein512_8way_ctx, sizeof( ctx ) );
skein512_8way_final16( &ctx, hash, input + (64*8) );
skein512_8way_full( &ctx, output, hash, 64 );
}
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -68,19 +55,6 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
#elif defined(SKEIN_4WAY)
static __thread skein512_4way_context skein512_4way_ctx
__attribute__ ((aligned (64)));
void skein2hash_4way( void *output, const void *input )
{
skein512_4way_context ctx;
memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) );
uint64_t hash[16*4] __attribute__ ((aligned (64)));
skein512_4way_final16( &ctx, hash, input + (64*4) );
skein512_4way_full( &ctx, output, hash, 64 );
}
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -128,4 +102,53 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SKEIN_2WAY)
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
skein512_2x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u64_t two = v128_64( 0x0000000200000000 );
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &ctx, vdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
skein512_2x64_final16( &ctx, hash, vdata + (16*2) );
skein512_2x64_full( &ctx, hash, hash, 64 );
for ( int lane = 0; lane < 2; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, two );
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -191,7 +191,7 @@ static void rotate_indexes( uint32_t *p )
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
#define MULXOR \
casti_v128( hash, 0 ) = v128_mul32( v128_xor( \
@@ -251,7 +251,7 @@ void verthash_hash( const void *blob_bytes, const size_t blob_size,
/ VH_BYTE_ALIGNMENT ) + 1;
#if defined (__AVX2__)
const __m256i k = _mm256_set1_epi32( 0x1000193 );
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
const v128u32_t k = v128_32( 0x1000193 );
#endif
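Enabling the SSE4.1 path for NEON works because MULXOR is plain lane-wise integer math. Its scalar equivalent is an FNV-1a-style mix — 0x1000193 is the 32-bit FNV prime — over the 8-word working hash:

#include <stdint.h>

// Scalar equivalent of MULXOR (sketch): xor the working hash with a
// 32-byte window of the blob, then multiply by the 32-bit FNV prime.
static inline void mulxor_scalar( uint32_t hash[8],
                                  const uint32_t blob_off[8] )
{
   for ( int i = 0; i < 8; i++ )
      hash[i] = ( hash[i] ^ blob_off[i] ) * 0x1000193u;
}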

View File

@@ -129,7 +129,7 @@ bool register_verthash_algo( algo_gate_t* gate )
{
opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT | NEON_OPT;
const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file;

View File

@@ -23,13 +23,12 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
*sptr = '\0';
}
static __thread x16r_context_overlay hex_ctx;
int hex_hash( void* output, const void* input, int thrid )
{
uint32_t _ALIGN(128) hash[16];
x16r_context_overlay ctx;
memcpy( &ctx, &hex_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
void *in = (void*) input;
int size = 80;
@@ -52,7 +51,7 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
groestl512_full( &ctx.groestl, hash, in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
@@ -87,7 +86,7 @@ int hex_hash( void* output, const void* input, int thrid )
case LUFFA:
if ( i == 0 )
{
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
}
else
{
@@ -97,7 +96,7 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case CUBEHASH:
if ( i == 0 )
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
else
{
cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -108,26 +107,15 @@ int hex_hash( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
simd512_ctx( &ctx.simd, hash, in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,
(const BitSequence *)in, size );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash, 512, in, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
@@ -216,32 +204,32 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
switch ( algo )
{
case JH:
sph_jh512_init( &hex_ctx.jh );
sph_jh512( &hex_ctx.jh, edata, 64 );
sph_jh512_init( &x16r_ref_ctx.jh );
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
break;
case SKEIN:
sph_skein512_init( &hex_ctx.skein );
sph_skein512( &hex_ctx.skein, edata, 64 );
sph_skein512_init( &x16r_ref_ctx.skein );
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
break;
case LUFFA:
init_luffa( &hex_ctx.luffa, 512 );
update_luffa( &hex_ctx.luffa, edata, 64 );
init_luffa( &x16r_ref_ctx.luffa, 512 );
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
break;
case CUBEHASH:
cubehashInit( &hex_ctx.cube, 512, 16, 32 );
cubehashUpdate( &hex_ctx.cube, edata, 64 );
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
break;
case HAMSI:
sph_hamsi512_init( &hex_ctx.hamsi );
sph_hamsi512( &hex_ctx.hamsi, edata, 64 );
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 64 );
break;
case SHABAL:
sph_shabal512_init( &hex_ctx.shabal );
sph_shabal512( &hex_ctx.shabal, edata, 64 );
sph_shabal512_init( &x16r_ref_ctx.shabal );
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &hex_ctx.whirlpool );
sph_whirlpool( &hex_ctx.whirlpool, edata, 64 );
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
break;
}

View File

@@ -11,29 +11,32 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#endif
#include "algo/simd/simd-hash-2way.h"
//#if defined(__aarch64__)
// #include "algo/simd/sph_simd.h"
//#endif
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/yespower/yespower.h"
//#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
//#else
#else
#include "algo/echo/sph_echo.h"
#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
//#endif
#endif
#if defined(__AES__)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/simd/nist.h"
// Config
#define MINOTAUR_ALGO_COUNT 16
@@ -48,12 +51,15 @@ typedef struct TortureGarden TortureGarden;
struct TortureGarden
{
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_echo echo;
hashState_groestl groestl;
#else
sph_echo512_context echo;
sph_groestl512_context groestl;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__AES__)
hashState_fugue fugue;
#else
@@ -67,11 +73,7 @@ struct TortureGarden
cubehashParam cube;
shavite512_context shavite;
hashState_luffa luffa;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -93,9 +95,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
switch ( algo )
{
case 0:
blake512_init( &garden->blake );
blake512_update( &garden->blake, input, 64 );
blake512_close( &garden->blake, hash );
blake512_full( &garden->blake, hash, input, 64 );
break;
case 1:
sph_bmw512_init( &garden->bmw );
@@ -107,7 +107,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
cubehashUpdateDigest( &garden->cube, hash, input, 64 );
break;
case 3:
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &garden->echo, hash, 512, input, 64 );
#else
sph_echo512_init( &garden->echo );
@@ -165,13 +165,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
sph_shavite512_close( &garden->shavite, hash );
break;
case 13:
#if defined(__aarch64__)
sph_simd512_init( &garden->simd );
sph_simd512( &garden->simd, input, 64);
sph_simd512_close( &garden->simd, hash );
#else
simd_full( &garden->simd, (BitSequence *)hash, input, 512 );
#endif
simd512_ctx( &garden->simd, hash, input, 64 );
break;
case 14:
sph_skein512_init( &garden->skein );
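Both the hex and Minotaur hunks collapse the old split between sph_simd512 (AArch64) and the nist.h init_sd/update_final_sd path (x86) into a single simd512_ctx call. A hedged usage sketch; note the length unit is an assumption: the Minotaur and X16R call sites pass a byte count, while the hex hunk passes bits via size<<3, and which one the wrapper expects isn't visible in this diff.

// One-shot SIMD-512 usage (sketch, following the Minotaur/X16R call
// sites which pass a byte count):
static void simd512_oneshot( uint8_t digest[64], const void *input )
{
   simd512_context c;                     // per-call state, no init step
   simd512_ctx( &c, digest, input, 64 );  // absorb 64 bytes, emit digest
}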

View File

@@ -971,4 +971,405 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_2WAY)
void x16r_2x64_prehash( void *vdata, void *pdata )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
v128_bswap32_intrlv80_2x64( vdata, pdata );
jh512_2x64_init( &x16r_ctx.jh );
jh512_2x64_update( &x16r_ctx.jh, vdata, 64 );
break;
case KECCAK:
v128_bswap32_intrlv80_2x64( vdata, pdata );
keccak512_2x64_init( &x16r_ctx.keccak );
keccak512_2x64_update( &x16r_ctx.keccak, vdata, 72 );
break;
case SKEIN:
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &x16r_ctx.skein, vdata );
break;
case LUFFA:
{
v128_bswap32_80( edata, pdata );
init_luffa( &x16r_ctx.luffa, 512 );
update_luffa( &x16r_ctx.luffa, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
}
break;
case CUBEHASH:
{
v128_bswap32_80( edata, pdata );
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ctx.cube, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
}
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
v128_bswap32_intrlv80_2x64( vdata, pdata );
hamsi512_2x64_init( &x16r_ctx.hamsi );
hamsi512_2x64_update( &x16r_ctx.hamsi, vdata, 72 );
#else
v128_bswap32_80( edata, pdata );
sph_hamsi512_init( &x16r_ctx.hamsi );
sph_hamsi512( &x16r_ctx.hamsi, edata, 72 );
intrlv_2x64( vdata, edata, edata, 640 );
#endif
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
#if defined(__AES__)
fugue512_init( &x16r_ctx.fugue );
fugue512_update( &x16r_ctx.fugue, edata, 76 );
#else
sph_fugue512_init( &x16r_ctx.fugue );
sph_fugue512( &x16r_ctx.fugue, edata, 76 );
#endif
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_80( edata, pdata );
sph_shabal512_init( &x16r_ctx.shabal );
sph_shabal512( &x16r_ctx.shabal, edata, 64);
intrlv_2x64( vdata, edata, edata, 640 );
break;
case WHIRLPOOL:
v128_bswap32_80( edata, pdata );
sph_whirlpool_init( &x16r_ctx.whirlpool );
sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
default:
v128_bswap32_intrlv80_2x64( vdata, pdata );
}
}
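x16r_2x64_prehash exploits the fact that only the last 4 bytes of the 80-byte header (the nonce) change between scan iterations: the first algorithm in the hash order absorbs the constant prefix once per work unit, and the per-nonce loop clones that midstate and finishes only the tail. The split per algorithm, as coded above:

// prefix absorbed once   tail hashed per nonce   algorithms
//   64 bytes               16 bytes              jh, skein, luffa,
//                                                cubehash, shabal,
//                                                whirlpool
//   72 bytes                8 bytes              keccak, hamsi
//   76 bytes                4 bytes              fugue
// e.g. in x16r_2x64_hash_generic, for i == 0 only:
//   jh512_2x64_update( &ctx.jh, input + (64*2), 16 );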
int x16r_2x64_hash_generic( void* output, const void* input, int thrid )
{
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
x16r_2x64_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
int size = 80;
dintrlv_2x64( hash0, hash1, input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = x16r_hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
if ( i == 0 )
blake512_2x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case BMW:
bmw512_2x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_2x64_update( &ctx.bmw, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
bmw512_2x64_update( &ctx.bmw, vhash, size );
}
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case GROESTL:
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in0, size );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in1, size );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
break;
case JH:
if ( i == 0 )
jh512_2x64_update( &ctx.jh, input + (64*2), 16 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
jh512_2x64_init( &ctx.jh );
jh512_2x64_update( &ctx.jh, vhash, size );
}
jh512_2x64_close( &ctx.jh, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case KECCAK:
if ( i == 0 )
keccak512_2x64_update( &ctx.keccak, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
keccak512_2x64_init( &ctx.keccak );
keccak512_2x64_update( &ctx.keccak, vhash, size );
}
keccak512_2x64_close( &ctx.keccak, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case LUFFA:
if ( i == 0 )
{
update_and_final_luffa( &ctx.luffa, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
update_and_final_luffa( &ctx.luffa, hash1, in1 + 64, 16 );
}
else
{
luffa_full( &ctx.luffa, hash0, 512, hash0, size );
luffa_full( &ctx.luffa, hash1, 512, hash1, size );
}
break;
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
}
else
{
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
}
break;
case SHAVITE:
shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size );
break;
case SIMD:
simd512_ctx( &ctx.simd, hash0, in0, size );
simd512_ctx( &ctx.simd, hash1, in1, size );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, hash0, 512, in0, size );
echo_full( &ctx.echo, hash1, 512, in1, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in0, size );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in1, size );
sph_echo512_close( &ctx.echo, hash1 );
#endif
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
if ( i == 0 )
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, hash0, hash1, size<<3 );
hamsi512_2x64_init( &ctx.hamsi );
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_2x64_close( &ctx.hamsi, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
if ( i == 0 )
{
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
else
{
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash0, size );
sph_hamsi512_close( &ctx.hamsi, hash0 );
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash1, size );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
#endif
break;
case FUGUE:
#if defined(__AES__)
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#else
if ( i == 0 )
{
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash1 );
}
else
{
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#endif
break;
case SHABAL:
if ( i == 0 )
{
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash1 );
}
else
{
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash0, size );
sph_shabal512_close( &ctx.shabal, hash0 );
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash1, size );
sph_shabal512_close( &ctx.shabal, hash1 );
}
break;
case WHIRLPOOL:
if ( i == 0 )
{
sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
}
else
{
sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, size );
sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, size );
}
break;
case SHA_512:
sha512_2x64_init( &ctx.sha512 );
if ( i == 0 )
sha512_2x64_update( &ctx.sha512, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
sha512_2x64_init( &ctx.sha512 );
sha512_2x64_update( &ctx.sha512, vhash, size );
}
sha512_2x64_close( &ctx.sha512, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
}
if ( work_restart[thrid].restart ) return 0;
size = 64;
}
memcpy( output, hash0, 64 );
memcpy( output+64, hash1, 64 );
return 1;
}
int x16r_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*2] __attribute__ ((aligned (64)));
if ( !x16r_2x64_hash_generic( hash, input, thrid ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
return 1;
}
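The generic routine leaves each lane's full 64-byte digest at offsets 0 and 64; x16r_2x64_hash then keeps only the leading 32 bytes of each, since proof-of-work validation compares a 256-bit result against the target:

// Output layout (sketch of the two memcpy calls above):
//   hash[  0.. 63]  lane 0, full 512-bit digest
//   hash[ 64..127]  lane 1, full 512-bit digest
//   output[ 0..31]  lane 0, truncated 256-bit PoW hash
//   output[32..63]  lane 1, truncated 256-bit PoW hash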
int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128_t *noncev = (v128_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}
x16r_2x64_prehash( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -13,10 +13,13 @@ __thread x16r_8way_context_overlay x16r_ctx;
__thread x16r_4way_context_overlay x16r_ctx;
#elif defined (X16R_2WAY)
__thread x16r_2x64_context_overlay x16r_ctx;
#endif
__thread x16r_context_overlay x16_ctx;
__thread x16r_context_overlay x16r_ref_ctx;
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
{
@@ -58,11 +61,15 @@ bool register_x16r_algo( algo_gate_t* gate )
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined (X16R_2WAY)
gate->scanhash = (void*)&scanhash_x16r_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -76,11 +83,15 @@ bool register_x16rv2_algo( algo_gate_t* gate )
#elif defined (X16RV2_4WAY)
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->hash = (void*)&x16rv2_4way_hash;
#elif defined (X16RV2_2WAY)
gate->scanhash = (void*)&scanhash_x16rv2_2x64;
gate->hash = (void*)&x16rv2_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rv2;
gate->hash = (void*)&x16rv2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -94,11 +105,15 @@ bool register_x16s_algo( algo_gate_t* gate )
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined (X16R_2WAY)
gate->scanhash = (void*)&scanhash_x16r_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -108,7 +123,6 @@ bool register_x16s_algo( algo_gate_t* gate )
//
// X16RT
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
{
int32_t maskedTime = timeStamp & 0xffffff80;
@@ -221,34 +235,42 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined (X16RT_2WAY)
gate->scanhash = (void*)&scanhash_x16rt_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 256.0;
return true;
};
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined (X16RT_2WAY)
gate->scanhash = (void*)&scanhash_x16rt_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
gate->build_extraheader = (void*)&veil_build_extraheader;
opt_target_factor = 256.0;
return true;
@@ -262,7 +284,7 @@ bool register_hex_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_hex;
gate->hash = (void*)&x16r_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 128.0;
return true;
@@ -274,20 +296,25 @@ bool register_hex_algo( algo_gate_t* gate )
bool register_x21s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined (X21S_8WAY)
gate->scanhash = (void*)&scanhash_x21s_8way;
gate->hash = (void*)&x21s_8way_hash;
gate->miner_thread_init = (void*)&x21s_8way_thread_init;
#elif defined (X16R_4WAY)
#elif defined (X21S_4WAY)
gate->scanhash = (void*)&scanhash_x21s_4way;
gate->hash = (void*)&x21s_4way_hash;
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
#elif defined (X21S_2WAY)
gate->scanhash = (void*)&scanhash_x21s_2x64;
gate->hash = (void*)&x21s_2x64_hash;
gate->miner_thread_init = (void*)&x21s_2x64_thread_init;
#else
gate->scanhash = (void*)&scanhash_x21s;
gate->hash = (void*)&x21s_hash;
gate->miner_thread_init = (void*)&x21s_thread_init;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;

View File

@@ -7,13 +7,15 @@
#include <unistd.h>
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
@@ -21,13 +23,13 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha512-hash.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/fugue/fugue-aesni.h"
#endif
#if defined (__AVX2__)
//#if defined (__AVX2__)
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
@@ -39,7 +41,7 @@
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#endif
//#endif
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
@@ -48,28 +50,41 @@
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
// X16R, X16S
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16R_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#define X16RV2_8WAY 1
#define X16RT_8WAY 1
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RV2_4WAY 1
#define X16RT_4WAY 1
#define X21S_4WAY 1
#define X16R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16RV2_2WAY 1
#endif
// X16RT, VEIL
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16RT_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RT_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16RT_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X21S_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X21S_2WAY 1
#endif
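Splitting the per-family 2-way defines keeps each variant independently selectable. The resulting dispatch, common to X16R/X16S, X16RV2, X16RT/VEIL, and X21S:

// Compile-time width selection (same ladder for every family):
//   AVX-512 (F, VL, DQ, BW)   -> *_8WAY
//   AVX2 + AES-NI             -> *_4WAY
//   SSE2 or NEON              -> *_2WAY   (new in this release)
//   otherwise                 -> scalar reference path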
enum x16r_Algo {
BLAKE = 0,
BMW,
@@ -167,7 +182,6 @@ union _x16r_4way_context_overlay
keccak512_4way_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
hashState_luffa luffa1;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
@@ -187,34 +201,84 @@ int scanhash_x16r_4way( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_4way_context_overlay x16r_ctx;
#elif defined(X16R_2WAY)
union _x16r_2x64_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
skein512_2x64_context skein;
jh512_2x64_context jh;
keccak512_2x64_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sha512_2x64_context sha512;
} __attribute__ ((aligned (64)));
typedef union _x16r_2x64_context_overlay x16r_2x64_context_overlay;
void x16r_2x64_prehash( void *, void * );
int x16r_2x64_hash_generic( void *, const void *, int );
int x16r_2x64_hash( void *, const void *, int );
int scanhash_x16r_2x64( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_2x64_context_overlay x16r_ctx;
#endif
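The context overlay is a union rather than a struct because the sixteen algorithms run strictly one after another, so only one member is ever live; its size is that of the largest context, not the sum, which keeps the per-thread copy from the prehashed template cheap. Usage as in x16r_2x64_hash_generic above:

x16r_2x64_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );   // clone the per-scan midstate
// ... run the 16-step chain; each case re-initializes its own member.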
// need a reference, add hooks for SSE2.
// needed for hex
union _x16r_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
hashState_sd simd;
sph_echo512_context echo;
#endif
sph_hamsi512_context hamsi;
#if defined(__AES__)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;
@@ -222,7 +286,7 @@ union _x16r_context_overlay
typedef union _x16r_context_overlay x16r_context_overlay;
extern __thread x16r_context_overlay x16_ctx;
extern __thread x16r_context_overlay x16r_ref_ctx;
void x16r_prehash( void *, void * );
int x16r_hash_generic( void *, const void *, int );
@@ -242,6 +306,12 @@ int x16rv2_4way_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RV2_2WAY)
int x16rv2_2x64_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
int x16rv2_hash( void *state, const void *input, int thr_id );
@@ -251,18 +321,24 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
#endif
// x16rt, veil
#if defined(X16R_8WAY)
#if defined(X16RT_8WAY)
//void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16R_4WAY)
#elif defined(X16RT_4WAY)
//void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RT_2WAY)
//void x16rt_2x64_hash( void *state, const void *input );
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
//void x16rt_hash( void *state, const void *input );
@@ -272,20 +348,27 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
#endif
// x21s
#if defined(X16R_8WAY)
#if defined(X21S_8WAY)
int x21s_8way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_8way_thread_init();
#elif defined(X16R_4WAY)
#elif defined(X21S_4WAY)
int x21s_4way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_4way_thread_init();
#elif defined(X21S_2WAY)
int x21s_2x64_hash( void *state, const void *input, int thrid );
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_2x64_thread_init();
#else
int x21s_hash( void *state, const void *input, int thr_id );

View File

@@ -18,32 +18,36 @@ void x16r_prehash( void *edata, void *pdata )
switch ( algo )
{
case JH:
sph_jh512_init( &x16_ctx.jh );
sph_jh512( &x16_ctx.jh, edata, 64 );
sph_jh512_init( &x16r_ref_ctx.jh );
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
break;
case SKEIN:
sph_skein512_init( &x16_ctx.skein );
sph_skein512( &x16_ctx.skein, edata, 64 );
sph_skein512_init( &x16r_ref_ctx.skein );
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
break;
case KECCAK:
sph_keccak512_init( &x16r_ref_ctx.keccak );
sph_keccak512( &x16r_ref_ctx.keccak, edata, 72 );
break;
case LUFFA:
init_luffa( &x16_ctx.luffa, 512 );
update_luffa( &x16_ctx.luffa, edata, 64 );
init_luffa( &x16r_ref_ctx.luffa, 512 );
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
break;
case CUBEHASH:
cubehashInit( &x16_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16_ctx.cube, edata, 64 );
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
break;
case HAMSI:
sph_hamsi512_init( &x16_ctx.hamsi );
sph_hamsi512( &x16_ctx.hamsi, edata, 64 );
break;
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 72 );
break;
case SHABAL:
sph_shabal512_init( &x16_ctx.shabal );
sph_shabal512( &x16_ctx.shabal, edata, 64 );
sph_shabal512_init( &x16r_ref_ctx.shabal );
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &x16_ctx.whirlpool );
sph_whirlpool( &x16_ctx.whirlpool, edata, 64 );
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
break;
}
}
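The sizes used above follow from the 80-byte block header, whose nonce occupies bytes 76..79: functions with a 64-byte block (JH, Skein, Luffa, CubeHash, Shabal, Whirlpool) absorb the first 64 bytes once, leaving a 16-byte tail per nonce, while Keccak-512 and Hamsi as driven here absorb 72 bytes, leaving an 8-byte tail. The matching per-nonce path in x16r_hash_generic() then only finishes the saved midstate, e.g. for Keccak:

// i == 0: the prehashed function is first in the hash order, so only the
// nonce-bearing tail of the header remains to be absorbed.
sph_keccak512( &ctx.keccak, in + 72, 8 );
sph_keccak512_close( &ctx.keccak, hash );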
@@ -52,7 +56,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
{
uint32_t _ALIGN(128) hash[16];
x16r_context_overlay ctx;
memcpy( &ctx, &x16_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
void *in = (void*) input;
int size = 80;
@@ -70,36 +74,41 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case BMW:
sph_bmw512_init( &ctx.bmw );
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
sph_bmw512( &ctx.bmw, in, size );
sph_bmw512_close( &ctx.bmw, hash );
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash, in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
sph_groestl512_close( &ctx.groestl, hash );
#endif
break;
case JH:
if ( i == 0 )
sph_jh512(&ctx.jh, in+64, 16 );
sph_jh512( &ctx.jh, in+64, 16 );
else
{
sph_jh512_init( &ctx.jh );
sph_jh512(&ctx.jh, in, size );
sph_jh512( &ctx.jh, in, size );
}
sph_jh512_close(&ctx.jh, hash );
sph_jh512_close( &ctx.jh, hash );
break;
case KECCAK:
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
if ( i == 0 )
sph_keccak512( &ctx.keccak, in+72, 8 );
else
{
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
}
sph_keccak512_close( &ctx.keccak, hash );
break;
case SKEIN:
if ( i == 0 )
sph_skein512(&ctx.skein, in+64, 16 );
sph_skein512( &ctx.skein, in+64, 16 );
else
{
sph_skein512_init( &ctx.skein );
@@ -109,13 +118,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case LUFFA:
if ( i == 0 )
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
else
luffa_full( &ctx.luffa, hash, 512, in, size );
break;
case CUBEHASH:
if ( i == 0 )
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
else
cubehash_full( &ctx.cube, hash, 512, in, size );
break;
@@ -123,19 +132,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
sph_simd512( &ctx.simd, hash, size );
sph_simd512_close( &ctx.simd, hash );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence*)hash, 512,
(const BitSequence*)in, size );
echo_full( &ctx.echo, hash, 512, in, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
@@ -144,7 +147,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
sph_hamsi512( &ctx.hamsi, in+64, 16 );
sph_hamsi512( &ctx.hamsi, in+72, 8 );
else
{
sph_hamsi512_init( &ctx.hamsi );
@@ -153,12 +156,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
#if defined(__AES__)
fugue512_full( &ctx.fugue, hash, in, size );
#else
sph_fugue512_full( &ctx.fugue, hash, in, size );
#endif
break;
sph_fugue512_full( &ctx.fugue, hash, in, size );
break;
case SHABAL:
if ( i == 0 )
sph_shabal512( &ctx.shabal, in+64, 16 );

View File

@@ -3,7 +3,7 @@
#include <stdlib.h>
#include <string.h>
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
@@ -57,7 +57,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
@@ -110,4 +110,55 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RT_2WAY)
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[2*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) timeHash[4*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}
x16r_2x64_prehash( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( ( n < last_nonce ) && !(*restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
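A note on the nonce bookkeeping shared by these 2-way scan loops: vdata is 2x64 interleaved, so (v128_t*)vdata + 9 is the 128-bit row holding 64-bit header word 9, i.e. 32-bit words 18 (nbits) and 19 (nonce), for both lanes. The blend writes n into lane 0's nonce and n+1 into lane 1's while preserving nbits, and adding v128_64( 0x0000000200000000 ) bumps the high 32 bits of each 64-bit element, advancing both nonces in step with n += 2:

// Scalar equivalent of the vector nonce bookkeeping:
// setup:     lane0.nonce = n;  lane1.nonce = n + 1;
// each pass: lane0.nonce += 2; lane1.nonce += 2; n += 2;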

View File

@@ -1,6 +1,6 @@
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X16RT_8WAY) && !defined(X16RT_4WAY)
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )

View File

@@ -395,7 +395,7 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -409,14 +409,43 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
hash7, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
fugue512_full( &ctx.fugue, hash4, in4, size );
fugue512_full( &ctx.fugue, hash5, in5, size );
fugue512_full( &ctx.fugue, hash6, in6, size );
fugue512_full( &ctx.fugue, hash7, in7, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in4 + 76, 4 );
fugue512_final( &ctx.fugue, hash4 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in5 + 76, 4 );
fugue512_final( &ctx.fugue, hash5 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in6 + 76, 4 );
fugue512_final( &ctx.fugue, hash6 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in7 + 76, 4 );
fugue512_final( &ctx.fugue, hash7 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
fugue512_full( &ctx.fugue, hash2, hash2, size );
fugue512_full( &ctx.fugue, hash3, hash3, size );
fugue512_full( &ctx.fugue, hash4, hash4, size );
fugue512_full( &ctx.fugue, hash5, hash5, size );
fugue512_full( &ctx.fugue, hash6, hash6, size );
fugue512_full( &ctx.fugue, hash7, hash7, size );
}
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -588,7 +617,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
@@ -626,7 +655,14 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
@@ -824,8 +860,8 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, size );
skein512_4way_close( &ctx.skein, vhash );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case LUFFA:
@@ -945,7 +981,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -956,10 +992,27 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
fugue512_full( &ctx.fugue, hash2, hash2, size );
fugue512_full( &ctx.fugue, hash3, hash3, size );
}
break;
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
@@ -1077,7 +1130,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
@@ -1101,7 +1154,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
@@ -1112,7 +1165,13 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_intrlv80_4x32( vdata32, pdata );
@@ -1151,4 +1210,453 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RV2_2WAY)
union _x16rv2_2x64_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
skein512_2x64_context skein;
jh512_2x64_context jh;
keccak512_2x64_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sha512_2x64_context sha512;
sph_tiger_context tiger;
} __attribute__ ((aligned (64)));
typedef union _x16rv2_2x64_context_overlay x16rv2_2x64_context_overlay;
static __thread x16rv2_2x64_context_overlay x16rv2_ctx;
// Pad the 24-byte Tiger hash out to 64 bytes.
static inline void padtiger512( uint32_t* hash )
{
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
}
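Tiger is what distinguishes x16rv2 from x16r: the Keccak, Luffa, and SHA-512 stages are each preceded by a Tiger pass whose 24-byte digest is zero-padded to a full 64-byte block before being fed onward, as the cases below show. Condensed to a single lane, one of those stages looks like this (sketch only):

// One-lane sketch of an x16rv2 KECCAK / LUFFA / SHA_512 stage:
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in, size );
sph_tiger_close( &ctx.tiger, hash );   // 24-byte digest in hash[0..5]
padtiger512( hash );                   // zero words 6..15 -> 64-byte block
// ...then hash the padded 64 bytes with keccak512 / luffa / sha512.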
int x16rv2_2x64_hash( void* output, const void* input, int thrid )
{
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
x16rv2_2x64_context_overlay ctx;
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
int size = 80;
dintrlv_2x64( hash0, hash1, input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = x16r_hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
if ( i == 0 )
blake512_2x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case BMW:
bmw512_2x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_2x64_update( &ctx.bmw, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
bmw512_2x64_update( &ctx.bmw, vhash, size );
}
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in0, size );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in1, size );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
break;
case JH:
if ( i == 0 )
jh512_2x64_update( &ctx.jh, input + (64<<1), 16 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
jh512_2x64_init( &ctx.jh );
jh512_2x64_update( &ctx.jh, vhash, size );
}
jh512_2x64_close( &ctx.jh, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case KECCAK:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
intrlv_2x64( vhash, hash0, hash1, 512 );
keccak512_2x64_init( &ctx.keccak );
keccak512_2x64_update( &ctx.keccak, vhash, 64 );
keccak512_2x64_close( &ctx.keccak, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case LUFFA:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
break;
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
}
else
{
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
}
break;
case SHAVITE:
shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size );
break;
case SIMD:
simd512_ctx( &ctx.simd, hash0, in0, size );
simd512_ctx( &ctx.simd, hash1, in1, size );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, hash0, 512, in0, size );
echo_full( &ctx.echo, hash1, 512, in1, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in0, size );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in1, size );
sph_echo512_close( &ctx.echo, hash1 );
#endif
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
if ( i == 0 )
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, hash0, hash1, size<<3 );
hamsi512_2x64_init( &ctx.hamsi );
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_2x64_close( &ctx.hamsi, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
if ( i == 0 )
{
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
else
{
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash0, size );
sph_hamsi512_close( &ctx.hamsi, hash0 );
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash1, size );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
#endif
break;
case FUGUE:
#if defined(__AES__)
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#else
if ( i == 0 )
{
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash1 );
}
else
{
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#endif
break;
case SHABAL:
if ( i == 0 )
{
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash1 );
}
else
{
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash0, size );
sph_shabal512_close( &ctx.shabal, hash0 );
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash1, size );
sph_shabal512_close( &ctx.shabal, hash1 );
}
break;
case WHIRLPOOL:
sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size );
sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size );
break;
case SHA_512:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
intrlv_2x64( vhash, hash0, hash1, 512 );
sha512_2x64_init( &ctx.sha512 );
sha512_2x64_update( &ctx.sha512, vhash, 64 );
sha512_2x64_close( &ctx.sha512, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
}
if ( work_restart[thrid].restart ) return 0;
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
return 1;
}
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[2*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t edata[20];
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0fff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
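// The 80-byte header is constant for the current job except the nonce in
// bytes 76..79, so everything up to a block boundary at or below byte 76
// (64 bytes for most functions, 72 for Hamsi, 76 for Fugue) can be
// compressed once here; the scan loop then reuses the saved midstate and
// absorbs only the short nonce-bearing tail.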
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
v128_bswap32_intrlv80_2x64( vdata, pdata );
jh512_2x64_init( &x16rv2_ctx.jh );
jh512_2x64_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
case SHA_512:
v128_bswap32_80( edata, pdata );
sph_tiger_init( &x16rv2_ctx.tiger );
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SKEIN:
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16rv2_ctx.cube, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
v128_bswap32_intrlv80_2x64( vdata, pdata );
hamsi512_2x64_init( &x16rv2_ctx.hamsi );
hamsi512_2x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
#else
v128_bswap32_80( edata, pdata );
sph_hamsi512_init( &x16rv2_ctx.hamsi );
sph_hamsi512( &x16rv2_ctx.hamsi, edata, 72 );
intrlv_2x64( vdata, edata, edata, 640 );
#endif
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
#if defined(__AES__)
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
#else
sph_fugue512_init( &x16rv2_ctx.fugue );
sph_fugue512( &x16rv2_ctx.fugue, edata, 76 );
#endif
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_80( edata, pdata );
sph_shabal512_init( &x16rv2_ctx.shabal );
sph_shabal512( &x16rv2_ctx.shabal, edata, 64);
intrlv_2x64( vdata, edata, edata, 640 );
break;
default:
v128_bswap32_intrlv80_2x64( vdata, pdata );
}
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16rv2_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -6,21 +6,15 @@
*/
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X16RV2_8WAY) && !defined(X16RV2_4WAY) && !defined(X16RV2_2WAY)
#include "algo/tiger/sph_tiger.h"
union _x16rv2_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
@@ -29,11 +23,7 @@ union _x16rv2_context_overlay
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -72,15 +62,9 @@ int x16rv2_hash( void* output, const void* input, int thrid )
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init( &ctx.skein );
@@ -117,25 +101,14 @@ int x16rv2_hash( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512(&ctx.simd, hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
sph_hamsi512_init( &ctx.hamsi );
@@ -143,11 +116,7 @@ int x16rv2_hash( void* output, const void* input, int thrid )
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
#if defined(__AES__)
fugue512_full( &ctx.fugue, hash, in, size );
#else
sph_fugue512_full( &ctx.fugue, hash, in, size );
#endif
break;
case SHABAL:
sph_shabal512_init( &ctx.shabal );

View File

@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include "algo/haval/haval-hash-4way.h"
#include "algo/haval/sph-haval.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
@@ -351,4 +352,119 @@ bool x21s_4way_thread_init()
return x21s_4way_matrix;
}
#elif defined (X21S_2WAY)
static __thread uint64_t* x21s_2x64_matrix;
union _x21s_2x64_context_overlay
{
sph_haval256_5_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
} __attribute__ ((aligned (64)));
typedef union _x21s_2x64_context_overlay x21s_2x64_context_overlay;
int x21s_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t shash[64*2] __attribute__ ((aligned (64)));
x21s_2x64_context_overlay ctx;
uint32_t *hash0 = (uint32_t*) shash;
uint32_t *hash1 = (uint32_t*)( shash+64 );
if ( !x16r_2x64_hash_generic( shash, input, thrid ) )
return 0;
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hash0, 64 );
sph_haval256_5_close( &ctx.haval, hash0 );
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hash1, 64 );
sph_haval256_5_close( &ctx.haval, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash1 );
LYRA2REV2( x21s_2x64_matrix, (void*) hash0, 32, (const void*) hash0, 32,
(const void*) hash0, 32, 1, 4, 4 );
LYRA2REV2( x21s_2x64_matrix, (void*) hash1, 32, (const void*) hash1, 32,
(const void*) hash1, 32, 1, 4, 4 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
sph_gost512_close( &ctx.gost, (void*) hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
sph_gost512_close( &ctx.gost, (void*) hash1 );
sha256_full( output, hash0, 64 );
sha256_full( output+32, hash1, 64 );
return 1;
}
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_2x64_prehash( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x21s_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
bool x21s_2x64_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
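// Size sketch: with nCols = nRows = 4 and, assuming Lyra2's usual
// BLOCK_LEN_INT64 of 12, size = 12 * 4 * 8 * 4 = 1536 bytes per thread.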
x21s_2x64_matrix = mm_malloc( size, 64 );
return x21s_2x64_matrix;
}
#endif

View File

@@ -15,7 +15,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X21S_8WAY) && !defined(X21S_4WAY)
static __thread uint64_t* x21s_matrix;

View File

@@ -931,15 +931,19 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
// Need sph in some cases
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/hamsi/sph_hamsi.h"
//#include "algo/simd/sph_simd.h"
//#include "algo/simd/nist.h"
#if !( defined(__SSE4_2__) || defined(__ARM_NEON) )
#include "algo/hamsi/sph_hamsi.h"
#endif
#include "algo/shabal/sph_shabal.h"
#include "algo/haval/sph-haval.h"
//#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#if !( defined(__AES__) ) //|| defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#endif
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/echo/sph_echo.h"
//#endif
#endif
#include "algo/fugue/sph_fugue.h"
union _x17_context_overlay
@@ -951,7 +955,7 @@ union _x17_context_overlay
#else
sph_groestl512_context groestl;
#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
@@ -967,12 +971,8 @@ union _x17_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__x86_64__)
simd512_context simd;
#else
sph_simd512_context simd;
#endif
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
@@ -1033,19 +1033,10 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
#if defined(__x86_64__)
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
#else
sph_simd512_init( &ctx.simd );
sph_simd512( &ctx.simd, hash0, 64 );
sph_simd512_close( &ctx.simd, hash0 );
sph_simd512_init( &ctx.simd );
sph_simd512( &ctx.simd, hash1, 64 );
sph_simd512_close( &ctx.simd, hash1 );
#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
echo_full( &ctx.echo, hash1, 512, hash1, 64 );
#else
@@ -1057,7 +1048,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_echo512_close( &ctx.echo, hash1 );
#endif
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
#if defined(__SSE4_2__) || defined(__ARM_NEON)
intrlv_2x64( vhash, hash0, hash1, 512 );
hamsi512_2x64_ctx( &ctx.hamsi, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
@@ -1142,14 +1133,12 @@ int scanhash_x17_2x64( struct work *work, uint32_t max_nonce,
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,0);
pdata[19] = bswap_32( n );
// pdata[19] = n;
submit_solution( work, hash, mythr );
}
if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
{
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,1);
pdata[19] = bswap_32( n+1 );
submit_solution( work, hash+8, mythr );
}

View File

@@ -5,24 +5,23 @@
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
#endif
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -41,12 +40,15 @@ union _x22i_context_overlay
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
@@ -54,11 +56,7 @@ union _x22i_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -84,9 +82,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
groestl512_full( &ctx.groestl, hash, hash, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash, 64 );
@@ -109,26 +105,16 @@ int x22i_hash( void *output, const void *input, int thrid )
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init(&ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash, 512, hash, 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash, 64 );
@@ -192,8 +178,8 @@ int x22i_hash( void *output, const void *input, int thrid )
int scanhash_x22i( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];

View File

@@ -5,24 +5,23 @@
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
#endif
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -44,12 +43,15 @@ union _x25x_context_overlay
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
@@ -57,11 +59,7 @@ union _x25x_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -89,9 +87,7 @@ int x25x_hash( void *output, const void *input, int thrid )
sph_bmw512_close(&ctx.bmw, &hash[1]);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)&hash[2],
(const char*)&hash[1], 512 );
groestl512_full( &ctx.groestl, (void*)&hash[2], (const void*)&hash[1], 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, &hash[1], 64 );
@@ -112,28 +108,18 @@ int x25x_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return 0;
init_luffa( &ctx.luffa, 512 );
luffa_full( &ctx.luffa, &hash[6], 512, &hash[5], 64 );
luffa_full( &ctx.luffa, (void*)&hash[6], 512, (const void*)&hash[5], 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, &hash[7], &hash[6], 64 );
cubehash_full( &ctx.cube, (void*)&hash[7], 512, (const void*)&hash[6], 64 );
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) &hash[7], 64);
sph_shavite512_close(&ctx.shavite, &hash[8]);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) &hash[8], 64);
sph_simd512_close(&ctx.simd, &hash[9] );
#else
update_final_sd( &ctx.simd, (BitSequence *)&hash[9],
(const BitSequence *)&hash[8], 512 );
#endif
simd512_ctx( &ctx.simd, (void*)&hash[9], (const void*)&hash[8], 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)&hash[10],
(const BitSequence*)&hash[9], 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, &hash[9], 64 );
@@ -227,8 +213,8 @@ int x25x_hash( void *output, const void *input, int thrid )
int scanhash_x25x( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -245,7 +231,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce,
do
{
edata[19] = n;
if ( x25x_hash( hash64, edata, thr_id ) )
if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );

api.c
View File

@@ -8,6 +8,7 @@
* Software Foundation; either version 2 of the License, or (at your option)
* any later version. See COPYING for more details.
*/
#define APIVERSION "1.0"
#ifdef WIN32
@@ -27,9 +28,9 @@
#include <math.h>
#include <stdarg.h>
#include <assert.h>
#include <openssl/sha.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "algo/sha/sha1-hash.h"
#include "miner.h"
#include "sysinfos.c"
@@ -208,7 +209,7 @@ static char *remote_seturl(char *params)
return buffer;
}
/**
/*-hash*
* Ask the miner to quit
*/
static char *remote_quit(char *params)
@@ -336,7 +337,6 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
char inpkey[128] = { 0 };
char seckey[64];
uchar sha1[20];
// SHA_CTX ctx;
if (opt_protocol)
applog(LOG_DEBUG, "clientkey: %s", clientkey);
@@ -346,11 +346,7 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
// SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo="
//sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11");
SHA1( inpkey, strlen(inpkey), sha1 );
// Deprecated in openssl-3
// SHA1_Init(&ctx);
// SHA1_Update(&ctx, inpkey, strlen(inpkey));
// SHA1_Final(sha1, &ctx);
sph_sha1_full( sha1, inpkey, strlen(inpkey) );
base64_encode(sha1, 20, seckey, sizeof(seckey));
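For context, this is the standard RFC 6455 accept-key computation: the client's Sec-WebSocket-Key is concatenated with the fixed GUID 258EAFA5-E914-47DA-95CA-C5AB0DC85B11, hashed with SHA-1, and base64-encoded. A self-contained sketch, assuming sph_sha1_full() and base64_encode() behave as used above:

#include <stdio.h>
#include <string.h>

// Hypothetical standalone helper for illustration only.
static void ws_accept_key( const char *clientkey, char *out, size_t outlen )
{
   char inpkey[128] = { 0 };
   unsigned char sha1[20];
   // Append the fixed RFC 6455 GUID to the client's Sec-WebSocket-Key.
   snprintf( inpkey, sizeof(inpkey),
             "%s258EAFA5-E914-47DA-95CA-C5AB0DC85B11", clientkey );
   sph_sha1_full( sha1, inpkey, strlen(inpkey) );
   base64_encode( sha1, 20, out, outlen );
}

// With the RFC sample key "dGhlIHNhbXBsZSBub25jZQ==" this yields
// "s3pPLMBiTxaQ9kYGzzhZRbK+xOo=", matching the test vector noted above.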
@@ -733,3 +729,4 @@ void *api_thread(void *userdata)
return NULL;
}

View File

@@ -40,4 +40,3 @@ rm -f config.status
CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl
make -j $nproc
strip -s cpuminer
mv cpuminer cpuminer

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,830 +0,0 @@
/*
* Copyright 2011-2012, 2014 pooler@litecoinpool.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <cpuminer-config.h>
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#if defined(USE_ASM) && defined(__i386__)
.macro scrypt_shuffle src, so, dest, do
movl \so+60(\src), %eax
movl \so+44(\src), %ebx
movl \so+28(\src), %ecx
movl \so+12(\src), %edx
movl %eax, \do+12(\dest)
movl %ebx, \do+28(\dest)
movl %ecx, \do+44(\dest)
movl %edx, \do+60(\dest)
movl \so+40(\src), %eax
movl \so+8(\src), %ebx
movl \so+48(\src), %ecx
movl \so+16(\src), %edx
movl %eax, \do+8(\dest)
movl %ebx, \do+40(\dest)
movl %ecx, \do+16(\dest)
movl %edx, \do+48(\dest)
movl \so+20(\src), %eax
movl \so+4(\src), %ebx
movl \so+52(\src), %ecx
movl \so+36(\src), %edx
movl %eax, \do+4(\dest)
movl %ebx, \do+20(\dest)
movl %ecx, \do+36(\dest)
movl %edx, \do+52(\dest)
movl \so+0(\src), %eax
movl \so+24(\src), %ebx
movl \so+32(\src), %ecx
movl \so+56(\src), %edx
movl %eax, \do+0(\dest)
movl %ebx, \do+24(\dest)
movl %ecx, \do+32(\dest)
movl %edx, \do+56(\dest)
.endm
.macro salsa8_core_gen_quadround
movl 52(%esp), %ecx
movl 4(%esp), %edx
movl 20(%esp), %ebx
movl 8(%esp), %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 4(%esp)
movl 36(%esp), %edi
leal (%edx, %ebx), %ebp
roll $9, %ebp
xorl %ebp, %edi
movl 24(%esp), %ebp
movl %edi, 8(%esp)
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 40(%esp), %ebx
movl %ecx, 20(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 24(%esp)
movl 56(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 36(%esp)
movl 28(%esp), %ecx
movl %edx, 28(%esp)
movl 44(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 60(%esp), %ebx
movl %esi, 40(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 44(%esp)
movl 12(%esp), %edi
xorl %esi, %ebp
leal (%edx, %ebx), %esi
roll $9, %esi
xorl %esi, %edi
movl %edi, 12(%esp)
movl 48(%esp), %esi
movl %ebp, 48(%esp)
movl 64(%esp), %ebp
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl 32(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 32(%esp)
movl %ebx, %ecx
movl %edx, 52(%esp)
movl 28(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 40(%esp), %ebx
movl %esi, 28(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 40(%esp)
movl 12(%esp), %edi
xorl %esi, %ebp
leal (%edx, %ebx), %esi
roll $9, %esi
xorl %esi, %edi
movl %edi, 12(%esp)
movl 4(%esp), %esi
movl %ebp, 4(%esp)
movl 48(%esp), %ebp
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 48(%esp)
movl 32(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 32(%esp)
movl 24(%esp), %ecx
movl %edx, 24(%esp)
movl 52(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 28(%esp), %ebx
movl %esi, 28(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 52(%esp)
movl 8(%esp), %edi
xorl %esi, %ebp
leal (%edx, %ebx), %esi
roll $9, %esi
xorl %esi, %edi
movl %edi, 8(%esp)
movl 44(%esp), %esi
movl %ebp, 44(%esp)
movl 4(%esp), %ebp
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 20(%esp), %ebx
movl %ecx, 4(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl 36(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 20(%esp)
movl %ebx, %ecx
movl %edx, 36(%esp)
movl 24(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 28(%esp), %ebx
movl %esi, 24(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 28(%esp)
xorl %esi, %ebp
movl 8(%esp), %esi
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl 40(%esp), %edi
movl %ebp, 8(%esp)
movl 44(%esp), %ebp
movl %esi, 40(%esp)
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 4(%esp), %ebx
movl %ecx, 44(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 4(%esp)
movl 20(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 56(%esp)
movl 48(%esp), %ecx
movl %edx, 20(%esp)
movl 36(%esp), %edx
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %edi
movl 24(%esp), %ebx
movl %edi, 24(%esp)
addl %esi, %edi
roll $18, %edi
leal (%ecx, %edx), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 60(%esp)
movl 12(%esp), %esi
xorl %edi, %ebp
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl %esi, 12(%esp)
movl 52(%esp), %edi
movl %ebp, 36(%esp)
movl 8(%esp), %ebp
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl 32(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 32(%esp)
movl %ebx, %ecx
movl %edx, 48(%esp)
movl 20(%esp), %edx
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %edi
movl 24(%esp), %ebx
movl %edi, 20(%esp)
addl %esi, %edi
roll $18, %edi
leal (%ecx, %edx), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 8(%esp)
movl 12(%esp), %esi
xorl %edi, %ebp
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl %esi, 12(%esp)
movl 28(%esp), %edi
movl %ebp, 52(%esp)
movl 36(%esp), %ebp
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 28(%esp)
movl 32(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 32(%esp)
movl 4(%esp), %ecx
movl %edx, 4(%esp)
movl 48(%esp), %edx
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %edi
movl 20(%esp), %ebx
movl %edi, 20(%esp)
addl %esi, %edi
roll $18, %edi
leal (%ecx, %edx), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 48(%esp)
movl 40(%esp), %esi
xorl %edi, %ebp
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl %esi, 36(%esp)
movl 60(%esp), %edi
movl %ebp, 24(%esp)
movl 52(%esp), %ebp
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 44(%esp), %ebx
movl %ecx, 40(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 52(%esp)
movl 56(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 56(%esp)
addl %esi, %ebx
movl %edx, 44(%esp)
roll $13, %ebx
xorl %ebx, %edi
movl %edi, 60(%esp)
addl %esi, %edi
roll $18, %edi
xorl %edi, %ebp
movl %ebp, 64(%esp)
.endm
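For orientation, the quadround macro above is a register-scheduled expansion of the Salsa20 core; salsa8_core_gen below runs it twice for the 8 rounds of Salsa20/8 that scrypt uses. The same core in compact C (the standard public-domain formulation, shown as a reference, not a drop-in replacement):

#include <stdint.h>
#include <string.h>

#define R(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

// Salsa20/8 core with scrypt's feed-forward: B <- B + Salsa20/8(B).
static void salsa20_8( uint32_t B[16] )
{
   uint32_t x[16];
   memcpy( x, B, sizeof(x) );
   for ( int i = 0; i < 8; i += 2 )            // 4 double-rounds = 8 rounds
   {
      x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);   // columns
      x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
      x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
      x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
      x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
      x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
      x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
      x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
      x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);   // rows
      x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
      x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
      x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
      x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
      x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
      x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
      x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
   }
   for ( int i = 0; i < 16; i++ ) B[i] += x[i];
}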
.text
.p2align 5
salsa8_core_gen:
salsa8_core_gen_quadround
salsa8_core_gen_quadround
ret
.text
.p2align 5
.globl scrypt_core
.globl _scrypt_core
scrypt_core:
_scrypt_core:
pushl %ebx
pushl %ebp
pushl %edi
pushl %esi
/* Check for SSE2 availability */
movl $1, %eax
cpuid
andl $0x04000000, %edx
jnz scrypt_core_sse2
scrypt_core_gen:
movl 20(%esp), %edi
movl 24(%esp), %esi
movl 28(%esp), %ecx
subl $72, %esp
.macro scrypt_core_macro1a p, q
movl \p(%edi), %eax
movl \q(%edi), %edx
movl %eax, \p(%esi)
movl %edx, \q(%esi)
xorl %edx, %eax
movl %eax, \p(%edi)
movl %eax, \p(%esp)
.endm
.macro scrypt_core_macro1b p, q
movl \p(%edi), %eax
xorl \p(%esi, %edx), %eax
movl \q(%edi), %ebx
xorl \q(%esi, %edx), %ebx
movl %ebx, \q(%edi)
xorl %ebx, %eax
movl %eax, \p(%edi)
movl %eax, \p(%esp)
.endm
.macro scrypt_core_macro2 p, q
movl \p(%esp), %eax
addl \p(%edi), %eax
movl %eax, \p(%edi)
xorl \q(%edi), %eax
movl %eax, \q(%edi)
movl %eax, \p(%esp)
.endm
.macro scrypt_core_macro3 p, q
movl \p(%esp), %eax
addl \q(%edi), %eax
movl %eax, \q(%edi)
.endm
shll $7, %ecx
addl %esi, %ecx
scrypt_core_gen_loop1:
movl %esi, 64(%esp)
movl %ecx, 68(%esp)
scrypt_core_macro1a 0, 64
scrypt_core_macro1a 4, 68
scrypt_core_macro1a 8, 72
scrypt_core_macro1a 12, 76
scrypt_core_macro1a 16, 80
scrypt_core_macro1a 20, 84
scrypt_core_macro1a 24, 88
scrypt_core_macro1a 28, 92
scrypt_core_macro1a 32, 96
scrypt_core_macro1a 36, 100
scrypt_core_macro1a 40, 104
scrypt_core_macro1a 44, 108
scrypt_core_macro1a 48, 112
scrypt_core_macro1a 52, 116
scrypt_core_macro1a 56, 120
scrypt_core_macro1a 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
scrypt_core_macro2 0, 64
scrypt_core_macro2 4, 68
scrypt_core_macro2 8, 72
scrypt_core_macro2 12, 76
scrypt_core_macro2 16, 80
scrypt_core_macro2 20, 84
scrypt_core_macro2 24, 88
scrypt_core_macro2 28, 92
scrypt_core_macro2 32, 96
scrypt_core_macro2 36, 100
scrypt_core_macro2 40, 104
scrypt_core_macro2 44, 108
scrypt_core_macro2 48, 112
scrypt_core_macro2 52, 116
scrypt_core_macro2 56, 120
scrypt_core_macro2 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
scrypt_core_macro3 0, 64
scrypt_core_macro3 4, 68
scrypt_core_macro3 8, 72
scrypt_core_macro3 12, 76
scrypt_core_macro3 16, 80
scrypt_core_macro3 20, 84
scrypt_core_macro3 24, 88
scrypt_core_macro3 28, 92
scrypt_core_macro3 32, 96
scrypt_core_macro3 36, 100
scrypt_core_macro3 40, 104
scrypt_core_macro3 44, 108
scrypt_core_macro3 48, 112
scrypt_core_macro3 52, 116
scrypt_core_macro3 56, 120
scrypt_core_macro3 60, 124
movl 64(%esp), %esi
movl 68(%esp), %ecx
addl $128, %esi
cmpl %ecx, %esi
jne scrypt_core_gen_loop1
movl 96(%esp), %esi
movl 100(%esp), %ecx
movl %ecx, %eax
subl $1, %eax
movl %eax, 100(%esp)
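/* 100(%esp) now holds N-1, the power-of-2 mask for the scratchpad index. */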
scrypt_core_gen_loop2:
movl %ecx, 68(%esp)
movl 64(%edi), %edx
andl 100(%esp), %edx
shll $7, %edx
scrypt_core_macro1b 0, 64
scrypt_core_macro1b 4, 68
scrypt_core_macro1b 8, 72
scrypt_core_macro1b 12, 76
scrypt_core_macro1b 16, 80
scrypt_core_macro1b 20, 84
scrypt_core_macro1b 24, 88
scrypt_core_macro1b 28, 92
scrypt_core_macro1b 32, 96
scrypt_core_macro1b 36, 100
scrypt_core_macro1b 40, 104
scrypt_core_macro1b 44, 108
scrypt_core_macro1b 48, 112
scrypt_core_macro1b 52, 116
scrypt_core_macro1b 56, 120
scrypt_core_macro1b 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
scrypt_core_macro2 0, 64
scrypt_core_macro2 4, 68
scrypt_core_macro2 8, 72
scrypt_core_macro2 12, 76
scrypt_core_macro2 16, 80
scrypt_core_macro2 20, 84
scrypt_core_macro2 24, 88
scrypt_core_macro2 28, 92
scrypt_core_macro2 32, 96
scrypt_core_macro2 36, 100
scrypt_core_macro2 40, 104
scrypt_core_macro2 44, 108
scrypt_core_macro2 48, 112
scrypt_core_macro2 52, 116
scrypt_core_macro2 56, 120
scrypt_core_macro2 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
movl 96(%esp), %esi
scrypt_core_macro3 0, 64
scrypt_core_macro3 4, 68
scrypt_core_macro3 8, 72
scrypt_core_macro3 12, 76
scrypt_core_macro3 16, 80
scrypt_core_macro3 20, 84
scrypt_core_macro3 24, 88
scrypt_core_macro3 28, 92
scrypt_core_macro3 32, 96
scrypt_core_macro3 36, 100
scrypt_core_macro3 40, 104
scrypt_core_macro3 44, 108
scrypt_core_macro3 48, 112
scrypt_core_macro3 52, 116
scrypt_core_macro3 56, 120
scrypt_core_macro3 60, 124
movl 68(%esp), %ecx
subl $1, %ecx
ja scrypt_core_gen_loop2
addl $72, %esp
popl %esi
popl %edi
popl %ebp
popl %ebx
ret
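/* SSE2 Salsa20/8: the state is held in %xmm0-%xmm3 in a shuffled word
 * order; each doubleround applies the quarter-round pattern down the
 * columns, realigns lanes with pshufd, then applies it across the rows. */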
.macro salsa8_core_sse2_doubleround
movdqa %xmm1, %xmm4
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm3
movdqa %xmm0, %xmm4
pxor %xmm5, %xmm3
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm3, %xmm4
pxor %xmm5, %xmm2
pshufd $0x93, %xmm3, %xmm3
paddd %xmm2, %xmm4
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm1
movdqa %xmm2, %xmm4
pxor %xmm5, %xmm1
pshufd $0x4e, %xmm2, %xmm2
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
movdqa %xmm3, %xmm4
pxor %xmm5, %xmm0
pshufd $0x39, %xmm1, %xmm1
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm1
movdqa %xmm0, %xmm4
pxor %xmm5, %xmm1
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm1, %xmm4
pxor %xmm5, %xmm2
pshufd $0x93, %xmm1, %xmm1
paddd %xmm2, %xmm4
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm3
movdqa %xmm2, %xmm4
pxor %xmm5, %xmm3
pshufd $0x4e, %xmm2, %xmm2
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
pshufd $0x39, %xmm3, %xmm3
pxor %xmm5, %xmm0
.endm
.macro salsa8_core_sse2
salsa8_core_sse2_doubleround
salsa8_core_sse2_doubleround
salsa8_core_sse2_doubleround
salsa8_core_sse2_doubleround
.endm
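/* SSE2 scrypt core: X is kept in the stack frame with %xmm6/%xmm7 caching
 * its last 32 bytes across iterations; data is converted to and from the
 * shuffled word order by scrypt_shuffle, defined elsewhere in this file. */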
.p2align 5
scrypt_core_sse2:
movl 20(%esp), %edi
movl 24(%esp), %esi
movl %esp, %ebp
subl $128, %esp
andl $-16, %esp
scrypt_shuffle %edi, 0, %esp, 0
scrypt_shuffle %edi, 64, %esp, 64
movdqa 96(%esp), %xmm6
movdqa 112(%esp), %xmm7
movl %esi, %edx
movl 28(%ebp), %ecx
shll $7, %ecx
addl %esi, %ecx
scrypt_core_sse2_loop1:
movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1
movdqa 32(%esp), %xmm2
movdqa 48(%esp), %xmm3
movdqa 64(%esp), %xmm4
movdqa 80(%esp), %xmm5
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
movdqa %xmm0, 0(%edx)
movdqa %xmm1, 16(%edx)
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm2, 32(%edx)
movdqa %xmm3, 48(%edx)
movdqa %xmm4, 64(%edx)
movdqa %xmm5, 80(%edx)
movdqa %xmm6, 96(%edx)
movdqa %xmm7, 112(%edx)
salsa8_core_sse2
paddd 0(%edx), %xmm0
paddd 16(%edx), %xmm1
paddd 32(%edx), %xmm2
paddd 48(%edx), %xmm3
movdqa %xmm0, 0(%esp)
movdqa %xmm1, 16(%esp)
movdqa %xmm2, 32(%esp)
movdqa %xmm3, 48(%esp)
pxor 64(%esp), %xmm0
pxor 80(%esp), %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
movdqa %xmm2, %xmm6
movdqa %xmm3, %xmm7
salsa8_core_sse2
paddd 64(%esp), %xmm0
paddd 80(%esp), %xmm1
paddd %xmm2, %xmm6
paddd %xmm3, %xmm7
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
addl $128, %edx
cmpl %ecx, %edx
jne scrypt_core_sse2_loop1
movdqa 64(%esp), %xmm4
movdqa 80(%esp), %xmm5
movl 28(%ebp), %ecx
movl %ecx, %eax
subl $1, %eax
scrypt_core_sse2_loop2:
movd %xmm4, %edx
movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1
movdqa 32(%esp), %xmm2
movdqa 48(%esp), %xmm3
andl %eax, %edx
shll $7, %edx
pxor 0(%esi, %edx), %xmm0
pxor 16(%esi, %edx), %xmm1
pxor 32(%esi, %edx), %xmm2
pxor 48(%esi, %edx), %xmm3
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
movdqa %xmm0, 0(%esp)
movdqa %xmm1, 16(%esp)
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm2, 32(%esp)
movdqa %xmm3, 48(%esp)
salsa8_core_sse2
paddd 0(%esp), %xmm0
paddd 16(%esp), %xmm1
paddd 32(%esp), %xmm2
paddd 48(%esp), %xmm3
movdqa %xmm0, 0(%esp)
movdqa %xmm1, 16(%esp)
movdqa %xmm2, 32(%esp)
movdqa %xmm3, 48(%esp)
pxor 64(%esi, %edx), %xmm0
pxor 80(%esi, %edx), %xmm1
pxor 96(%esi, %edx), %xmm2
pxor 112(%esi, %edx), %xmm3
pxor 64(%esp), %xmm0
pxor 80(%esp), %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
movdqa %xmm2, %xmm6
movdqa %xmm3, %xmm7
salsa8_core_sse2
paddd 64(%esp), %xmm0
paddd 80(%esp), %xmm1
paddd %xmm2, %xmm6
paddd %xmm3, %xmm7
movdqa %xmm0, %xmm4
movdqa %xmm1, %xmm5
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
subl $1, %ecx
ja scrypt_core_sse2_loop2
movdqa %xmm6, 96(%esp)
movdqa %xmm7, 112(%esp)
scrypt_shuffle %esp, 0, %edi, 0
scrypt_shuffle %esp, 64, %edi, 64
movl %ebp, %esp
popl %esi
popl %edi
popl %ebp
popl %ebx
ret
#endif
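For orientation, below is a minimal portable C sketch of what both scrypt_core paths above compute (the r = 1 scrypt ROMix core). It is illustrative only: scrypt_core_sketch and salsa20_8_xor are hypothetical names, and the assembly folds the scratchpad XOR of the second loop into its load macros rather than doing a separate pass.

#include <stdint.h>
#include <string.h>

#define R(a, b) ( ( (a) << (b) ) | ( (a) >> ( 32 - (b) ) ) )

/* B ^= Bx, then B += Salsa20/8 rounds of B: one half of the r=1 BlockMix */
static void salsa20_8_xor( uint32_t B[16], const uint32_t Bx[16] )
{
   uint32_t x[16];
   for ( int k = 0; k < 16; k++ ) x[k] = ( B[k] ^= Bx[k] );
   for ( int r = 0; r < 8; r += 2 )
   {
      /* column round */
      x[ 4] ^= R( x[ 0] + x[12],  7 );  x[ 8] ^= R( x[ 4] + x[ 0],  9 );
      x[12] ^= R( x[ 8] + x[ 4], 13 );  x[ 0] ^= R( x[12] + x[ 8], 18 );
      x[ 9] ^= R( x[ 5] + x[ 1],  7 );  x[13] ^= R( x[ 9] + x[ 5],  9 );
      x[ 1] ^= R( x[13] + x[ 9], 13 );  x[ 5] ^= R( x[ 1] + x[13], 18 );
      x[14] ^= R( x[10] + x[ 6],  7 );  x[ 2] ^= R( x[14] + x[10],  9 );
      x[ 6] ^= R( x[ 2] + x[14], 13 );  x[10] ^= R( x[ 6] + x[ 2], 18 );
      x[ 3] ^= R( x[15] + x[11],  7 );  x[ 7] ^= R( x[ 3] + x[15],  9 );
      x[11] ^= R( x[ 7] + x[ 3], 13 );  x[15] ^= R( x[11] + x[ 7], 18 );
      /* row round */
      x[ 1] ^= R( x[ 0] + x[ 3],  7 );  x[ 2] ^= R( x[ 1] + x[ 0],  9 );
      x[ 3] ^= R( x[ 2] + x[ 1], 13 );  x[ 0] ^= R( x[ 3] + x[ 2], 18 );
      x[ 6] ^= R( x[ 5] + x[ 4],  7 );  x[ 7] ^= R( x[ 6] + x[ 5],  9 );
      x[ 4] ^= R( x[ 7] + x[ 6], 13 );  x[ 5] ^= R( x[ 4] + x[ 7], 18 );
      x[11] ^= R( x[10] + x[ 9],  7 );  x[ 8] ^= R( x[11] + x[10],  9 );
      x[ 9] ^= R( x[ 8] + x[11], 13 );  x[10] ^= R( x[ 9] + x[ 8], 18 );
      x[12] ^= R( x[15] + x[14],  7 );  x[13] ^= R( x[12] + x[15],  9 );
      x[14] ^= R( x[13] + x[12], 13 );  x[15] ^= R( x[14] + x[13], 18 );
   }
   for ( int k = 0; k < 16; k++ ) B[k] += x[k];
}

/* fill the scratchpad sequentially, then make N data-dependent reads */
static void scrypt_core_sketch( uint32_t X[32], uint32_t *V, uint32_t N )
{
   for ( uint32_t i = 0; i < N; i++ )
   {
      memcpy( &V[ 32*i ], X, 128 );             /* loop 1 */
      salsa20_8_xor( &X[ 0], &X[16] );
      salsa20_8_xor( &X[16], &X[ 0] );
   }
   for ( uint32_t i = 0; i < N; i++ )
   {
      const uint32_t j = X[16] & ( N - 1 );     /* loop 2, N a power of 2 */
      for ( int k = 0; k < 32; k++ ) X[k] ^= V[ 32*j + k ];
      salsa20_8_xor( &X[ 0], &X[16] );
      salsa20_8_xor( &X[16], &X[ 0] );
   }
}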

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

configure (vendored)
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.7.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.12.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='23.7'
PACKAGE_STRING='cpuminer-opt 23.7'
PACKAGE_VERSION='23.12'
PACKAGE_STRING='cpuminer-opt 23.12'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 23.7 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 23.12 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 23.7:";;
short | recursive ) echo "Configuration of cpuminer-opt 23.12:";;
esac
cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 23.7
cpuminer-opt configure 23.12
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 23.7, which was
It was created by cpuminer-opt $as_me 23.12, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='23.7'
VERSION='23.12'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 23.7, which was
This file was extended by cpuminer-opt $as_me 23.12, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 23.7
cpuminer-opt config.status 23.12
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [23.7])
AC_INIT([cpuminer-opt], [23.12])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 23.7.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 23.12.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='23.7'
PACKAGE_STRING='cpuminer-opt 23.7'
PACKAGE_VERSION='23.12'
PACKAGE_STRING='cpuminer-opt 23.12'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 23.7 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 23.12 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 23.7:";;
short | recursive ) echo "Configuration of cpuminer-opt 23.12:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 23.7
cpuminer-opt configure 23.12
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 23.7, which was
It was created by cpuminer-opt $as_me 23.12, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='23.7'
VERSION='23.12'
cat >>confdefs.h <<_ACEOF
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 23.7, which was
This file was extended by cpuminer-opt $as_me 23.12, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6784,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 23.7
cpuminer-opt config.status 23.12
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -36,7 +36,7 @@
#include <memory.h>
#include <curl/curl.h>
#include <jansson.h>
#include <openssl/sha.h>
//#include <openssl/sha.h>
//#include <mm_malloc.h>
#include "sysinfos.c"
#include "algo/sha/sha256d.h"
@@ -1967,18 +1967,6 @@ void sha256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
sha256d( merkle_root, merkle_root, 64 );
}
}
/*
// OpenSSL single sha256, deprecated
void SHA256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
{
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
for ( int i = 0; i < sctx->job.merkle_count; i++ )
{
memcpy( merkle_root + 32, sctx->job.merkle[i], 32 );
sha256d( merkle_root, merkle_root, 64 );
}
}
*/
// Default is do_nothing (assumed LE)
void set_work_data_big_endian( struct work *work )
@@ -2212,8 +2200,8 @@ static void *miner_thread( void *userdata )
// int64_t max64 = 1000;
int nonce_found = 0;
if ( likely( algo_gate.do_this_thread( thr_id ) ) )
{
// if ( likely( algo_gate.do_this_thread( thr_id ) ) )
// {
if ( have_stratum )
{
while ( unlikely( stratum_down ) )
@@ -2262,8 +2250,8 @@ static void *miner_thread( void *userdata )
pthread_rwlock_unlock( &g_work_lock );
} // do_this_thread
algo_gate.resync_threads( thr_id, &work );
// } // do_this_thread
// algo_gate.resync_threads( thr_id, &work );
// conditional mining
if ( unlikely( !wanna_mine( thr_id ) ) )
@@ -2980,8 +2968,12 @@ static bool cpu_capability( bool display_only )
printf(" Linux\n");
#elif defined(WIN32)
printf(" Windows\n");
#elif defined(__APPLE__)
printf(" MacOS\n");
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
printf(" Unix\n");
#else
printf("\n");
printf("\n");
#endif
printf("CPU features: ");
@@ -3685,8 +3677,8 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0);
#include "simd-utils.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "compat/aes_helper.c"
int main(int argc, char *argv[])
{

miner.h
View File

@@ -1,38 +1,45 @@
#ifndef __MINER_H__
#define __MINER_H__
#ifndef MINER_H__
#define MINER_H__
#include <cpuminer-config.h>
#if !( defined(__SSE2__) || ( defined(__aarch64__) && defined(__ARM_NEON) ) )
#warning "Unknown or unsupported CPU, requires x86_64 with SSE2 or AArch64 with NEON."
#endif
#if defined(__x86_64__)
#define USER_AGENT_ARCH "x64"
#define USER_AGENT_ARCH "x64" // Intel, AMD x86_64
#elif defined(__aarch64__)
#define USER_AGENT_ARCH "arm"
#define USER_AGENT_ARCH "arm" // AArch64
//#elif
// #define USER_AGENT_ARCH "r5" // RISC-V
#else
#define USER_AGENT_ARCH
#endif
#if defined(__linux)
#define USER_AGENT_OS "L"
#define USER_AGENT_OS "L" // GNU Linux
#elif defined(WIN32)
#define USER_AGENT_OS "W"
#define USER_AGENT_OS "W" // MS Windows
#elif defined(__APPLE__)
#define USER_AGENT_OS "M" // Apple MacOS
// is there a generic BSD macro?
#elif defined(__unix__) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
#define USER_AGENT_OS "U" // BSD unix
#else
#define USER_AGENT_OS
#endif
#define USER_AGENT PACKAGE_NAME "-" PACKAGE_VERSION "-" USER_AGENT_ARCH USER_AGENT_OS
//#define MAX_CPUS 128
/*
#ifdef _MSC_VER
#undef USE_ASM /* to fix */
#undef USE_ASM
#ifdef NOASM
#undef USE_ASM
#endif
/* missing arch defines for msvc */
#if defined(_M_X64)
#define __i386__ 1
#define __x86_64__ 1
@@ -40,8 +47,8 @@
#define __i386__ 1
#endif
#endif /* _MSC_VER */
#endif
*/
#include <stdbool.h>
#include <inttypes.h>
@@ -75,7 +82,7 @@
#endif
//TODO for windows
static inline bool is_root()
{
#if defined(WIN32)
@@ -607,7 +614,6 @@ enum algos {
ALGO_GROESTL,
ALGO_HEX,
ALGO_HMQ1725,
ALGO_HODL,
ALGO_JHA,
ALGO_KECCAK,
ALGO_KECCAKC,
@@ -703,7 +709,6 @@ static const char* const algo_names[] = {
"groestl",
"hex",
"hmq1725",
"hodl",
"jha",
"keccak",
"keccakc",
@@ -865,7 +870,6 @@ Options:\n\
groestl Groestl coin\n\
hex x16r-hex\n\
hmq1725 Espers\n\
hodl Hodlcoin\n\
jha jackppot (Jackpotcoin)\n\
keccak Maxcoin\n\
keccakc Creative Coin\n\

View File

@@ -411,11 +411,11 @@ static inline void v128_bswap32_80( void *d, void *s )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf );
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), bswap_shuf );
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), bswap_shuf );
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), bswap_shuf );
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf );
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), bswap_shuf );
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), bswap_shuf );
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), bswap_shuf );
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), bswap_shuf );
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), bswap_shuf );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
@@ -461,11 +461,11 @@ static inline void v128_bswap32_80( void *d, void *s )
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
#if defined(__SSSE3__)
@@ -480,38 +480,38 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
#else
s0 = mm128_bswap_32( s0 );
s1 = mm128_bswap_32( s1 );
s2 = mm128_bswap_32( s2 );
s3 = mm128_bswap_32( s3 );
s4 = mm128_bswap_32( s4 );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
#endif
casti_m128i( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
casti_m128i( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
casti_m128i( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
casti_m128i( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
casti_m128i( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
casti_m128i( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
casti_m128i( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
casti_v128( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
casti_m128i( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
casti_m128i( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
casti_m128i( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
casti_m128i( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
casti_m128i( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
casti_m128i( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
casti_m128i( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
@@ -797,11 +797,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
const __m256i c1 = v256_32( 0x04050607 );
const __m256i c2 = v256_32( 0x08090a0b );
const __m256i c3 = v256_32( 0x0c0d0e0f );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m256i( d, 0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
@@ -855,11 +855,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
const __m256i c2 = _mm256_add_epi32( c1, c1 );
const __m256i c3 = _mm256_add_epi32( c2, c1 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -1303,11 +1303,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
const __m512i c1 = v512_32( 0x04050607 );
const __m512i c2 = v512_32( 0x08090a0b );
const __m512i c3 = v512_32( 0x0c0d0e0f );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d, 0 ) = _mm512_permutexvar_epi8( c0,
_mm512_castsi128_si512( s0 ) );
@@ -1360,11 +1360,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
const __m512i c1 = v512_32( 1 );
const __m512i c2 = _mm512_add_epi32( c1, c1 );
const __m512i c3 = _mm512_add_epi32( c2, c1 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -1492,20 +1492,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
#if defined(__SSE2__)
casti_m128i( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_m128i( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
casti_m128i( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_m128i( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
casti_m128i( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_m128i( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
casti_m128i( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_m128i( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
casti_m128i( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_m128i( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
#elif defined(__ARM_NEON)
@@ -1719,7 +1719,7 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
{
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
v128_t s4 = casti_m128i( src,4 );
v128_t s4 = casti_v128( src,4 );
casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 );
casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 );
@@ -1747,11 +1747,11 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i c0 = v256_64( 0x0405060700010203 );
const __m256i c1 = v256_64( 0x0c0d0e0f08090a0b );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m256i( d,0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
@@ -1783,7 +1783,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
v128_t s4 = casti_m128i( src,4 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm256_shuffle_epi8( s0, bswap_shuf );
s1 = _mm256_shuffle_epi8( s1, bswap_shuf );
@@ -2162,11 +2162,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const __m512i c0 = v512_64( 0x0405060700010203 );
const __m512i c1 = v512_64( 0x0c0d0e0f08090a0b );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( c0,
_mm512_castsi128_si512( s0 ) );
@@ -2197,11 +2197,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = v512_64( 1 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -2391,11 +2391,11 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m512i bswap_shuf = mm512_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
const v128_t s0 = casti_m128i( src,0 );
const v128_t s1 = casti_m128i( src,1 );
const v128_t s2 = casti_m128i( src,2 );
const v128_t s3 = casti_m128i( src,3 );
const v128_t s4 = casti_m128i( src,4 );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s0 ),
bswap_shuf );
@@ -2415,11 +2415,11 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
v128_t s0 = casti_m128i( src,0 );
v128_t s1 = casti_m128i( src,1 );
v128_t s2 = casti_m128i( src,2 );
v128_t s3 = casti_m128i( src,3 );
v128_t s4 = casti_m128i( src,4 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
@@ -2489,44 +2489,44 @@ static inline void rintrlv_4x64_4x32( void *dst, const void *src,
const v128_t *s = (const v128_t*)src;
v128_t *d = (v128_t*)dst;
d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 2] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 6] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd );
d[ 0] = v128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = v128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 2] = v128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 3] = v128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = v128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = v128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 6] = v128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 7] = v128_shuffle2_32( s[ 6], s[ 7], 0xdd );
if ( bit_len <= 256 ) return;
d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[10] = mm128_shuffle2_32( s[10], s[11], 0x88 );
d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = mm128_shuffle2_32( s[12], s[13], 0xdd );
d[14] = mm128_shuffle2_32( s[14], s[15], 0x88 );
d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd );
d[ 8] = v128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = v128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[10] = v128_shuffle2_32( s[10], s[11], 0x88 );
d[11] = v128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = v128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = v128_shuffle2_32( s[12], s[13], 0xdd );
d[14] = v128_shuffle2_32( s[14], s[15], 0x88 );
d[15] = v128_shuffle2_32( s[14], s[15], 0xdd );
if ( bit_len <= 512 ) return;
d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = mm128_shuffle2_32( s[16], s[17], 0xdd );
d[18] = mm128_shuffle2_32( s[18], s[19], 0x88 );
d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = mm128_shuffle2_32( s[20], s[21], 0xdd );
d[22] = mm128_shuffle2_32( s[22], s[23], 0x88 );
d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = mm128_shuffle2_32( s[24], s[25], 0xdd );
d[26] = mm128_shuffle2_32( s[26], s[27], 0x88 );
d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = mm128_shuffle2_32( s[28], s[29], 0xdd );
d[30] = mm128_shuffle2_32( s[30], s[31], 0x88 );
d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd );
d[16] = v128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = v128_shuffle2_32( s[16], s[17], 0xdd );
d[18] = v128_shuffle2_32( s[18], s[19], 0x88 );
d[19] = v128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = v128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = v128_shuffle2_32( s[20], s[21], 0xdd );
d[22] = v128_shuffle2_32( s[22], s[23], 0x88 );
d[23] = v128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = v128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = v128_shuffle2_32( s[24], s[25], 0xdd );
d[26] = v128_shuffle2_32( s[26], s[27], 0x88 );
d[27] = v128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = v128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = v128_shuffle2_32( s[28], s[29], 0xdd );
d[30] = v128_shuffle2_32( s[30], s[31], 0x88 );
d[31] = v128_shuffle2_32( s[30], s[31], 0xdd );
// if ( bit_len <= 1024 ) return;
}
@@ -2537,77 +2537,77 @@ static inline void rintrlv_8x64_8x32( void *dst, const void *src,
const v128_t *s = (const v128_t*)src;
v128_t *d = (v128_t*)dst;
d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 2] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 6] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd );
d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = mm128_shuffle2_32( s[10], s[11], 0x88 );
d[10] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = mm128_shuffle2_32( s[14], s[15], 0x88 );
d[14] = mm128_shuffle2_32( s[12], s[13], 0xdd );
d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd );
d[ 0] = v128_shuffle2_32( s[ 0], s[ 1], 0x88 );
d[ 1] = v128_shuffle2_32( s[ 2], s[ 3], 0x88 );
d[ 2] = v128_shuffle2_32( s[ 0], s[ 1], 0xdd );
d[ 3] = v128_shuffle2_32( s[ 2], s[ 3], 0xdd );
d[ 4] = v128_shuffle2_32( s[ 4], s[ 5], 0x88 );
d[ 5] = v128_shuffle2_32( s[ 6], s[ 7], 0x88 );
d[ 6] = v128_shuffle2_32( s[ 4], s[ 5], 0xdd );
d[ 7] = v128_shuffle2_32( s[ 6], s[ 7], 0xdd );
d[ 8] = v128_shuffle2_32( s[ 8], s[ 9], 0x88 );
d[ 9] = v128_shuffle2_32( s[10], s[11], 0x88 );
d[10] = v128_shuffle2_32( s[ 8], s[ 9], 0xdd );
d[11] = v128_shuffle2_32( s[10], s[11], 0xdd );
d[12] = v128_shuffle2_32( s[12], s[13], 0x88 );
d[13] = v128_shuffle2_32( s[14], s[15], 0x88 );
d[14] = v128_shuffle2_32( s[12], s[13], 0xdd );
d[15] = v128_shuffle2_32( s[14], s[15], 0xdd );
if ( bit_len <= 256 ) return;
d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = mm128_shuffle2_32( s[18], s[19], 0x88 );
d[18] = mm128_shuffle2_32( s[16], s[17], 0xdd );
d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = mm128_shuffle2_32( s[22], s[23], 0x88 );
d[22] = mm128_shuffle2_32( s[20], s[21], 0xdd );
d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = mm128_shuffle2_32( s[26], s[27], 0x88 );
d[26] = mm128_shuffle2_32( s[24], s[25], 0xdd );
d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = mm128_shuffle2_32( s[30], s[31], 0x88 );
d[30] = mm128_shuffle2_32( s[28], s[29], 0xdd );
d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd );
d[16] = v128_shuffle2_32( s[16], s[17], 0x88 );
d[17] = v128_shuffle2_32( s[18], s[19], 0x88 );
d[18] = v128_shuffle2_32( s[16], s[17], 0xdd );
d[19] = v128_shuffle2_32( s[18], s[19], 0xdd );
d[20] = v128_shuffle2_32( s[20], s[21], 0x88 );
d[21] = v128_shuffle2_32( s[22], s[23], 0x88 );
d[22] = v128_shuffle2_32( s[20], s[21], 0xdd );
d[23] = v128_shuffle2_32( s[22], s[23], 0xdd );
d[24] = v128_shuffle2_32( s[24], s[25], 0x88 );
d[25] = v128_shuffle2_32( s[26], s[27], 0x88 );
d[26] = v128_shuffle2_32( s[24], s[25], 0xdd );
d[27] = v128_shuffle2_32( s[26], s[27], 0xdd );
d[28] = v128_shuffle2_32( s[28], s[29], 0x88 );
d[29] = v128_shuffle2_32( s[30], s[31], 0x88 );
d[30] = v128_shuffle2_32( s[28], s[29], 0xdd );
d[31] = v128_shuffle2_32( s[30], s[31], 0xdd );
if ( bit_len <= 512 ) return;
d[32] = mm128_shuffle2_32( s[32], s[33], 0x88 );
d[33] = mm128_shuffle2_32( s[34], s[35], 0x88 );
d[34] = mm128_shuffle2_32( s[32], s[33], 0xdd );
d[35] = mm128_shuffle2_32( s[34], s[35], 0xdd );
d[36] = mm128_shuffle2_32( s[36], s[37], 0x88 );
d[37] = mm128_shuffle2_32( s[38], s[39], 0x88 );
d[38] = mm128_shuffle2_32( s[36], s[37], 0xdd );
d[39] = mm128_shuffle2_32( s[38], s[39], 0xdd );
d[40] = mm128_shuffle2_32( s[40], s[41], 0x88 );
d[41] = mm128_shuffle2_32( s[42], s[43], 0x88 );
d[42] = mm128_shuffle2_32( s[40], s[41], 0xdd );
d[43] = mm128_shuffle2_32( s[42], s[43], 0xdd );
d[44] = mm128_shuffle2_32( s[44], s[45], 0x88 );
d[45] = mm128_shuffle2_32( s[46], s[47], 0x88 );
d[46] = mm128_shuffle2_32( s[44], s[45], 0xdd );
d[47] = mm128_shuffle2_32( s[46], s[47], 0xdd );
d[32] = v128_shuffle2_32( s[32], s[33], 0x88 );
d[33] = v128_shuffle2_32( s[34], s[35], 0x88 );
d[34] = v128_shuffle2_32( s[32], s[33], 0xdd );
d[35] = v128_shuffle2_32( s[34], s[35], 0xdd );
d[36] = v128_shuffle2_32( s[36], s[37], 0x88 );
d[37] = v128_shuffle2_32( s[38], s[39], 0x88 );
d[38] = v128_shuffle2_32( s[36], s[37], 0xdd );
d[39] = v128_shuffle2_32( s[38], s[39], 0xdd );
d[40] = v128_shuffle2_32( s[40], s[41], 0x88 );
d[41] = v128_shuffle2_32( s[42], s[43], 0x88 );
d[42] = v128_shuffle2_32( s[40], s[41], 0xdd );
d[43] = v128_shuffle2_32( s[42], s[43], 0xdd );
d[44] = v128_shuffle2_32( s[44], s[45], 0x88 );
d[45] = v128_shuffle2_32( s[46], s[47], 0x88 );
d[46] = v128_shuffle2_32( s[44], s[45], 0xdd );
d[47] = v128_shuffle2_32( s[46], s[47], 0xdd );
d[48] = mm128_shuffle2_32( s[48], s[49], 0x88 );
d[49] = mm128_shuffle2_32( s[50], s[51], 0x88 );
d[50] = mm128_shuffle2_32( s[48], s[49], 0xdd );
d[51] = mm128_shuffle2_32( s[50], s[51], 0xdd );
d[52] = mm128_shuffle2_32( s[52], s[53], 0x88 );
d[53] = mm128_shuffle2_32( s[54], s[55], 0x88 );
d[54] = mm128_shuffle2_32( s[52], s[53], 0xdd );
d[55] = mm128_shuffle2_32( s[54], s[55], 0xdd );
d[56] = mm128_shuffle2_32( s[56], s[57], 0x88 );
d[57] = mm128_shuffle2_32( s[58], s[59], 0x88 );
d[58] = mm128_shuffle2_32( s[56], s[57], 0xdd );
d[59] = mm128_shuffle2_32( s[58], s[59], 0xdd );
d[60] = mm128_shuffle2_32( s[60], s[61], 0x88 );
d[61] = mm128_shuffle2_32( s[62], s[63], 0x88 );
d[62] = mm128_shuffle2_32( s[60], s[61], 0xdd );
d[63] = mm128_shuffle2_32( s[62], s[63], 0xdd );
d[48] = v128_shuffle2_32( s[48], s[49], 0x88 );
d[49] = v128_shuffle2_32( s[50], s[51], 0x88 );
d[50] = v128_shuffle2_32( s[48], s[49], 0xdd );
d[51] = v128_shuffle2_32( s[50], s[51], 0xdd );
d[52] = v128_shuffle2_32( s[52], s[53], 0x88 );
d[53] = v128_shuffle2_32( s[54], s[55], 0x88 );
d[54] = v128_shuffle2_32( s[52], s[53], 0xdd );
d[55] = v128_shuffle2_32( s[54], s[55], 0xdd );
d[56] = v128_shuffle2_32( s[56], s[57], 0x88 );
d[57] = v128_shuffle2_32( s[58], s[59], 0x88 );
d[58] = v128_shuffle2_32( s[56], s[57], 0xdd );
d[59] = v128_shuffle2_32( s[58], s[59], 0xdd );
d[60] = v128_shuffle2_32( s[60], s[61], 0x88 );
d[61] = v128_shuffle2_32( s[62], s[63], 0x88 );
d[62] = v128_shuffle2_32( s[60], s[61], 0xdd );
d[63] = v128_shuffle2_32( s[62], s[63], 0xdd );
// if ( bit_len <= 1024 ) return;
}
@@ -3248,12 +3248,21 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
// blend 2 vectors while interleaving: { hi[n], lo[n-1], ... hi[1], lo[0] }
#if defined(__SSE4_1__)
// No SSE2 implementation.
//#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f )
//#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 )
#define v128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f )
#define v128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 )
#endif // SSE4_1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define v128_intrlv_blend_64( hi, lo ) \
v128_blendv( hi, lo, v128_set64( 0ull, 0xffffffffffffffffull ) )
#define v128_intrlv_blend_32( hi, lo ) \
v128_blendv( hi, lo, v128_set64( 0xffffffffull, 0xffffffffull ) )
#else
// unknown, unsupported architecture
#endif
#if defined(__AVX2__)

View File

@@ -35,17 +35,17 @@
///////////////////////////////////////////////////////////////////////////////
// New architecturally agnostic syntax:
// All users of 128 bit SIMD should use new syntax or protect SSE2 only
// code segments.
// Other vector sizes continue with old syntax for now.
// Definitions here will gradually be converted to the new syntax.
// For consistency the larger vector utilities should do the same.
//
// __m128i -> v128_t
// _mm_ -> v128_
// mm128_ -> v128_
//
// There is also new syntax to accommodate ARM's stricter type checking of
// vector element size. They have no effect on x86_64.
// direct translation of native intrinsics
#define v128_t __m128i
// Needed for ARM
#define v128u64_t v128_t
#define v128u32_t v128_t
#define v128u16_t v128_t
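A hypothetical before/after fragment, not part of the patch, showing the renaming; on x86_64 both spellings compile to identical code (assumes simd-utils.h is included):

__m128i sum_old( __m128i x, __m128i y ) { return _mm_add_epi32( x, y ); }
v128_t  sum_new( v128_t  x, v128_t  y ) { return v128_add32(  x, y ); }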
@@ -56,17 +56,15 @@
// Needed for ARM; doesn't do anything special on x86_64
#define v128_load1_64(p) _mm_set1_epi64x(*(uint64_t*)(p) )
#define v128_load1_32(p) _mm_set_epi32( *(uint32_t*)(p) )
#define v128_load1_16(p) _mm_set_epi16( *(uint16_t*)(p) )
#define v128_load1_8( p) _mm_set_epi8( *(uint8_t*) (p) )
#define v128_load1_32(p) _mm_set1_epi32( *(uint32_t*)(p) )
#define v128_load1_16(p) _mm_set1_epi16( *(uint16_t*)(p) )
#define v128_load1_8( p) _mm_set1_epi8( *(uint8_t*) (p) )
// arithmetic
#define v128_add64 _mm_add_epi64
#define v128_add32 _mm_add_epi32
#define v128_add16 _mm_add_epi16
#define v128_add8 _mm_add_epi8
#define v128_add4_64 mm128_add4_64
#define v128_add4_32 mm128_add4_32
#define v128_sub64 _mm_sub_epi64
#define v128_sub32 _mm_sub_epi32
@@ -82,7 +80,7 @@
#define v128_mulw32 _mm_mul_epu32
#define v128_mulw16 _mm_mul_epu16
// compare
// signed compare
#define v128_cmpeq64 _mm_cmpeq_epi64
#define v128_cmpeq32 _mm_cmpeq_epi32
#define v128_cmpeq16 _mm_cmpeq_epi16
@@ -120,27 +118,6 @@
#define v128_xor _mm_xor_si128
#define v128_xorq _mm_xor_si128
#define v128_andnot _mm_andnot_si128
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_ornot( a, b ) mm128_or( a, mm128_not( b ) )
// ternary
#define v128_xorandnot( v2, v1, v0 ) \
_mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) \
_mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
// shift 2 concatenated vectors right
#define v128_alignr64 mm128_alignr_64
#define v128_alignr32 mm128_alignr_32
#if defined(__SSSE3__)
#define v128_alignr8 _mm_alignr_epi8
#endif
// unpack
#define v128_unpacklo64 _mm_unpacklo_epi64
@@ -153,10 +130,16 @@
#define v128_unpackhi8 _mm_unpackhi_epi8
// AES
// Nokey means nothing on x86_64 but it saves an instruction and a register
// on ARM.
#define v128_aesenc _mm_aesenc_si128
#define v128_aesenc_nokey(v) _mm_aesenc_si128( v, v128_zero )
#define v128_aesenclast _mm_aesenclast_si128
#define v128_aesenclast_nokey(v) _mm_aesenclast_si128( v, v128_zero )
#define v128_aesdec _mm_aesdec_si128
#define v128_aesdec_nokey(v) _mm_aesdec_si128( v, v128_zero )
#define v128_aesdeclast _mm_aesdeclast_si128
#define v128_aesdeclast_nokey(v) _mm_aesdeclast_si128( v, v128_zero )
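An illustrative use of the wrappers above (hypothetical fragment): the nokey form behaves like a round with an all-zero round key, which per the comment above saves an instruction and a register on ARM:

static inline v128_t aes_rounds_demo( v128_t s, v128_t rk )
{
   v128_t t = v128_aesenc( s, rk );   /* AES round keyed with rk */
   return v128_aesenc_nokey( t );     /* AES round with an implicit zero key */
}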
// Used instead of casting.
typedef union
@@ -237,24 +220,22 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
// Pseudo constants
#define v128_zero _mm_setzero_si128()
#define m128_zero _mm_setzero_si128()
#if defined(__SSE4_1__)
// Bitwise AND, return 1 if result is all bits clear.
#define v128_and_eq0 _mm_testz_si128
#define v128_and_eq0(v1, v0) _mm_testz_si128(v1, v0)
// v128_is_zero?
static inline int v128_cmpeq0( v128_t v )
{ return v128_and_eq0( v, v ); }
#endif
// Bitwise compare return 1 if all bits set.
#define v128_cmpeq1 _mm_test_all_ones
#define v128_cmpeq1(v) _mm_test_all_ones(v)
#define v128_one mm128_mov64_128( 1 )
#define m128_one_128 v128_one
#define v128_one mm128_mov64_128(1)
// ASM avoids the need to initialize the return variable, avoiding a compiler warning.
// Macro abstracts function parentheses to look like an identifier.
@@ -268,17 +249,14 @@ static inline __m128i v128_neg1_fn()
#endif
return a;
}
#define m128_neg1_fn v128_neg1_fn
#define v128_neg1 v128_neg1_fn()
#define m128_neg1 v128_neg1
//
// Vector pointer cast
// p = any aligned pointer
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))
#define castp_v128 castp_m128i
#define castp_v128(p) ((__m128i*)(p))
#define castp_v128u64 castp_v128
#define castp_v128u32 castp_v128
#define castp_v128u16 castp_v128
@@ -286,8 +264,7 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))
#define cast_v128 cast_m128i
#define cast_v128(p) (*((__m128i*)(p)))
#define cast_v128u64 cast_v128
#define cast_v128u32 cast_v128
#define cast_v128u16 cast_v128
@@ -295,8 +272,8 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
#define casti_v128 casti_m128i
#define casti_v128(p,i) (((__m128i*)(p))[(i)])
#define casti_m128i casti_v128 // deprecated
#define casti_v128u64 casti_v128
#define casti_v128u32 casti_v128
#define casti_v128u16 casti_v128
@@ -304,7 +281,7 @@ static inline __m128i v128_neg1_fn()
// p = any aligned pointer, o = scaled offset
// returns pointer p+o
#define casto_m128i(p,o) (((__m128i*)(p))+(o))
#define casto_v128(p,o) (((__m128i*)(p))+(o))
#if defined(__SSE4_1__)
#define v128_get64( v, l ) _mm_extract_epi64( v, l )
@@ -319,7 +296,7 @@ static inline __m128i v128_neg1_fn()
/////////////////////////////////////////////////////////////
//
// _mm_insert_ps( _mm128i v1, __m128i v2, imm8 c )
// _mm_insert_ps( __m128i v1, __m128i v2, imm8 c )
//
// Fast and powerful but very limited in its application.
// It requires SSE4.1 but only works with 128 bit vectors with 32 bit
@@ -374,115 +351,112 @@ static inline __m128i v128_neg1_fn()
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
static inline __m128i mm128_not( const __m128i v )
static inline __m128i v128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
#else
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
#define v128_not( v ) _mm_xor_si128( v, v128_neg1 )
#endif
#define v128_not mm128_not
static inline __m128i mm128_negate_64( __m128i v )
static inline v128u64_t v128_negate_64( v128u64_t v )
{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); }
#define v128_negate64 mm128_negate_64
static inline __m128i mm128_negate_32( __m128i v )
static inline v128u32_t v128_negate_32( v128u32_t v )
{ return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); }
#define v128_negate32 mm128_negate_32
static inline __m128i mm128_negate_16( __m128i v )
static inline v128u16_t v128_negate_16( v128u16_t v )
{ return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); }
#define v128_negate16 mm128_negate_16
// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
#define v128_add4_64( a, b, c, d ) \
_mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )
#define mm128_add4_32( a, b, c, d ) \
#define v128_add4_32( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )
#define v128_add4_32 mm128_add4_32
#define mm128_add4_16( a, b, c, d ) \
#define v128_add4_16( a, b, c, d ) \
_mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )
#define mm128_add4_8( a, b, c, d ) \
#define v128_add4_8( a, b, c, d ) \
_mm_add_epi8( _mm_add_epi8( a, b ), _mm_add_epi8( c, d ) )
#define mm128_xor4( a, b, c, d ) \
#define v128_xor4( a, b, c, d ) \
_mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) )
// Memory functions
// Mostly for convenience, avoids calculating bytes.
// Assumes data is aligned and integral.
// n = number of __m128i, bytes/16
static inline void memset_zero_128( __m128i *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; }
#define v128_memset_zero memset_zero_128
static inline void v128_memset_zero( v128_t *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = v128_zero; }
#define memset_zero_128 v128_memset_zero
static inline void memset_128( __m128i *dst, const __m128i a, const int n )
static inline void v128_memset( v128_t *dst, const v128_t a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
#define v128_memset memset_128
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v128_memcpy memcpy_128
#define memcpy_128 v128_memcpy
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
// a ^ b ^ c
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
#define v128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
// a & b & c
#define mm128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
#define v128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
#define v128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
#define v128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
#define v128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
#define v128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c )
#define mm128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
#define v128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#else
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define mm128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define mm128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define mm128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
#define v128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
#define mm128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define mm128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#endif
#define v128_ornot( a, b ) _mm_or_si128( a, v128_not( b ) )
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
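No SSE2 wrapper is shown in this hunk; a minimal sketch of the idea, with hypothetical names, could cast and use the float movemask intrinsics:

#include <emmintrin.h>

/* hypothetical: 2-bit mask from the sign bits of the two 64-bit lanes */
static inline int v128_movmask64_sketch( __m128i v )
{ return _mm_movemask_pd( _mm_castsi128_pd( v ) ); }

/* hypothetical: 4-bit mask from the sign bits of the four 32-bit lanes */
static inline int v128_movmask32_sketch( __m128i v )
{ return _mm_movemask_ps( _mm_castsi128_ps( v ) ); }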
@@ -499,79 +473,146 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Bit rotations
// Slow bit rotation, used as last resort
#define mm128_ror_64_sse2( v, c ) \
#define v128_shuffle16( v, c ) \
_mm_shufflehi_epi16( _mm_shufflelo_epi16( v, c ), c )
#define v128_qrev32(v) _mm_shuffle_epi32( v, 0xb1 )
#define v128_swap64_32(v) _mm_shuffle_epi32( v, 0xb1 ) // grandfathered
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )
// These should never be called from application code; use rol/ror.
#define v128_ror64_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
#define mm128_rol_64_sse2( v, c ) \
#define v128_rol64_sse2( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
#define mm128_ror_32_sse2( v, c ) \
#define v128_ror32_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
#define mm128_rol_32_sse2( v, c ) \
#define v128_rol32_sse2( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#if defined(__AVX512VL__)
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
// AVX512: fastest for all rotations.
#define v128_ror64 _mm_ror_epi64
#define v128_rol64 _mm_rol_epi64
#define v128_ror32 _mm_ror_epi32
#define v128_rol32 _mm_rol_epi32
// ror/rol will always select the fastest implementation, but these names
// may fit better in application code that performs shuffles rather than
// bit rotations.
#define v128_shuflr64_8( v) _mm_ror_epi64( v, 8 )
#define v128_shufll64_8( v) _mm_rol_epi64( v, 8 )
#define v128_shuflr64_16(v) _mm_ror_epi64( v, 16 )
#define v128_shufll64_16(v) _mm_rol_epi64( v, 16 )
#define v128_shuflr64_24(v) _mm_ror_epi64( v, 24 )
#define v128_shufll64_24(v) _mm_rol_epi64( v, 24 )
#define v128_shuflr32_8( v) _mm_ror_epi32( v, 8 )
#define v128_shufll32_8( v) _mm_rol_epi32( v, 8 )
#define v128_shuflr32_16(v) _mm_ror_epi32( v, 16 )
#define v128_shufll32_16(v) _mm_rol_epi32( v, 16 )
// optimized byte-wise rotation
#elif defined(__SSSE3__)
// SSSE3: fastest 32 bit, very fast 16, fast 8
#define mm128_ror_64( v, c ) \
( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 24 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) \
: ( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) \
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) \
: mm128_ror_64_sse2( v, c )
#define v128_shuflr64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
#define mm128_rol_64( v, c ) \
( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 24 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) \
: ( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) \
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) \
: mm128_rol_64_sse2( v, c )
#define v128_shufll64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
#define mm128_ror_32( v, c ) \
( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) \
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) \
: mm128_ror_32_sse2( v, c )
#define v128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#define mm128_rol_32( v, c ) \
( (c) == 16 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) \
: ( (c) == 8 ) ? _mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) \
: mm128_rol_32_sse2( v, c )
#define v128_shufll64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
#define v128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#define v128_shufll32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0f0a09080b, 0x0605040702010003 ) )
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 40 ) ? v128_shufll64_24( v ) \
: ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: v128_ror64_sse2( v, c )
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 40 ) ? v128_shuflr64_24( v ) \
: ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: v128_rol64_sse2( v, c )
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? v128_lrev16( v ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: v128_ror32_sse2( v, c )
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? v128_lrev16( v ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: v128_rol32_sse2( v, c )
#elif defined(__SSE2__)
// SSE2: fastest 32 bit, very fast 16
#define v128_ror64( v, c ) \
( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
: v128_ror64_sse2( v, c )
#define v128_rol64( v, c ) \
( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
: ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
: v128_rol64_sse2( v, c )
#define v128_ror32( v, c ) \
( (c) == 16 ) ? v128_lrev16( v ) \
: v128_ror32_sse2( v, c )
#define v128_rol32( v, c ) \
( (c) == 16 ) ? v128_lrev16( v ) \
: v128_rol32_sse2( v, c )
#else
#define mm128_ror_64 mm128_ror_64_sse2
#define mm128_rol_64 mm128_rol_64_sse2
#define mm128_ror_32 mm128_ror_32_sse2
#define mm128_rol_32 mm128_rol_32_sse2
#define v128_ror64 v128_ror64_sse2
#define v128_rol64 v128_rol64_sse2
#define v128_ror32 v128_ror32_sse2
#define v128_rol32 v128_rol32_sse2
#endif
// Architecturally agnostic naming
#define v128_ror64 mm128_ror_64
#define v128_rol64 mm128_rol_64
#define v128_ror32 mm128_ror_32
#define v128_rol32 mm128_rol_32
//#define v128_ror64 mm128_ror_64
//#define v128_rol64 mm128_rol_64
//#define v128_ror32 mm128_ror_32
#define mm128_rol_32 v128_rol32
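Because the rotate count is nearly always a literal, the selector chains above collapse at compile time; an illustrative caller (assumes simd-utils.h is included):

/* resolves to a single pshufd 0xb1 on SSE2, not shift/shift/or */
static inline __m128i ror64_by32_demo( __m128i v )
{ return v128_ror64( v, 32 ); }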
/* not used
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
@@ -579,25 +620,25 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm128_rorx2_64( v1, v0, c ) \
#define v128_2ror64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
_mm_ror_epi64( v1, c )
#define mm128_rolx2_64( v1, v0, c ) \
#define v128_2rol64( v1, v0, c ) \
_mm_rol_epi64( v0, c ); \
_mm_rol_epi64( v1, c )
#define mm128_rorx2_32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
_mm_ror_epi32( v0, c ); \
_mm_ror_epi32( v1, c )
#define mm128_rolx2_32( v1, v0, c ) \
#define mm128_2rol32( v1, v0, c ) \
_mm_rol_epi32( v0, c ); \
_mm_rol_epi32( v1, c )
#else // SSE2
#define mm128_rorx2_64( v1, v0, c ) \
#define v128_2ror64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
__m128i t1 = _mm_srli_epi64( v1, c ); \
@@ -607,7 +648,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
v1 = _mm_or_si128( v1, t1 ); \
}
#define mm128_rolx2_64( v1, v0, c ) \
#define v128_2rol64( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi64( v0, c ); \
__m128i t1 = _mm_slli_epi64( v1, c ); \
@@ -617,7 +658,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
v1 = _mm_or_si128( v1, t1 ); \
}
#define mm128_rorx2_32( v1, v0, c ) \
#define v128_2ror32( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi32( v0, c ); \
__m128i t1 = _mm_srli_epi32( v1, c ); \
@@ -627,7 +668,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
v1 = _mm_or_si128( v1, t1 ); \
}
#define mm128_rolx2_32( v1, v0, c ) \
#define v128_2rol32( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi32( v0, c ); \
__m128i t1 = _mm_slli_epi32( v1, c ); \
@@ -638,12 +679,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
}
#endif // AVX512 else SSE2
#define v128_2ror64 mm128_rorx2_64
#define v128_2rol64 mm128_rolx2_64
#define v128_2ror32 mm128_rorx2_32
#define v128_2rol32 mm128_rolx2_32
*/
// Cross lane shuffles
@@ -669,9 +705,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Rotate vector elements accross all lanes
#define v128_shuffle16( v, c ) \
_mm_or_si128( _mm_shufflehi_epi16( v, c ), _mm_shufflelo_epi16( v, c ) )
// reverse elements in vector
#define v128_swap64(v) _mm_shuffle_epi32( v, 0x4e ) // grandfathered
#define v128_rev64(v) _mm_shuffle_epi32( v, 0x4e ) // preferred
@@ -685,227 +718,204 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )
// Some sub-vector shuffles are identical to bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage of these versions
// is context sensitive.
// reverse elements in vector lanes
#define v128_qrev32(v) v128_ror64( v, 32 )
#define v128_swap64_32(v) v128_ror64( v, 32 ) // grandfathered
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_ror32( v, 16 )
// alias bswap
#define v128_qrev8(v) _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) )
#define v128_lrev8(v) _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) )
#define v128_wrev8(v) _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 0,1 ) )
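
All of these reversals are fixed permutations; v128_qrev32, for instance, is literally a 64 bit rotate by 32. A scalar model for reference (qrev32_scalar is an illustrative helper, not part of the header):

#include <stdint.h>

// Scalar model of v128_qrev32 / v128_swap64_32: swap the 32 bit halves of
// each 64 bit lane, identical to rotating each lane by 32 either direction.
static inline uint64_t qrev32_scalar( uint64_t x )
{   return ( x >> 32 ) | ( x << 32 );   }
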
// reverse bits, can it be done?
//#define v128_bitrev8( v ) vrbitq_u8
/* Not used
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
#endif
*/
// Endian byte swap.
#if defined(__SSSE3__)
#define mm128_bswap_128( v ) \
#define v128_bswap128( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \
0x08090a0b0c0d0e0f ) )
#define mm128_bswap_64( v ) \
#define v128_bswap64( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
#define mm128_bswap_32( v ) \
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
#define mm128_bswap_32 v128_bswap32
#define mm128_bswap_16( v ) \
#define v128_bswap16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
                     0x0607040502030001 ) )
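
Each pshufb control above lists, from the highest destination byte down to the lowest, which source byte it takes, so the bswap32 mask simply reverses every 4 byte group. The scalar equivalent of v128_bswap32, for reference (bswap32_scalar is illustrative only):

#include <stdint.h>

// Byte-reverse each 32 bit element, the scalar equivalent of v128_bswap32.
static inline uint32_t bswap32_scalar( uint32_t x )
{
   return ( x >> 24 ) | ( ( x >> 8 ) & 0x0000ff00 )
        | ( ( x << 8 ) & 0x00ff0000 ) | ( x << 24 );
}
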
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_1024( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define mm128_block_bswap_32( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m128i( d,0 ) = _mm_shuffle_epi8( casti_m128i( s,0 ), ctl ); \
casti_m128i( d,1 ) = _mm_shuffle_epi8( casti_m128i( s,1 ), ctl ); \
casti_m128i( d,2 ) = _mm_shuffle_epi8( casti_m128i( s,2 ), ctl ); \
casti_m128i( d,3 ) = _mm_shuffle_epi8( casti_m128i( s,3 ), ctl ); \
casti_m128i( d,4 ) = _mm_shuffle_epi8( casti_m128i( s,4 ), ctl ); \
casti_m128i( d,5 ) = _mm_shuffle_epi8( casti_m128i( s,5 ), ctl ); \
casti_m128i( d,6 ) = _mm_shuffle_epi8( casti_m128i( s,6 ), ctl ); \
casti_m128i( d,7 ) = _mm_shuffle_epi8( casti_m128i( s,7 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
#define mm128_block_bswap32_128( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
}
#define v128_block_bswap32_512( d, s ) \
{ \
__m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), ctl ); \
casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), ctl ); \
casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), ctl ); \
casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), ctl ); \
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), ctl ); \
casti_m128i( d, 5 ) = _mm_shuffle_epi8( casti_m128i( s, 5 ), ctl ); \
casti_m128i( d, 6 ) = _mm_shuffle_epi8( casti_m128i( s, 6 ), ctl ); \
casti_m128i( d, 7 ) = _mm_shuffle_epi8( casti_m128i( s, 7 ), ctl ); \
casti_m128i( d, 8 ) = _mm_shuffle_epi8( casti_m128i( s, 8 ), ctl ); \
casti_m128i( d, 9 ) = _mm_shuffle_epi8( casti_m128i( s, 9 ), ctl ); \
casti_m128i( d,10 ) = _mm_shuffle_epi8( casti_m128i( s,10 ), ctl ); \
casti_m128i( d,11 ) = _mm_shuffle_epi8( casti_m128i( s,11 ), ctl ); \
casti_m128i( d,12 ) = _mm_shuffle_epi8( casti_m128i( s,12 ), ctl ); \
casti_m128i( d,13 ) = _mm_shuffle_epi8( casti_m128i( s,13 ), ctl ); \
casti_m128i( d,14 ) = _mm_shuffle_epi8( casti_m128i( s,14 ), ctl ); \
casti_m128i( d,15 ) = _mm_shuffle_epi8( casti_m128i( s,15 ), ctl ); \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}
#else // SSE2
static inline __m128i mm128_bswap_64( __m128i v )
static inline v128_t v128_bswap64( __m128i v )
{
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
}
static inline __m128i mm128_bswap_32( __m128i v )
static inline v128_t v128_bswap32( __m128i v )
{
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}
#define mm128_bswap_32 v128_bswap32
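
Without SSSE3 there is no byte shuffle, so the swap is decomposed into a byte swap within each 16 bit word (a shift pair plus OR) followed by a 16 bit word permute. The same decomposition in scalar form (illustrative only):

#include <stdint.h>

// bswap32 the SSE2 way: swap bytes within each 16 bit word, then swap words.
static inline uint32_t bswap32_sse2_model( uint32_t x )
{
   x = ( ( x << 8 ) & 0xff00ff00 ) | ( ( x >> 8 ) & 0x00ff00ff );
   return ( x << 16 ) | ( x >> 16 );
}
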
static inline __m128i mm128_bswap_16( __m128i v )
static inline v128_t v128_bswap16( __m128i v )
{
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
#define mm128_bswap_128( v ) v128_swap64( v128_bswap64( v ) )
#define v128_bswap128( v ) v128_swap64( v128_bswap64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
{
d[0] = mm128_bswap_64( s[0] );
d[1] = mm128_bswap_64( s[1] );
d[2] = mm128_bswap_64( s[2] );
d[3] = mm128_bswap_64( s[3] );
d[4] = mm128_bswap_64( s[4] );
d[5] = mm128_bswap_64( s[5] );
d[6] = mm128_bswap_64( s[6] );
d[7] = mm128_bswap_64( s[7] );
d[0] = v128_bswap64( s[0] );
d[1] = v128_bswap64( s[1] );
d[2] = v128_bswap64( s[2] );
d[3] = v128_bswap64( s[3] );
d[4] = v128_bswap64( s[4] );
d[5] = v128_bswap64( s[5] );
d[6] = v128_bswap64( s[6] );
d[7] = v128_bswap64( s[7] );
}
#define v128_block_bswap64_512 mm128_block_bswap_64
static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
d[ 0] = mm128_bswap_64( s[ 0] );
d[ 1] = mm128_bswap_64( s[ 1] );
d[ 2] = mm128_bswap_64( s[ 2] );
d[ 3] = mm128_bswap_64( s[ 3] );
d[ 4] = mm128_bswap_64( s[ 4] );
d[ 5] = mm128_bswap_64( s[ 5] );
d[ 6] = mm128_bswap_64( s[ 6] );
d[ 7] = mm128_bswap_64( s[ 7] );
d[ 8] = mm128_bswap_64( s[ 8] );
d[ 9] = mm128_bswap_64( s[ 9] );
d[10] = mm128_bswap_64( s[10] );
d[11] = mm128_bswap_64( s[11] );
d[12] = mm128_bswap_64( s[12] );
d[13] = mm128_bswap_64( s[13] );
d[14] = mm128_bswap_64( s[14] );
d[15] = mm128_bswap_64( s[15] );
d[ 0] = v128_bswap64( s[ 0] );
d[ 1] = v128_bswap64( s[ 1] );
d[ 2] = v128_bswap64( s[ 2] );
d[ 3] = v128_bswap64( s[ 3] );
d[ 4] = v128_bswap64( s[ 4] );
d[ 5] = v128_bswap64( s[ 5] );
d[ 6] = v128_bswap64( s[ 6] );
d[ 7] = v128_bswap64( s[ 7] );
d[ 8] = v128_bswap64( s[ 8] );
d[ 9] = v128_bswap64( s[ 9] );
d[10] = v128_bswap64( s[10] );
d[11] = v128_bswap64( s[11] );
d[12] = v128_bswap64( s[12] );
d[13] = v128_bswap64( s[13] );
d[14] = v128_bswap64( s[14] );
d[15] = v128_bswap64( s[15] );
}
static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
{
d[0] = mm128_bswap_32( s[0] );
d[1] = mm128_bswap_32( s[1] );
d[2] = mm128_bswap_32( s[2] );
d[3] = mm128_bswap_32( s[3] );
d[4] = mm128_bswap_32( s[4] );
d[5] = mm128_bswap_32( s[5] );
d[6] = mm128_bswap_32( s[6] );
d[7] = mm128_bswap_32( s[7] );
d[0] = v128_bswap32( s[0] );
d[1] = v128_bswap32( s[1] );
d[2] = v128_bswap32( s[2] );
d[3] = v128_bswap32( s[3] );
d[4] = v128_bswap32( s[4] );
d[5] = v128_bswap32( s[5] );
d[6] = v128_bswap32( s[6] );
d[7] = v128_bswap32( s[7] );
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
{
d[ 0] = mm128_bswap_32( s[ 0] );
d[ 1] = mm128_bswap_32( s[ 1] );
d[ 2] = mm128_bswap_32( s[ 2] );
d[ 3] = mm128_bswap_32( s[ 3] );
d[ 4] = mm128_bswap_32( s[ 4] );
d[ 5] = mm128_bswap_32( s[ 5] );
d[ 6] = mm128_bswap_32( s[ 6] );
d[ 7] = mm128_bswap_32( s[ 7] );
d[ 8] = mm128_bswap_32( s[ 8] );
d[ 9] = mm128_bswap_32( s[ 9] );
d[10] = mm128_bswap_32( s[10] );
d[11] = mm128_bswap_32( s[11] );
d[12] = mm128_bswap_32( s[12] );
d[13] = mm128_bswap_32( s[13] );
d[14] = mm128_bswap_32( s[14] );
d[15] = mm128_bswap_32( s[15] );
d[ 0] = v128_bswap32( s[ 0] );
d[ 1] = v128_bswap32( s[ 1] );
d[ 2] = v128_bswap32( s[ 2] );
d[ 3] = v128_bswap32( s[ 3] );
d[ 4] = v128_bswap32( s[ 4] );
d[ 5] = v128_bswap32( s[ 5] );
d[ 6] = v128_bswap32( s[ 6] );
d[ 7] = v128_bswap32( s[ 7] );
d[ 8] = v128_bswap32( s[ 8] );
d[ 9] = v128_bswap32( s[ 9] );
d[10] = v128_bswap32( s[10] );
d[11] = v128_bswap32( s[11] );
d[12] = v128_bswap32( s[12] );
d[13] = v128_bswap32( s[13] );
d[14] = v128_bswap32( s[14] );
d[15] = v128_bswap32( s[15] );
}
#endif // SSSE3 else SSE2
#define v128_bswap32 mm128_bswap_32
#define v128_bswap64 mm128_bswap_64
#define v128_bswap128 mm128_bswap_128
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64
@@ -915,24 +925,20 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#if defined(__SSSE3__)
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#define v128_alignr8 _mm_alignr_epi8
#define v128_alignr64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define v128_alignr32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#else
#define mm128_alignr_64( hi, lo, c ) \
#define v128_alignr64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
#define mm128_alignr_32( hi, lo, c ) \
#define v128_alignr32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, 16-((c)*4) ), _mm_srli_si128( lo, (c)*4 ) )
#endif
// NEON only uses a vector mask. x86_64 blendv selects the second arg when
// the element's sign bit is set. AND masking is the opposite: elements are
// selected from the first arg when the mask bits are set.
// Arm blend is a bit by bit blend while x86_64 is an element blend.
// Reverse the logic so the mask usage is consistent with both formats.
#if defined(__SSE4_1__)
#define v128_blendv _mm_blendv_epi8
@@ -940,7 +946,7 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#else
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v0 ), v128_and( mask, v1 ) )
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )
#endif
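
With the reversed logic the SSE2 fallback agrees with _mm_blendv_epi8: set mask bits select from v0, clear bits from v1. A scalar model (blendv_scalar is illustrative; it assumes each mask element is all ones or all zeros, as the NEON form requires):

#include <stdint.h>

// Scalar model of v128_blendv( v1, v0, mask ): mask bits set select from v0,
// clear bits select from v1, matching _mm_blendv_epi8 for saturated masks.
static inline uint64_t blendv_scalar( uint64_t v1, uint64_t v0, uint64_t mask )
{   return ( ~mask & v1 ) | ( mask & v0 );   }
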

View File

@@ -90,7 +90,7 @@ typedef union
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
#define m256_one_128 mm256_bcast_m128( m128_one_128 )
#define m256_one_128 mm256_bcast_m128( v128_one )
static inline __m256i mm256_neg1_fn()
{
@@ -218,7 +218,29 @@ static inline __m256i mm256_not( const __m256i v )
//
// Bit rotations.
// Slow version, used as last resort
#define mm256_shuffle16( v, c ) \
_mm256_shufflehi_epi16( _mm256_shufflelo_epi16( v, c ), c )
#define mm256_qrev32(v) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_swap64_32 mm256_qrev32 // grandfathered
#define mm256_qrev16(v) mm256_shuffle16( v, 0x1b )
#define mm256_qrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
#define mm256_lrev16(v) mm256_shuffle16( v, 0xb1 )
#define mm256_lrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
#define mm256_wrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
// These should never be called directly by applications.
#define mm256_ror_64_avx2( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
@@ -242,40 +264,76 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
// Redundant but naming may be a better fit in some applications.
#define mm256_shuflr64_8( v) _mm256_ror_epi64( v, 8 )
#define mm256_shufll64_8( v) _mm256_rol_epi64( v, 8 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_24(v) _mm256_ror_epi64( v, 24 )
#define mm256_shufll64_24(v) _mm256_rol_epi64( v, 24 )
#define mm256_shuflr32_8( v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8( v) _mm256_rol_epi32( v, 8 )
#define mm256_shuflr32_16(v) _mm256_ror_epi32( v, 16 )
#define mm256_shufll32_16(v) _mm256_rol_epi32( v, 16 )
#else
// ROR & ROL will always select the fastest implementation, but these names
// may be a better fit in some applications.
#define mm256_shuflr64_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) )
#define mm256_shufll64_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) )
#define mm256_shuflr64_24( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) )
#define mm256_shufll64_24( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) )
#define mm256_shuflr32_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) )
#define mm256_shufll32_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) )
#define mm256_ror_64( v, c ) \
( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 24 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) ) \
: ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) ) \
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) ) \
( (c) == 8 ) ? mm256_shuflr64_8( v ) \
: ( (c) == 16 ) ? mm256_shuffle16( v, 0x39 ) \
: ( (c) == 24 ) ? mm256_shuflr64_24( v ) \
: ( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 40 ) ? mm256_shufll64_24( v ) \
: ( (c) == 48 ) ? mm256_shuffle16( v, 0x93 ) \
: ( (c) == 56 ) ? mm256_shufll64_8( v ) \
: mm256_ror_64_avx2( v, c )
#define mm256_rol_64( v, c ) \
( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 24 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) ) \
: ( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) ) \
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) ) \
( (c) == 8 ) ? mm256_shufll64_8( v ) \
: ( (c) == 16 ) ? mm256_shuffle16( v, 0x93 ) \
: ( (c) == 24 ) ? mm256_shufll64_24( v ) \
: ( (c) == 32 ) ? _mm256_shuffle_epi32( v, 0xb1 ) \
: ( (c) == 40 ) ? mm256_shuflr64_24( v ) \
: ( (c) == 48 ) ? mm256_shuffle16( v, 0x39 ) \
: ( (c) == 56 ) ? mm256_shuflr64_8( v ) \
: mm256_rol_64_avx2( v, c )
#define mm256_ror_32( v, c ) \
( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) )\
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) ) \
( (c) == 8 ) ? mm256_shuflr32_8( v ) \
: ( (c) == 16 ) ? mm256_lrev16( v ) \
: ( (c) == 24 ) ? mm256_shufll32_8( v ) \
: mm256_ror_32_avx2( v, c )
#define mm256_rol_32( v, c ) \
( (c) == 16 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) ) \
: ( (c) == 8 ) ? _mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) ) \
( (c) == 8 ) ? mm256_shufll32_8( v ) \
: ( (c) == 16 ) ? mm256_lrev16( v ) \
: ( (c) == 24 ) ? mm256_shuflr32_8( v ) \
: mm256_rol_32_avx2( v, c )
#endif
@@ -400,25 +458,19 @@ static inline __m256i mm256_not( const __m256i v )
/* Not used
// Rotate 256 bit vector by one 32 bit element.
#if defined(__AVX512VL__)
static inline __m256i mm256_shuflr_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 1 ); }
static inline __m256i mm256_shufll_32( const __m256i v )
{ return _mm256_alignr_epi32( v, v, 15 ); }
#else
#define mm256_shuflr_32( v ) \
_mm256_permutevar8x32_epi32( v, \
_mm256_set_spi64x( 0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 ) )
#define mm256_shufll_32( v ) \
_mm256_permutevar8x32_epi32( v, \
_mm256_set_epi64x( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ) )
#endif
*/
@@ -450,21 +502,6 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
*/
// Same as bit rotation but logically used as byte/word rotation.
#define mm256_swap64_32( v ) mm256_ror_64( v, 32 ) // grandfathered
#define mm256_rev64_32( v ) mm256_ror_64( v, 32 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_8(v) _mm256_ror_epi64( v, 8 )
#define mm256_shufll64_8(v) _mm256_rol_epi64( v, 8 )
#define mm256_rev32_16( v ) mm256_ror_32( v, 16 )
#define mm256_shuflr32_8(v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8(v) _mm256_rol_epi32( v, 8 )
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \

View File

@@ -1,6 +1,9 @@
#if !defined(SIMD_INT_H__)
#define SIMD_INT_H__ 1
//TODO compile time test for byte order,
// be64 etc. using HW bswap.
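
One possible shape for the compile time byte order test mentioned in the TODO, using the compiler's predefined __BYTE_ORDER__ macros (a sketch, not present in the code; be64/le64 are hypothetical names built on the bswap_64 below):

#if defined(__BYTE_ORDER__) && ( __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ )
   #define be64( x ) ( x )             // host is big endian, no swap needed
   #define le64( x ) bswap_64( x )
#else
   #define be64( x ) bswap_64( x )     // little endian host, swap for BE
   #define le64( x ) ( x )
#endif
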
//
// Endian byte swap
#if defined(__x86_64__)
@@ -9,8 +12,6 @@
#elif defined(__aarch64__)
//#pragma message "aarch64 fast bswap"
static inline uint64_t bswap_64( uint64_t a )
{
uint64_t b;

View File

@@ -21,36 +21,36 @@
//
// vornq( v1, v0 ) or( v1, not( v0 ) )
#define v128_t uint32x4_t // default,
#define v128u64_t uint64x2_t
#define v128u32_t uint32x4_t
#define v128u16_t uint16x8_t
#define v128u8_t uint8x16_t
#define v128_t uint32x4_t // default,
#define v128u64_t uint64x2_t
#define v128u32_t uint32x4_t
#define v128u16_t uint16x8_t
#define v128u8_t uint8x16_t
// load & store
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128u64_load( p ) vld1q_u64( (uint64_t*)(p) )
#define v128u64_store( p, v ) vst1q_u64( (uint64_t*)(p), v )
#define v128u32_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128u32_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128u16_load( p ) vld1q_u16( (uint16_t*)(p) )
#define v128u16_store( p, v ) vst1q_u16( (uint16_t*)(p), v )
#define v128u8_load( p ) vld1q_u8( (uint8_t*)(p) )
#define v128u8_store( p, v ) vst1q_u8( (uint8_t*)(p), v )
#define v128u64_load( p ) vld1q_u64( (uint64_t*)(p) )
#define v128u64_store( p, v ) vst1q_u64( (uint64_t*)(p), v )
#define v128u32_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128u32_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128u16_load( p ) vld1q_u16( (uint16_t*)(p) )
#define v128u16_store( p, v ) vst1q_u16( (uint16_t*)(p), v )
#define v128u8_load( p )       vld1q_u8( (uint8_t*)(p) )
#define v128u8_store( p, v )   vst1q_u8( (uint8_t*)(p), v )
// load & set1 combined
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
#define v128_load1_8( p) vld1q_dup_u8( (uint8_t*) (p) )
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
#define v128_load1_8( p) vld1q_dup_u8( (uint8_t*) (p) )
// arithmetic
#define v128_add64 vaddq_u64
#define v128_add32 vaddq_u32
#define v128_add16 vaddq_u16
#define v128_add8 vaddq_u8
#define v128_add64 vaddq_u64
#define v128_add32 vaddq_u32
#define v128_add16 vaddq_u16
#define v128_add8 vaddq_u8
#define v128_add4_64( v3, v2, v1, v0 ) \
vaddq_u64( vaddq_u64( v3, v2 ), vaddq_u64( v1, v0 ) )
@@ -58,15 +58,15 @@
#define v128_add4_32( v3, v2, v1, v0 ) \
vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )
#define v128_sub64 vsubq_u64
#define v128_sub32 vsubq_u32
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
#define v128_sub64 vsubq_u64
#define v128_sub32 vsubq_u32
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
// returns low half; u64 variant is undocumented and may not exist.
#define v128_mul64 vmulq_u64
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
#define v128_mul64 vmulq_u64
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
// slow, tested with argon2d
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
@@ -76,110 +76,130 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
}
// compare
#define v128_cmpeq64 vceqq_u64
#define v128_cmpeq32 vceqq_u32
#define v128_cmpeq16 vceqq_u16
#define v128_cmpeq8 vceqq_u8
#define v128_cmpeq64 vceqq_u64
#define v128_cmpeq32 vceqq_u32
#define v128_cmpeq16 vceqq_u16
#define v128_cmpeq8 vceqq_u8
#define v128_cmpeq0 vceqzq_u64
// alternative names: v128_cmp0, v128_cmpz, v128_testz
#define v128_iszero vceqzq_u64
// Not yet needed
//#define v128_cmpeq1
#define v128_cmpgt64 vcgtq_u64
#define v128_cmpgt32 vcgtq_u32
#define v128_cmpgt16 vcgtq_u16
#define v128_cmpgt8 vcgtq_u8
#define v128_cmpgt64( v1, v0 ) vcgtq_s64( (int64x2_t)v1, (int64x2_t)v0 )
#define v128_cmpgt32( v1, v0 ) vcgtq_s32( (int32x4_t)v1, (int32x4_t)v0 )
#define v128_cmpgt16( v1, v0 ) vcgtq_s16( (int16x8_t)v1, (int16x8_t)v0 )
#define v128_cmpgt8( v1, v0 ) vcgtq_s8( (int8x16_t)v1, (int8x16_t)v0 )
#define v128_cmplt64 vcltq_u64
#define v128_cmplt32 vcltq_u32
#define v128_cmplt16 vcltq_u16
#define v128_cmplt8 vcltq_u8
#define v128_cmplt64( v1, v0 ) vcltq_s64( (int64x2_t)v1, (int64x2_t)v0 )
#define v128_cmplt32( v1, v0 ) vcltq_s32( (int32x4_t)v1, (int32x4_t)v0 )
#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 )
#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)v0 )
// bit shift
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16
#define v128_sl8 vshlq_n_u8
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16
#define v128_sl8 vshlq_n_u8
#define v128_sr64 vshrq_n_u64
#define v128_sr32 vshrq_n_u32
#define v128_sr16 vshrq_n_u16
#define v128_sr8 vshrq_n_u8
#define v128_sr64 vshrq_n_u64
#define v128_sr32 vshrq_n_u32
#define v128_sr16 vshrq_n_u16
#define v128_sr8 vshrq_n_u8
// Unit tested, working.
#define v128_sra64 vshrq_n_s64
#define v128_sra32 vshrq_n_s32
#define v128_sra16 vshrq_n_s16
#define v128_sra64( v, c ) vshrq_n_s64( (int64x2_t)v, c )
#define v128_sra32( v, c ) vshrq_n_s32( (int32x4_t)v, c )
#define v128_sra16( v, c ) vshrq_n_s16( (int16x8_t)v, c )
// unary logic
#define v128_not vmvnq_u32
#define v128_not vmvnq_u32
// binary logic
#define v128_or vorrq_u32
#define v128_and vandq_u32
#define v128_xor veorq_u32
#define v128_or vorrq_u32
#define v128_and vandq_u32
#define v128_xor veorq_u32
// ~v1 & v0
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32( v1 ), v0 )
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32( v1 ), v0 )
// ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, x86_64 convention, first arg is not'ed
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
// ternary logic
// v2 ^ v1 ^ v0
// veorq_u32 not defined
//#define v128_xor3 veor3q_u32
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
// v2 & v1 & v0
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
// v2 | v1 | v0
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
// a ^ ( ~b & c )
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
// a ^ ( b & c )
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
// a & ( b ^ c )
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
// a ^ ( b | c )
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
// v2 | ( v1 & v0 )
#define v128_orand( v2, v1, v0 ) v128_or( v2, v128_and( v1, v0 ) )
#define v128_orand( v2, v1, v0 ) v128_or( v2, v128_and( v1, v0 ) )
// shift 2 concatenated vectors right.
#define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c )
#define v128_alignr32( v1, v0, c ) vextq_u32( v0, v1, c )
#define v128_alignr8( v1, v0, c ) vextq_u8( v0, v1, c )
#define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c )
#define v128_alignr32( v1, v0, c ) vextq_u32( v0, v1, c )
#define v128_alignr8( v1, v0, c ) vextq_u8( v0, v1, c )
// Interleave high or low half of 2 vectors.
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v1, v0 )
#define v128_unpackhi64( v1, v0 ) vzip2q_u64( v1, v0 )
#define v128_unpacklo32( v1, v0 ) vzip1q_u32( v1, v0 )
#define v128_unpackhi32( v1, v0 ) vzip2q_u32( v1, v0 )
#define v128_unpacklo16( v1, v0 ) vzip1q_u16( v1, v0 )
#define v128_unpackhi16( v1, v0 ) vzip2q_u16( v1, v0 )
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v1, v0 )
#define v128_unpackhi64( v1, v0 ) vzip2q_u64( v1, v0 )
#define v128_unpacklo32( v1, v0 ) vzip1q_u32( v1, v0 )
#define v128_unpackhi32( v1, v0 ) vzip2q_u32( v1, v0 )
#define v128_unpacklo16( v1, v0 ) vzip1q_u16( v1, v0 )
#define v128_unpackhi16( v1, v0 ) vzip2q_u16( v1, v0 )
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
// AES
// consistent with Intel AES, break up for optimizing
#define v128_aesenc( v, k ) vaesmcq_u8( vaeseq_u8( v, k ) )
#define v128_aesenclast( v, k ) vaeseq_u8( v, k )
// consistent with Intel AES intrinsics, break up for optimizing
#define v128_aesenc( v, k ) \
v128_xor( k, vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) )
#define v128_aesenc_nokey( v ) \
vaesmcq_u8( vaeseq_u8( v, v128_zero ) )
#define v128_aesenclast( v, k ) \
v128_xor( k, vaeseq_u8( v, v128_zero ) )
#define v128_aesenclast_nokey( v ) \
vaeseq_u8( v, v128_zero )
#define v128_aesdec( v, k ) \
v128_xor( k, vaesimcq_u8( vaesdq_u8( v, v128_zero ) ) )
#define v128_aesdec_nokey( v ) \
vaesimcq_u8( vaesdq_u8( v, v128_zero ) )
#define v128_aesdeclast( v, k ) \
v128_xor( k, vaesdq_u8( v, v128_zero ) )
#define v128_aesdeclast_nokey( v ) \
vaesdq_u8( v, v128_zero )
#define v128_aesdec( v, k ) vaesimcq_u8( vaesdq_u8( v, k ) )
#define v128_aesdeclast( v, k ) vaesdq_u8( v, k )
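
The split reflects the two ISAs' round structures: Intel AESENC XORs the round key in after MixColumns while Arm AESE XORs it in first, so feeding AESE a zero key and XORing the real key afterwards makes these macros bit compatible with _mm_aesenc_si128. The _nokey forms let a caller schedule that XOR separately, e.g. (a sketch assuming NEON AES and the macros above; aes_two_rounds is illustrative and v128_zero is the header's zero vector):

#include <arm_neon.h>

// Two AES rounds with the key XOR hoisted out of the AESE/AESMC pair,
// which the compiler can then merge with neighbouring xors.
static inline uint8x16_t aes_two_rounds( uint8x16_t x, uint8x16_t k0,
                                         uint8x16_t k1 )
{
   x = v128_aesenc_nokey( x );      // SubBytes + ShiftRows + MixColumns
   x = veorq_u8( x, k0 );           // AddRoundKey, Intel ordering
   x = v128_aesenc_nokey( x );
   return veorq_u8( x, k1 );
}
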
typedef union
{
@@ -189,7 +209,7 @@ typedef union
} __attribute__ ((aligned (16))) v128_ovly;
// Broadcast lane 0 to all lanes
// Broadcast lane 0 to all lanes, consistent with x86_64 broadcast
#define v128_bcast64(v) vdupq_laneq_u64( v, 0 )
#define v128_bcast32(v) vdupq_laneq_u32( v, 0 )
#define v128_bcast16(v) vdupq_laneq_u16( v, 0 )
@@ -317,27 +337,27 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
// Bit rotation
#define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
#define v128_rol64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)v) ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
#define v128_ror32( v, c ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
#define v128_rol32( v, c ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)v) ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
#define v128_ror16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
: vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
#define v128_rol16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
: vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
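
AArch64 has no vector rotate instruction, so these macros build one from a shift plus a shift-and-insert (vsri/vsli), saving the separate OR, while the half-width special cases map to the cheaper vrev lane reversals. The scalar operation being modelled (ror64_scalar is illustrative):

#include <stdint.h>

// What v128_ror64( v, c ) computes per lane:
// vsriq_n_u64( vshlq_n_u64( v, 64-c ), v, c ) keeps the top c bits of
// v << (64-c) (the low c bits of v) and inserts v >> c below them,
// i.e. a plain rotate right by c.
static inline uint64_t ror64_scalar( uint64_t v, int c )
{   return ( v >> c ) | ( v << ( 64 - c ) );   }
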
#define v128_ror8( v, c ) \
@@ -414,6 +434,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] )
// Sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster and
// preferred. Bit rotation already promotes faster widths. Usage is context
// sensitive.

View File

@@ -15,7 +15,7 @@
#include <string.h>
#include "miner.h"
#if defined(__aarch64__)
#if defined(__aarch64__) && !defined(__APPLE__)
// for arm's "cpuid"
#include <sys/auxv.h>
#include <asm/hwcap.h>
@@ -141,26 +141,13 @@ static inline void linux_cpu_hilo_freq( float *lo, float *hi )
*lo = (float)lo_freq;
}
#else /* WIN32 */
static inline float win32_cputemp( int core )
{
// todo
return 0.0;
}
#endif /* !WIN32 */
/* exports */
static inline float cpu_temp( int core )
{
#ifdef WIN32
return win32_cputemp( core );
return 0.;
#else
return linux_cputemp( core );
#endif
@@ -321,7 +308,7 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
#endif
}
#elif defined(__aarch64__)
#elif defined(__aarch64__) && !defined(__APPLE__)
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
@@ -495,11 +482,9 @@ static inline bool cpu_arch_aarch64()
static inline bool has_sse()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE_Flag;
#else
return false;
#endif
@@ -508,11 +493,9 @@ static inline bool has_sse()
static inline bool has_sse2()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE2_Flag;
#else
return false;
#endif
@@ -521,11 +504,9 @@ static inline bool has_sse2()
static inline bool has_ssse3()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & SSSE3_Flag;
#else
return false;
#endif
@@ -534,11 +515,9 @@ static inline bool has_ssse3()
static inline bool has_sse41()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & SSE41_Flag;
#else
return false;
#endif
@@ -547,11 +526,9 @@ static inline bool has_sse41()
static inline bool has_sse42()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & SSE42_Flag;
#else
return false;
#endif
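
Every one of these predicates follows the same pattern: run CPUID with the relevant leaf and test one feature bit in the returned register. A generic wrapper illustrating the pattern (CPU_INFO, the *_Reg indices and *_Flag masks are the header's own constants; cpu_has_flag itself is illustrative, not part of the code):

#include <stdbool.h>

// Generic CPUID feature test: query a leaf, mask one bit of one register.
static inline bool cpu_has_flag( unsigned int leaf, int reg,
                                 unsigned int flag )
{
#if defined(__x86_64__)
   unsigned int cpu_info[4] = { 0 };
   cpuid( leaf, 0, cpu_info );
   return ( cpu_info[ reg ] & flag ) != 0;
#else
   (void)leaf; (void)reg; (void)flag;
   return false;
#endif
}
// e.g. has_sse42() is equivalent to
//      cpu_has_flag( CPU_INFO, ECX_Reg, SSE42_Flag )
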
@@ -559,7 +536,7 @@ static inline bool has_sse42()
static inline bool has_neon()
{
#if defined(__aarch64__)
#if defined(__aarch64__) && !defined(__APPLE__)
unsigned int cpu_info[4] = { 0 };
return cpu_info[0];
#else
@@ -570,7 +547,6 @@ static inline bool has_neon()
static inline bool has_aes_ni()
{
#if defined(__x86_64__)
if ( has_sse2() )
{
unsigned int cpu_info[4] = { 0 };
@@ -578,9 +554,7 @@ static inline bool has_aes_ni()
return cpu_info[ ECX_Reg ] & AES_NI_Flag;
}
return false;
#elif defined(__aarch64__)
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
unsigned int cpu_info[4] = { 0 };
@@ -588,7 +562,6 @@ static inline bool has_aes_ni()
return cpu_info[0] & HWCAP_AES;
}
return false;
#else
return false;
#endif
@@ -597,11 +570,9 @@ static inline bool has_aes_ni()
static inline bool has_avx()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return ( ( cpu_info[ ECX_Reg ] & AVX_mask ) == AVX_mask );
#else
return false;
#endif
@@ -610,11 +581,9 @@ static inline bool has_avx()
static inline bool has_avx2()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX2_Flag;
#else
return false;
#endif
@@ -623,7 +592,6 @@ static inline bool has_avx2()
static inline bool has_sha()
{
#if defined(__x86_64__)
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
@@ -631,9 +599,7 @@ static inline bool has_sha()
return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false;
#elif defined(__aarch64__)
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
unsigned int cpu_info[4] = { 0 };
@@ -641,7 +607,6 @@ static inline bool has_sha()
return cpu_info[0] & HWCAP_SHA2;
}
return false;
#else
return false;
#endif
@@ -650,7 +615,6 @@ static inline bool has_sha()
static inline bool has_sha512()
{
#if defined(__x86_64__)
if ( has_avx2() )
{
unsigned int cpu_info[4] = { 0 };
@@ -658,9 +622,7 @@ static inline bool has_sha512()
return cpu_info[ EAX_Reg ] & SHA512_Flag;
}
return false;
#elif defined(__aarch64__)
#elif defined(__aarch64__) && !defined(__APPLE__)
if ( has_neon() )
{
unsigned int cpu_info[4] = { 0 };
@@ -668,7 +630,6 @@ static inline bool has_sha512()
return cpu_info[0] & HWCAP_SHA3;
}
return false;
#else
return false;
#endif
@@ -677,7 +638,6 @@ static inline bool has_sha512()
static inline bool has_avx512f()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_F_Flag;
@@ -689,7 +649,6 @@ static inline bool has_avx512f()
static inline bool has_avx512dq()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_DQ_Flag;
@@ -701,7 +660,6 @@ static inline bool has_avx512dq()
static inline bool has_avx512bw()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_BW_Flag;
@@ -713,7 +671,6 @@ static inline bool has_avx512bw()
static inline bool has_avx512vl()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_VL_Flag;
@@ -722,14 +679,13 @@ static inline bool has_avx512vl()
#endif
}
// baseline for usability
static inline bool has_avx512()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return ( ( cpu_info[ EBX_Reg ] & AVX512_mask ) == AVX512_mask );
#else
return false;
#endif
@@ -738,7 +694,6 @@ static inline bool has_avx512()
static inline bool has_vaes()
{
#if defined(__x86_64__)
if ( has_avx2() )
{
unsigned int cpu_info[4] = { 0 };
@@ -754,11 +709,9 @@ static inline bool has_vaes()
static inline bool has_vbmi()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
#else
return false;
#endif
@@ -767,7 +720,6 @@ static inline bool has_vbmi()
static inline bool has_vbmi2()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
@@ -780,7 +732,6 @@ static inline bool has_vbmi2()
static inline bool has_xop()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & XOP_Flag;
@@ -792,11 +743,9 @@ static inline bool has_xop()
static inline bool has_fma3()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return ( ( cpu_info[ ECX_Reg ] & FMA3_mask ) == FMA3_mask );
#else
return false;
#endif
@@ -805,24 +754,21 @@ static inline bool has_fma3()
static inline bool has_apx_f()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EDX_Reg ] & APX_F_Flag;
#else
return false;
#endif
}
// Not much use on its own
static inline bool has_avx10()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EDX_Reg ] & AVX10_Flag;
#else
return false;
#endif
@@ -831,7 +777,6 @@ static inline bool has_avx10()
static inline unsigned int avx10_version()
{
#if defined(__x86_64__)
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
@@ -839,7 +784,6 @@ static inline unsigned int avx10_version()
return cpu_info[ EBX_Reg ] & AVX10_VERSION_mask;
}
return 0;
#else
return 0;
#endif
@@ -849,7 +793,6 @@ static inline unsigned int avx10_version()
static inline bool has_avx10_512()
{
#if defined(__x86_64__)
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
@@ -857,17 +800,15 @@ static inline bool has_avx10_512()
return cpu_info[ EBX_Reg ] & AVX10_512_Flag;
}
return false;
#else
return false;
#endif
}
// may not include 512
// Includes 128 but may not include 512
static inline bool has_avx10_256()
{
#if defined(__x86_64__)
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
@@ -875,7 +816,6 @@ static inline bool has_avx10_256()
return cpu_info[ EBX_Reg ] & AVX10_256_Flag;
}
return false;
#else
return false;
#endif
@@ -885,7 +825,6 @@ static inline bool has_avx10_256()
static inline unsigned int avx10_vector_length()
{
#if defined(__x86_64__)
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
@@ -894,16 +833,12 @@ static inline unsigned int avx10_vector_length()
: ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 );
}
return 0;
#else
return 0;
#endif
}
static inline uint32_t cpuid_get_highest_function_number()
{
#if defined(__x86_64__)
@@ -922,7 +857,7 @@ static inline void cpuid_get_highest_function( char* s )
{
#if defined(__x86_64__)
uint32_t fn = cpuid_get_highest_function_number();
uint32_t fn = cpuid_get_highest_function_number();
switch (fn)
{
case 0x16:

View File

@@ -10,12 +10,14 @@
# define some local variables
export LOCAL_LIB="$HOME/usr/lib"
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --host=x86_64-w64-mingw32"
#export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
# set correct gcc version
export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs"
#export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
# Support for Windows 7 CPU groups, AES sometimes not included in -march
# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
#export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
@@ -38,7 +40,7 @@ cp $MINGW_LIB/zlib1.dll release/
cp $MINGW_LIB/libwinpthread-1.dll release/
cp $GCC_MINGW_LIB/libstdc++-6.dll release/
cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
cp ./../libcrypto-1_1-x64.dll release/
#cp ./../libcrypto-1_1-x64.dll release/
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
# Start building...