Compare commits


7 Commits
v23.8 ... v24.1

Author     SHA1        Message  Date
Jay D Dee  4f930574cc  v24.1    2024-04-16 21:31:35 -04:00
Jay D Dee  9d3a46c355  v23.15   2023-11-30 14:36:47 -05:00
Jay D Dee  4e3f1b926f  v23.14   2023-11-28 00:58:43 -05:00
Jay D Dee  045b42babf  v23.13   2023-11-21 14:18:15 -05:00
Jay D Dee  fc696dbbe5  v23.12   2023-11-20 11:51:57 -05:00
Jay D Dee  f3fde95f27  v23.10   2023-11-15 11:05:41 -05:00
Jay D Dee  0a78013cbe  v23.9    2023-11-12 18:48:50 -05:00
115 changed files with 4681 additions and 3377 deletions

View File

@@ -16,6 +16,7 @@ bin_PROGRAMS = cpuminer
dist_man_MANS = cpuminer.1
cpuminer_SOURCES = \
dummy.cpp \
cpu-miner.c \
util.c \
api.c \
@@ -113,7 +114,6 @@ cpuminer_SOURCES = \
algo/lyra2/phi2-4way.c \
algo/lyra2/phi2.c \
algo/m7m/m7m.c \
algo/m7m/magimath.cpp \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
@@ -250,6 +250,7 @@ cpuminer_SOURCES = \
algo/x16/x16rt.c \
algo/x16/x16rt-4way.c \
algo/x16/hex.c \
algo/x16/x20r.c \
algo/x16/x21s-4way.c \
algo/x16/x21s.c \
algo/x16/minotaur.c \
@@ -288,7 +289,7 @@ if HAVE_WINDOWS
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)

View File

@@ -87,7 +87,6 @@ Supported Algorithms
groestl Groestl coin
hex x16r-hex
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
@@ -115,9 +114,11 @@ Supported Algorithms
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256dt
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
sha512256d
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -145,6 +146,7 @@ Supported Algorithms
x16rt-veil veil
x16s
x17
x20r
x21s
x22i
x25x

View File

@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
- An x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
- Arm CPU supporting AArch64 and NEON.
64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.
32 bit CPUs are not supported.
ARM requirements (Beta):
Older CPUs are supported by TPruvot's open-source cpuminer-multi, but at reduced performance.
CPU: Armv8 and NEON, SHA2 & AES are optional
OS: Linux distribution built for AArch64.
Packages: source code only.
Mining on mobile devices that meet the requirements is not recommended due to the risk of
overheating and damaging the battery. Mining has unlimited demand; it will push any device
to or beyond its limits. There is also a fire risk with overheated lithium batteries.
Beware of apps claiming "mobile only mining". There is no such thing; they aren't miners.
If a mobile CPU can mine it, any CPU can.
See wiki for details.
@@ -73,6 +75,56 @@ If not what makes it happen or not happen?
Change Log
----------
v24.1
#414: fix bug in merkle error handling.
#416: change $nproc to $(nproc) in build scripts.
#420: change some inline function definitions to static inline.
#413: Fix formatting error for share result log when using no-color.
Faster 2-way interleaving.
Cleanup of sha256 architecture targeting.
v23.15
Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
ARM: Fugue AES optimizations enabled.
ARM: quark, qubit, x11gost algos optimized with NEON & AES.
v23.14
ARM: Groestl AES optimizations enabled.
All: Small optimization to Shabal 4way.
x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
All: deleted some unused files.
v23.13
Added x20r algo.
Eliminated redundant hash order calculations for x16r family.
v23.12
Several bug fixes and speed improvements for the x16r family on all CPU architectures.
v23.11
This is a release candidate for full AArch64 support, marking the end of the Beta phase.
Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.
v23.10
x86_64: Fixed scrypt, scryptn2 algos SSE2.
Fixed sha512256d algo AVX2, SSE2, NEON.
Fixed a bug in Skein N-way that reduced performance.
ARM: Skein optimized for NEON, SHA2 & SSE2.
Skein2 algo 2-way optimized for NEON & SSE2.
v23.9
x86_64: fixed minotaurx crash, broken in 23.7.
ARM: #407 fix compile error due to incorrect type casting for vrev instruction argument.
v23.8
Cpuminer-opt is no longer dependent on OpenSSL.
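
The "Faster 2-way interleaving" item above refers to the lane layout used by the new 2x64 code paths in this compare. A minimal sketch of the idea in C, using a simplified stand-in for the project's v128_bswap32_intrlv80_2x64 helper (the real helper additionally byte-swaps each word):

#include <stdint.h>

/* Interleave one 80-byte block header into two 64-bit lanes: 64-bit word i of
   lane l lands at index 2*i + l, so one 128-bit load feeds both lanes of a
   2-way hash. Byte swapping is omitted here; the real helper does both. */
static void intrlv80_2x64_sketch( uint64_t *dst, const uint64_t *src )
{
    for ( int i = 0; i < 10; i++ )      /* 80 bytes = 10 x 64-bit words */
    {
        dst[ 2*i     ] = src[i];        /* lane 0 */
        dst[ 2*i + 1 ] = src[i];        /* lane 1; nonce word patched later */
    }
}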

View File

@@ -368,6 +368,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break;
case ALGO_X16S: rc = register_x16s_algo ( gate ); break;
case ALGO_X17: rc = register_x17_algo ( gate ); break;
case ALGO_X20R: rc = register_x20r_algo ( gate ); break;
case ALGO_X21S: rc = register_x21s_algo ( gate ); break;
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;

View File

@@ -99,7 +99,7 @@ typedef uint32_t set_t;
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
// AVX10 does not have explicit algo features:
@@ -107,16 +107,16 @@ typedef uint32_t set_t;
// AVX10_256 is compatible with AVX2 + VAES
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
static inline set_t set_union ( set_t a, set_t b ) { return a | b; }
// return set contained common elements from sets a & b
inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
static inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
// all elements in set a are included in set b
inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
static inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
// no elements in set a are included in set b
inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
static inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
typedef struct
{
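
Why #420 switched these helpers from `inline` to `static inline`: a minimal sketch of the C99/C11 pitfall, not project source.

typedef uint32_t set_t;                                      /* as in this header */
inline set_t set_union( set_t a, set_t b ) { return a | b; } /* the old form */
/* The definition above is only an "inline definition". If the compiler declines
   to inline a call and no .c file supplies an external definition, linking fails
   with an undefined reference to set_union. `static inline` sidesteps this by
   giving each translation unit its own internal copy, so no external symbol is
   required. */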

View File

@@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
blakehash_4way( hash, vdata );
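
The one-line change above (mm128_bswap_32 to v128_bswap32) is part of a compare-wide migration from SSE2-only mm128_*/_mm_* names to portable v128_* wrappers that compile to SSE2 on x86_64 and to NEON on AArch64. A minimal sketch of such a layer, using names that appear in this diff; the project's actual simd-utils headers may differ:

#include <stdint.h>
#if defined(__SSE2__)
  #include <emmintrin.h>
  typedef __m128i v128_t;
  #define v128_xor( a, b )    _mm_xor_si128( a, b )
  #define v128_add32( a, b )  _mm_add_epi32( a, b )
  #define v128_sl32( v, c )   _mm_slli_epi32( v, c )
  #define v128_sr32( v, c )   _mm_srli_epi32( v, c )
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  typedef uint32x4_t v128_t;
  #define v128_xor( a, b )    veorq_u32( a, b )
  #define v128_add32( a, b )  vaddq_u32( a, b )
  #define v128_sl32( v, c )   vshlq_n_u32( v, c )
  #define v128_sr32( v, c )   vshrq_n_u32( v, c )
#endif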

View File

@@ -429,7 +429,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
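
The v128_set64 constant above is a byte-shuffle control that reverses the bytes within each 32-bit lane, i.e. a per-lane bswap32. A standalone SSE illustration (_mm_shuffle_epi8 needs SSSE3):

#include <immintrin.h>

static inline __m128i bswap32_each_lane( __m128i v )
{   /* shuffle indices: 3,2,1,0  7,6,5,4  11,10,9,8  15,14,13,12 */
    const __m128i k = _mm_set_epi64x( 0x0c0d0e0f08090a0bLL,
                                      0x0405060700010203LL );
    return _mm_shuffle_epi8( v, k );
}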
@@ -931,14 +931,14 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
#else

View File

@@ -475,11 +475,12 @@ void blake512_update(blake512_context *sc, const void *data, size_t len)
void blake512_close( blake512_context *sc, void *dst )
{
unsigned char buf[128] __attribute__((aligned(32)));
size_t ptr;
size_t ptr, k;
unsigned bit_len;
uint64_t th, tl;
ptr = sc->ptr;
memcpy( buf, sc->buf, ptr );
bit_len = ((unsigned)ptr << 3);
buf[ptr] = 0x80;
tl = sc->T0 + bit_len;
@@ -519,7 +520,8 @@ void blake512_close( blake512_context *sc, void *dst )
blake512_update( sc, buf, 128 );
}
v128_block_bswap64_512( dst, sc->H );
for ( k = 0; k < 8; k ++ )
((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
}
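
The replacement above swaps a vectorized block byte-swap for a plain scalar loop; either way the digest must leave in big-endian word order. A minimal sketch, assuming bswap_64 wraps the usual compiler builtin (the project may define it differently):

#ifndef bswap_64
#define bswap_64(x) __builtin_bswap64(x)   /* assumed mapping */
#endif
/* so the loop emits: ((uint64_t*)dst)[k] = bswap_64( sc->H[k] ); for k = 0..7 */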
void blake512_full( blake512_context *sc, void *dst, const void *data,

View File

@@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
blakecoin_4way_hash( hash, vdata );

View File

@@ -131,47 +131,7 @@
V[7] = v128_alignr64( V6, V7, 1 ); \
}
/*
#elif defined(__SSE2__)
// always true
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
v128_t *V = (v128_t*)v; \
v128_t V2, V3, V6, V7; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3, 1 ); \
V[3] = mm128_alignr_64( V3, V2, 1 ); \
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
*/
#else
// never used, SSE2 is always available
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))

View File

@@ -62,78 +62,78 @@ static const uint32_t IV256[] = {
*/
#define ss0(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 3) ), \
_mm_xor_si128( mm128_rol_32( (x), 4), \
mm128_rol_32( (x), 19) ) )
v128_xor( v128_xor( v128_sr32( (x), 1), \
v128_sl32( (x), 3) ), \
v128_xor( v128_rol32( (x), 4), \
v128_rol32( (x), 19) ) )
#define ss1(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm128_rol_32( (x), 8), \
mm128_rol_32( (x), 23) ) )
v128_xor( v128_xor( v128_sr32( (x), 1), \
v128_sl32( (x), 2) ), \
v128_xor( v128_rol32( (x), 8), \
v128_rol32( (x), 23) ) )
#define ss2(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 1) ), \
_mm_xor_si128( mm128_rol_32( (x), 12), \
mm128_rol_32( (x), 25) ) )
v128_xor( v128_xor( v128_sr32( (x), 2), \
v128_sl32( (x), 1) ), \
v128_xor( v128_rol32( (x), 12), \
v128_rol32( (x), 25) ) )
#define ss3(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm128_rol_32( (x), 15), \
mm128_rol_32( (x), 29) ) )
v128_xor( v128_xor( v128_sr32( (x), 2), \
v128_sl32( (x), 2) ), \
v128_xor( v128_rol32( (x), 15), \
v128_rol32( (x), 29) ) )
#define ss4(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
v128_xor( (x), v128_sr32( (x), 1 ) )
#define ss5(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
v128_xor( (x), v128_sr32( (x), 2 ) )
#define rs1(x) mm128_rol_32( x, 3 )
#define rs2(x) mm128_rol_32( x, 7 )
#define rs3(x) mm128_rol_32( x, 13 )
#define rs4(x) mm128_rol_32( x, 16 )
#define rs5(x) mm128_rol_32( x, 19 )
#define rs6(x) mm128_rol_32( x, 23 )
#define rs7(x) mm128_rol_32( x, 27 )
#define rs1(x) v128_rol32( x, 3 )
#define rs2(x) v128_rol32( x, 7 )
#define rs3(x) v128_rol32( x, 13 )
#define rs4(x) v128_rol32( x, 16 )
#define rs5(x) v128_rol32( x, 19 )
#define rs6(x) v128_rol32( x, 23 )
#define rs7(x) v128_rol32( x, 27 )
#define rol_off_32( M, j, off ) \
mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
v128_rol32( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_s( M, H, j ) \
_mm_xor_si128( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
v128_xor( \
v128_add32( \
v128_sub32( v128_add32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
v128_32( ( (j)+16 ) * 0x05555555UL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1s( qt, M, H, i ) \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
v128_add32( v128_add4_32( \
v128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
v128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
v128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
v128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
#define expand2s( qt, M, H, i) \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
v128_add32( v128_add4_32( \
v128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
v128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
v128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
v128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
@@ -141,169 +141,169 @@ static const uint32_t IV256[] = {
// resulting in some sign changes compared to the reference code.
#define Ws0 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_add32( \
v128_add32( \
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_xor( M[10], H[10] ) ), \
v128_add32( v128_xor( M[13], H[13] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws1 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_add32( \
v128_add32( \
v128_sub32( v128_xor( M[ 6], H[ 6] ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_xor( M[11], H[11] ) ), \
v128_sub32( v128_xor( M[14], H[14] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws2 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_add32( \
v128_add32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_xor( M[ 9], H[ 9] ) ), \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws3 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 1], H[ 1] ) ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_sub32( v128_xor( M[10], H[10] ), \
v128_xor( M[13], H[13] ) ) )
#define Ws4 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_add32( \
v128_add32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_xor( M[ 9], H[ 9] ) ), \
v128_add32( v128_xor( M[11], H[11] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws5 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_xor( M[10], H[10] ) ), \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws6 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 4], H[ 4] ), \
v128_xor( M[ 0], H[ 0] ) ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_sub32( v128_xor( M[11], H[11] ), \
v128_xor( M[13], H[13] ) ) )
#define Ws7 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_add32( v128_xor( M[12], H[12] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws8 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_add32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 2], H[ 2] ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[13], H[13] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws9 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws10 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
v128_xor( M[ 1], H[ 1] ) ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws11 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 9], H[ 9] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
v128_xor( M[ 0], H[ 0] ) ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
v128_xor( M[ 9], H[ 9] ) ) )
#define Ws12 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[10], H[10] ) ) )
v128_sub32( \
v128_sub32( \
v128_add32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
v128_xor( M[10], H[10] ) ) )
#define Ws13 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[11], H[11] ) ) )
v128_add32( \
v128_add32( \
v128_add32( v128_xor( M[ 2], H[ 2] ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_add32( v128_xor( M[10], H[10] ), \
v128_xor( M[11], H[11] ) ) )
#define Ws14 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[12], H[12] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_add32( v128_xor( M[11], H[11] ), \
v128_xor( M[12], H[12] ) ) )
#define Ws15 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[ 4], H[4] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[ 4], H[4] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
v128_xor( M[13], H[13] ) ) )
void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
{
__m128i qt[32], xl, xh; \
v128u64_t qt[32], xl, xh; \
qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
qt[ 2] = v128_add32( ss2( Ws2 ), H[ 3] );
qt[ 3] = v128_add32( ss3( Ws3 ), H[ 4] );
qt[ 4] = v128_add32( ss4( Ws4 ), H[ 5] );
qt[ 5] = v128_add32( ss0( Ws5 ), H[ 6] );
qt[ 6] = v128_add32( ss1( Ws6 ), H[ 7] );
qt[ 7] = v128_add32( ss2( Ws7 ), H[ 8] );
qt[ 8] = v128_add32( ss3( Ws8 ), H[ 9] );
qt[ 9] = v128_add32( ss4( Ws9 ), H[10] );
qt[10] = v128_add32( ss0( Ws10), H[11] );
qt[11] = v128_add32( ss1( Ws11), H[12] );
qt[12] = v128_add32( ss2( Ws12), H[13] );
qt[13] = v128_add32( ss3( Ws13), H[14] );
qt[14] = v128_add32( ss4( Ws14), H[15] );
qt[15] = v128_add32( ss0( Ws15), H[ 0] );
qt[16] = expand1s( qt, M, H, 16 );
qt[17] = expand1s( qt, M, H, 17 );
qt[18] = expand2s( qt, M, H, 18 );
@@ -321,92 +321,92 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
qt[30] = expand2s( qt, M, H, 30 );
qt[31] = expand2s( qt, M, H, 31 );
xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm_xor_si128( xl, _mm_xor_si128(
mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
xl = v128_xor( v128_xor4( qt[16], qt[17], qt[18], qt[19] ),
v128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = v128_xor( xl, v128_xor(
v128_xor4( qt[24], qt[25], qt[26], qt[27] ),
v128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
dH[ 0] = _mm_add_epi32(
_mm_xor_si128( M[0],
_mm_xor_si128( _mm_slli_epi32( xh, 5 ),
_mm_srli_epi32( qt[16], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
dH[ 1] = _mm_add_epi32(
_mm_xor_si128( M[1],
_mm_xor_si128( _mm_srli_epi32( xh, 7 ),
_mm_slli_epi32( qt[17], 8 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
dH[ 2] = _mm_add_epi32(
_mm_xor_si128( M[2],
_mm_xor_si128( _mm_srli_epi32( xh, 5 ),
_mm_slli_epi32( qt[18], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
dH[ 3] = _mm_add_epi32(
_mm_xor_si128( M[3],
_mm_xor_si128( _mm_srli_epi32( xh, 1 ),
_mm_slli_epi32( qt[19], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
dH[ 4] = _mm_add_epi32(
_mm_xor_si128( M[4],
_mm_xor_si128( _mm_srli_epi32( xh, 3 ),
_mm_slli_epi32( qt[20], 0 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
dH[ 5] = _mm_add_epi32(
_mm_xor_si128( M[5],
_mm_xor_si128( _mm_slli_epi32( xh, 6 ),
_mm_srli_epi32( qt[21], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
dH[ 6] = _mm_add_epi32(
_mm_xor_si128( M[6],
_mm_xor_si128( _mm_srli_epi32( xh, 4 ),
_mm_slli_epi32( qt[22], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
dH[ 7] = _mm_add_epi32(
_mm_xor_si128( M[7],
_mm_xor_si128( _mm_srli_epi32( xh, 11 ),
_mm_slli_epi32( qt[23], 2 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[4], 9 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
_mm_xor_si128( _mm_slli_epi32( xl, 8 ),
_mm_xor_si128( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[5], 10 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
_mm_xor_si128( _mm_srli_epi32( xl, 6 ),
_mm_xor_si128( qt[16], qt[ 9] ) ) );
dH[10] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[6], 11 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
_mm_xor_si128( _mm_slli_epi32( xl, 6 ),
_mm_xor_si128( qt[17], qt[10] ) ) );
dH[11] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[7], 12 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
_mm_xor_si128( _mm_slli_epi32( xl, 4 ),
_mm_xor_si128( qt[18], qt[11] ) ) );
dH[12] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[0], 13 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
_mm_xor_si128( _mm_srli_epi32( xl, 3 ),
_mm_xor_si128( qt[19], qt[12] ) ) );
dH[13] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[1], 14 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
_mm_xor_si128( _mm_srli_epi32( xl, 4 ),
_mm_xor_si128( qt[20], qt[13] ) ) );
dH[14] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[2], 15 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
_mm_xor_si128( _mm_srli_epi32( xl, 7 ),
_mm_xor_si128( qt[21], qt[14] ) ) );
dH[15] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[3], 16 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
_mm_xor_si128( _mm_srli_epi32( xl, 2 ),
_mm_xor_si128( qt[22], qt[15] ) ) );
dH[ 0] = v128_add32(
v128_xor( M[0],
v128_xor( v128_sl32( xh, 5 ),
v128_sr32( qt[16], 5 ) ) ),
v128_xor( v128_xor( xl, qt[24] ), qt[ 0] ));
dH[ 1] = v128_add32(
v128_xor( M[1],
v128_xor( v128_sr32( xh, 7 ),
v128_sl32( qt[17], 8 ) ) ),
v128_xor( v128_xor( xl, qt[25] ), qt[ 1] ));
dH[ 2] = v128_add32(
v128_xor( M[2],
v128_xor( v128_sr32( xh, 5 ),
v128_sl32( qt[18], 5 ) ) ),
v128_xor( v128_xor( xl, qt[26] ), qt[ 2] ));
dH[ 3] = v128_add32(
v128_xor( M[3],
v128_xor( v128_sr32( xh, 1 ),
v128_sl32( qt[19], 5 ) ) ),
v128_xor( v128_xor( xl, qt[27] ), qt[ 3] ));
dH[ 4] = v128_add32(
v128_xor( M[4],
v128_xor( v128_sr32( xh, 3 ),
v128_sl32( qt[20], 0 ) ) ),
v128_xor( v128_xor( xl, qt[28] ), qt[ 4] ));
dH[ 5] = v128_add32(
v128_xor( M[5],
v128_xor( v128_sl32( xh, 6 ),
v128_sr32( qt[21], 6 ) ) ),
v128_xor( v128_xor( xl, qt[29] ), qt[ 5] ));
dH[ 6] = v128_add32(
v128_xor( M[6],
v128_xor( v128_sr32( xh, 4 ),
v128_sl32( qt[22], 6 ) ) ),
v128_xor( v128_xor( xl, qt[30] ), qt[ 6] ));
dH[ 7] = v128_add32(
v128_xor( M[7],
v128_xor( v128_sr32( xh, 11 ),
v128_sl32( qt[23], 2 ) ) ),
v128_xor( v128_xor( xl, qt[31] ), qt[ 7] ));
dH[ 8] = v128_add32( v128_add32(
v128_rol32( dH[4], 9 ),
v128_xor( v128_xor( xh, qt[24] ), M[ 8] )),
v128_xor( v128_sl32( xl, 8 ),
v128_xor( qt[23], qt[ 8] ) ) );
dH[ 9] = v128_add32( v128_add32(
v128_rol32( dH[5], 10 ),
v128_xor( v128_xor( xh, qt[25] ), M[ 9] )),
v128_xor( v128_sr32( xl, 6 ),
v128_xor( qt[16], qt[ 9] ) ) );
dH[10] = v128_add32( v128_add32(
v128_rol32( dH[6], 11 ),
v128_xor( v128_xor( xh, qt[26] ), M[10] )),
v128_xor( v128_sl32( xl, 6 ),
v128_xor( qt[17], qt[10] ) ) );
dH[11] = v128_add32( v128_add32(
v128_rol32( dH[7], 12 ),
v128_xor( v128_xor( xh, qt[27] ), M[11] )),
v128_xor( v128_sl32( xl, 4 ),
v128_xor( qt[18], qt[11] ) ) );
dH[12] = v128_add32( v128_add32(
v128_rol32( dH[0], 13 ),
v128_xor( v128_xor( xh, qt[28] ), M[12] )),
v128_xor( v128_sr32( xl, 3 ),
v128_xor( qt[19], qt[12] ) ) );
dH[13] = v128_add32( v128_add32(
v128_rol32( dH[1], 14 ),
v128_xor( v128_xor( xh, qt[29] ), M[13] )),
v128_xor( v128_sr32( xl, 4 ),
v128_xor( qt[20], qt[13] ) ) );
dH[14] = v128_add32( v128_add32(
v128_rol32( dH[2], 15 ),
v128_xor( v128_xor( xh, qt[30] ), M[14] )),
v128_xor( v128_sr32( xl, 7 ),
v128_xor( qt[21], qt[14] ) ) );
dH[15] = v128_add32( v128_add32(
v128_rol32( dH[3], 16 ),
v128_xor( v128_xor( xh, qt[31] ), M[15] )),
v128_xor( v128_sr32( xl, 2 ),
v128_xor( qt[22], qt[15] ) ) );
}
static const uint32_t final_s[16][4] =
@@ -429,7 +429,7 @@ static const uint32_t final_s[16][4] =
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
/*
static const __m128i final_s[16] =
static const v128u64_t final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
@@ -451,26 +451,26 @@ static const __m128i final_s[16] =
*/
void bmw256_4way_init( bmw256_4way_context *ctx )
{
ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
ctx->H[ 0] = v128_64( 0x4041424340414243 );
ctx->H[ 1] = v128_64( 0x4445464744454647 );
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = v128_64( 0x5051525350515253 );
ctx->H[ 5] = v128_64( 0x5455565754555657 );
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = v128_64( 0x6061626360616263 );
ctx->H[ 9] = v128_64( 0x6465666764656667 );
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = v128_64( 0x7071727370717273 );
ctx->H[13] = v128_64( 0x7475767774757677 );
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
// for ( int i = 0; i < 16; i++ )
// sc->H[i] = _mm_set1_epi32( iv[i] );
// sc->H[i] = v128_32( iv[i] );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -478,10 +478,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
__m128i *vdata = (__m128i*)data;
__m128i *buf;
__m128i htmp[16];
__m128i *h1, *h2;
v128u64_t *vdata = (v128u64_t*)data;
v128u64_t *buf;
v128u64_t htmp[16];
v128u64_t *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
@@ -497,13 +497,13 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
v128_memcpy( buf + (ptr>>2), vdata, clen >> 2 );
vdata += ( clen >> 2 );
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m128i *ht;
v128u64_t *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
@@ -513,46 +513,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
}
sc->ptr = ptr;
if ( h1 != sc->H )
memcpy_128( sc->H, h1, 16 );
v128_memcpy( sc->H, h1, 16 );
}
static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
__m128i *buf;
__m128i h1[16], h2[16], *h;
v128u64_t *buf;
v128u64_t h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
ptr += 4;
h = sc->H;
// assume bit_count fits in 32 bits
if ( ptr > buf_size - 4 )
{
memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
v128_memset_zero( buf + (ptr>>2), (buf_size - ptr) >> 2 );
compress_small( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = m128_zero;
v128_memset_zero( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = v128_32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = v128_zero;
compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, (__m128i*)final_s, h1 );
compress_small( buf, (v128u64_t*)final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_m128i( dst, u ) = h1[v];
casti_v128( dst, u ) = h1[v];
}
/*

View File

@@ -2,12 +2,11 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
//#include "sph_keccak.h"
#include "bmw-hash-4way.h"
#if defined(BMW512_8WAY)
void bmw512hash_8way(void *state, const void *input)
void bmw512hash_8way( void *state, const void *input )
{
bmw512_8way_context ctx;
bmw512_8way_init( &ctx );
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
__m512i *noncev = (__m512i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
const int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
#elif defined(BMW512_4WAY)
//#ifdef BMW512_4WAY
void bmw512hash_4way(void *state, const void *input)
void bmw512hash_4way( void *state, const void *input )
{
bmw512_4way_context ctx;
bmw512_4way_init( &ctx );
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(BMW512_2WAY)
void bmw512hash_2x64( void *state, const void *input )
{
bmw512_2x64_context ctx;
bmw512_2x64_init( &ctx );
bmw512_2x64_update( &ctx, input, 80 );
bmw512_2x64_close( &ctx, state );
}
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
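// Lane layout note (2x64 interleaving): 64-bit word i of lane l sits at
// 64-bit index 2*i + l, so 32-bit word 7 of a lane's 256-bit hash (the word
// compared against the target) is hash[13 + 2*lane], hence hash7[ lane<<1 ].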
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
v128_bswap32_intrlv80_2x64( vdata, pdata );
do {
*noncev = v128_intrlv_blend_32( v128_bswap32(
v128_set32( n+1, 0, n, 0 ) ), *noncev );
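// The two nonces land in 32-bit word 19 of each interleaved lane (the high
// half of 64-bit header word 9); the blend keeps the adjacent header word
// already in *noncev intact.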
bmw512hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -2,7 +2,7 @@
bool register_bmw512_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 256.0;
#if defined (BMW512_8WAY)
gate->scanhash = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
#elif defined (BMW512_4WAY)
gate->scanhash = (void*)&scanhash_bmw512_4way;
gate->hash = (void*)&bmw512hash_4way;
#elif defined (BMW512_2WAY)
gate->scanhash = (void*)&scanhash_bmw512_2x64;
gate->hash = (void*)&bmw512hash_2x64;
#else
gate->scanhash = (void*)&scanhash_bmw512;
gate->hash = (void*)&bmw512hash;

View File

@@ -8,19 +8,27 @@
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define BMW512_2WAY 1
#endif
#if defined(BMW512_8WAY)
void bmw512hash_8way( void *state, const void *input );
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_4WAY)
void bmw512hash_4way( void *state, const void *input );
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_2WAY)
void bmw512hash_2x64( void *state, const void *input );
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else

View File

@@ -236,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
}
HashReturn init_echo(hashState_echo *ctx, int nHashSize)
HashReturn init_echo( hashState_echo *ctx, int nHashSize )
{
int i, j;
@@ -280,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
return SUCCESS;
}
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
HashReturn update_echo( hashState_echo *state, const void *data,
uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -330,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
return SUCCESS;
}
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
HashReturn final_echo( hashState_echo *state, void *hashval)
{
v128_t remainingbits;
@@ -407,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
return SUCCESS;
}
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -530,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
return SUCCESS;
}
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength datalen )
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t datalen )
{
int i, j;
@@ -578,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
data, state->uBlockLength - state->uBufferBytes );
// Process buffer
Compress( state, state->buffer, 1 );
@@ -601,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
if( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
memcpy(state->buffer, data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
@@ -689,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
#if 0
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
HashReturn hRet;
@@ -746,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
return SUCCESS;
}
#endif
#endif

View File

@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);
HashReturn reinit_echo(hashState_echo *state);
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);
HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
HashReturn final_echo(hashState_echo *state, void *hashval);
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength databitlen );
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen );
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t databitlen );
#endif // HASH_API_H
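
A minimal usage sketch of the modernized API above. The length argument stays a bit count, matching the NIST-style DataLength parameter it replaces:

static void echo512_example( void )
{
    hashState_echo ctx;
    uint8_t header[80] = {0};             /* e.g. an 80-byte block header */
    uint8_t digest[64];                   /* 512-bit output */
    init_echo( &ctx, 512 );
    update_echo( &ctx, header, 80 * 8 );  /* databitlen is in bits */
    final_echo( &ctx, digest );
}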

View File

@@ -36,7 +36,6 @@
#include "sph_echo.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif
#endif // !AES

View File

@@ -36,8 +36,6 @@
#ifndef SPH_ECHO_H__
#define SPH_ECHO_H__
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
#ifdef __cplusplus
}
#endif
#endif // !AES
#endif

View File

@@ -15,237 +15,176 @@
*
*/
#if defined(__AES__)
#include <x86intrin.h>
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#include <memory.h>
#include "fugue-aesni.h"
static const v128u64_t _supermix1a __attribute__ ((aligned (16))) =
{ 0x0202010807020100, 0x0a05000f06010c0b };
MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
static const v128u64_t _supermix1b __attribute__ ((aligned (16))) =
{ 0x0b0d080703060504, 0x0e0a090c050e0f0a };
static const v128u64_t _supermix1c __attribute__ ((aligned (16))) =
{ 0x0402060c070d0003, 0x090a060580808080 };
MYALIGN const unsigned int _IV512[] = {
0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
static const v128u64_t _supermix1d __attribute__ ((aligned (16))) =
{ 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
static const v128u64_t _supermix2a __attribute__ ((aligned (16))) =
{ 0x07020d0880808080, 0x0b06010c050e0f0a };
static const v128u64_t _supermix4a __attribute__ ((aligned (16))) =
{ 0x000f0a050c0b0601, 0x0302020404030e09 };
static const v128u64_t _supermix4b __attribute__ ((aligned (16))) =
{ 0x07020d08080e0d0d, 0x07070908050e0f0a };
static const v128u64_t _supermix4c __attribute__ ((aligned (16))) =
{ 0x0706050403020000, 0x0302000007060504 };
static const v128u64_t _supermix7a __attribute__ ((aligned (16))) =
{ 0x010c0b060d080702, 0x0904030e03000104 };
static const v128u64_t _supermix7b __attribute__ ((aligned (16))) =
{ 0x8080808080808080, 0x0504070605040f06 };
static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
{ 0x0b0e0104070a0d00, 0x0306090c0f020508 };
static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
{ 0x000000001b1b0000, 0x0000000000000000 };
static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
{ 0x000000002d361b00, 0x0000000000000000 };
static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
{ 0x0303030303030303, 0x0303030303030303 };
static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
{ 0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
};
#if defined(__SSE4_1__)
#if defined(__ARM_NEON)
#define PACK_S0(s0, s1, t1)\
s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
#define mask_1000(v) v128_put32( v, 0, 3 )
#define UNPACK_S0(s0, s1, t1)\
s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
s0 = mm128_mask_32( s0, 8 )
static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = s1;\
t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1);
static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };
#else // SSE2
static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };
#define PACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
s0 = _mm_xor_si128(s0, t1);
static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };
#define UNPACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
s0 = mm128_mask_32( s0, 8 )
#define shuffle_3303(v) vqtbl1q_u8( v, MASK_3303 )
#define shuffle_0321(v) vqtbl1q_u8( v, MASK_0321 )
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = _mm_shuffle_epi32(s1, 0xf9);\
t2 = _mm_shuffle_epi32(s2, 0xcf);\
t1 = _mm_xor_si128(t1, t2);\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1)
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = vqtbl1q_u8( s1, MASK_3321 ); \
t2 = vqtbl1q_u8( s2, MASK_3033 ); \
t1 = v128_xor( t1, t2 ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#elif defined(__SSE4_1__)
#define mask_1000(v) v128_mask32( v, 8 )
#define shuffle_3303(v) _mm_shuffle_epi32( v, 0xf3 )
#define shuffle_0321(v) _mm_shuffle_epi32( v, 0x39 )
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = s1; \
t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#endif
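
The NEON branch above has no direct counterpart to _mm_shuffle_epi32, so it emulates 32-bit lane shuffles with a byte-table lookup. A standalone sketch of the same trick (mask bytes mirror MASK_3303 above; reinterpret casts added so it compiles on its own):

#include <arm_neon.h>

static const uint8x16_t MASK_3303_BYTES =
    { 12,13,14,15,  0,1,2,3,  12,13,14,15,  12,13,14,15 };

/* Selects 32-bit lanes (3,0,3,3) from low to high, matching the SSE form
   _mm_shuffle_epi32( v, 0xf3 ) used in the x86_64 branch. */
static inline uint32x4_t shuffle_3303_neon( uint32x4_t v )
{
    return vreinterpretq_u32_u8(
               vqtbl1q_u8( vreinterpretq_u8_u32( v ), MASK_3303_BYTES ) );
}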
#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s10 = _mm_xor_si128(s10, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1)
#define PACK_S0( s0, s1, t1 ) \
s0 = v128_movlane32( s0, 3, s1, 0 )
#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s16 = _mm_xor_si128(s16, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1)
#define UNPACK_S0( s0, s1, t1 ) \
s1 = v128_movlane32( s1, 0, s0, 3 ); \
s0 = mask_1000( s0 )
#define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s22 = _mm_xor_si128(s22, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s7 = _mm_xor_si128(s7, t1)
t1 = shuffle_3303( s0 ); \
s22 = v128_xor(s22, t1);\
t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
s0 = v128_movlane32( s0, 0, t1, 0 ); \
t1 = v128_alignr64( t1, v128_zero, 1 ); \
s8 = v128_xor(s8, t1);\
t1 = shuffle_3303( s24 ); \
s0 = v128_xor(s0, t1);\
t1 = shuffle_3303( s27 ); \
s4 = v128_xor(s4, t1);\
t1 = shuffle_3303( s30 ); \
s7 = v128_xor(s7, t1)
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\
s1 = x;\
s2 = _mm_add_epi8(x, x);\
t2 = _mm_add_epi8(s2, s2);\
t1 = _mm_srli_epi16(x, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/
#define SUBSTITUTE(r0, _t2 )\
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
#define SUBSTITUTE( r0, _t2 ) \
_t2 = v128_shuffle8( r0, _inv_shift_rows ); \
_t2 = v128_aesenclast_nokey( _t2 )
#define SUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = mm128_xor3( t4, t1, t2 ); \
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t4 = mm128_xor3( t4, t2, t1 ); \
t0 = _mm_xor_si128(t0, t3);\
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
/*
#define SUPERMIX(t0, t1, t2, t3, t4)\
PRESUPERMIX(t0, t1, t2, t3, t4);\
POSTSUPERMIX(t0, t1, t2, t3, t4)
*/
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t4 = t1;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t4 = _mm_xor_si128(t4, t1);\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t4 = _mm_xor_si128(t4, t2);\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t4 = _mm_xor_si128(t4, t1);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t0 = _mm_xor_si128(t0, t3);\
t4 = _mm_xor_si128(t4, t0);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
t4 = _mm_xor_si128(t4, t0)
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
UNPACK_S0(r3c, r3a, _t3)
t3 = v128_add8( t0, t0 ); \
t4 = v128_add8( t3, t3 ); \
t1 = v128_sr16( t0, 6 ); \
t1 = v128_and( t1, _lsbmask2 ); \
t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
t4 = v128_shuffle8( t2, _supermix1b ); \
t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
t1 = v128_shuffle8( t4, _supermix1c ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t4, _supermix1d ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t2, _supermix1a ); \
t2 = v128_xor3( t2, t3, t0 ); \
t2 = v128_shuffle8( t2, _supermix7a ); \
t4 = v128_xor3( t4, t1, t2 ); \
t2 = v128_shuffle8( t2, _supermix7b ); \
t3 = v128_shuffle8( t3, _supermix2a ); \
t1 = v128_shuffle8( t0, _supermix4a ); \
t0 = v128_shuffle8( t0, _supermix4b ); \
t4 = v128_xor3( t4, t2, t1 ); \
t0 = v128_xor( t0, t3 ); \
t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE( r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
_t0 = shuffle_0321( r1c ); \
r2c = v128_xor(r2c, _t0);\
_t0 = mask_1000( _t0 ); \
r2d = v128_xor(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
_t0 = shuffle_0321( r2c ); \
r3c = v128_xor(r3c, _t0);\
_t0 = mask_1000( _t0 ); \
r3d = v128_xor(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE( r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
r4c = _mm_xor_si128(r4c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r4d = _mm_xor_si128(r4d, _t0);\
_t0 = shuffle_0321( r3c ); \
r4c = v128_xor(r4c, _t0);\
_t0 = mask_1000( _t0 ); \
r4d = v128_xor(r4d, _t0);\
UNPACK_S0(r3c, r3a, _t3);\
SUBSTITUTE( r4c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
block[1] = col[(base + a + 1) % s];\
block[2] = col[(base + a + 2) % s];\
block[3] = col[(base + a + 3) % s];\
x = _mm_load_si128((__m128i*)block)
x = v128_load( (v128_t*)block )
#define STORECOLUMN(x, s)\
_mm_store_si128((__m128i*)block, x);\
v128_store((v128_t*)block, x );\
col[(base + 0) % s] = block[0];\
col[(base + 1) % s] = block[1];\
col[(base + 2) % s] = block[2];\
col[(base + 3) % s] = block[3]
void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
unsigned int uBlockCount )
{
__m128i _t0, _t1, _t2, _t3;
v128_t _t0, _t1, _t2, _t3;
switch(ctx->base)
{
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
pmsg += 4;
uBlockCount--;
}
}
void Final512(hashState_fugue *ctx, BitSequence *hashval)
void Final512( hashState_fugue *ctx, uint8_t *hashval )
{
unsigned int block[4] __attribute__ ((aligned (32)));
unsigned int col[36] __attribute__ ((aligned (16)));
unsigned int i, base;
__m128i r0, _t0, _t1, _t2, _t3;
v128_t r0, _t0, _t1, _t2, _t3;
for(i = 0; i < 12; i++)
for( i = 0; i < 12; i++ )
{
_mm_store_si128((__m128i*)block, ctx->state[i]);
v128_store( (v128_t*)block, ctx->state[i] );
col[3 * i + 0] = block[0];
col[3 * i + 1] = block[1];
col[3 * i + 2] = block[2];
}
base = (36 - (12 * ctx->base)) % 36;
base = ( 36 - (12 * ctx->base) ) % 36;
for(i = 0; i < 32; i++)
for( i = 0; i < 32; i++ )
{
// ROR3
base = (base + 33) % 36;
// CMIX
col[(base + 0) % 36] ^= col[(base + 4) % 36];
col[(base + 1) % 36] ^= col[(base + 5) % 36];
col[(base + 2) % 36] ^= col[(base + 6) % 36];
col[(base + 18) % 36] ^= col[(base + 4) % 36];
col[(base + 19) % 36] ^= col[(base + 5) % 36];
col[(base + 20) % 36] ^= col[(base + 6) % 36];
col[ (base + 0) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 1) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 2) % 36 ] ^= col[ (base + 6) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
for(i = 0; i < 13; i++)
for( i = 0; i < 13; i++ )
{
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 28) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR8
base = (base + 28) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
LOADCOLUMN(r0, 36, 1);
_mm_store_si128((__m128i*)hashval, r0);
LOADCOLUMN( r0, 36, 1 );
v128_store( (v128_t*)hashval, r0 );
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
LOADCOLUMN(r0, 36, 9);
_mm_store_si128((__m128i*)hashval + 1, r0);
LOADCOLUMN( r0, 36, 9 );
v128_store( (v128_t*)hashval + 1, r0 );
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
LOADCOLUMN(r0, 36, 18);
_mm_store_si128((__m128i*)hashval + 2, r0);
LOADCOLUMN( r0, 36, 18 );
v128_store( (v128_t*)hashval + 2, r0 );
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
LOADCOLUMN(r0, 36, 27);
_mm_store_si128((__m128i*)hashval + 3, r0);
LOADCOLUMN( r0, 36, 27 );
v128_store( (v128_t*)hashval + 3, r0 );
}
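A note on the indexing trick above: Final512 never physically rotates the 36-word Fugue state. A rotation right by r columns is done by advancing the base index by 36 - r modulo 36, which is why ROR3 adds 33, ROR9 adds 27 and ROR8 adds 28. A minimal sketch of the equivalence (ror_base is a hypothetical helper, not in the source):

static inline unsigned ror_base( unsigned base, unsigned r )
{
    // rotating the 36-column ring right by r == advancing base by 36-r
    return ( base + 36 - r ) % 36;
}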
HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
int fugue512_Init( hashState_fugue *ctx, int nHashSize )
{
int i;
ctx->processed_bits = 0;
@@ -485,20 +424,20 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
ctx->uBlockLength = 4;
for(i = 0; i < 6; i++)
ctx->state[i] = m128_zero;
ctx->state[i] = v128_zero;
ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
ctx->state[6] = casti_v128( _IV512, 0 );
ctx->state[7] = casti_v128( _IV512, 1 );
ctx->state[8] = casti_v128( _IV512, 2 );
ctx->state[9] = casti_v128( _IV512, 3 );
ctx->state[10] = casti_v128( _IV512, 4 );
ctx->state[11] = casti_v128( _IV512, 5 );
return SUCCESS;
return 0;
}
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
if(state->uBufferBytes != 0)
{
// Fill the buffer
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
memcpy( state->buffer + state->uBufferBytes, (void*)data,
state->uBlockLength - state->uBufferBytes );
// Process the buffer
Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
state->uBufferBytes += uByteLength;
}
return SUCCESS;
return 0;
}
HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
int fugue512_Final( hashState_fugue *state, void *hashval )
{
unsigned int i;
BitSequence lengthbuf[8] __attribute__((aligned(64)));
uint8_t lengthbuf[8] __attribute__((aligned(64)));
// Update message bit count
state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
// Finalization
Final512(state, hashval);
return SUCCESS;
return 0;
}
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen )
{
fugue512_Init(hs, 512);
fugue512_Update(hs, data, databitlen*8);
fugue512_Final(hs, hashval);
return SUCCESS;
fugue512_Init( hs, 512 );
fugue512_Update( hs, data, databitlen*8 );
fugue512_Final( hs, hashval );
return 0;
}
#endif // AES
View File
@@ -14,37 +14,31 @@
#ifndef FUGUE_HASH_API_H
#define FUGUE_HASH_API_H
#if defined(__AES__)
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#if !defined(__SSE4_1__)
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif
#include "compat/sha3_common.h"
#include "simd-utils.h"
typedef struct
{
__m128i state[12];
v128_t state[12];
unsigned int base;
unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
DataLength processed_bits;
BitSequence buffer[4];
uint64_t processed_bits;
uint8_t buffer[4];
} hashState_fugue __attribute__ ((aligned (64)));
// These functions are deprecated, use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
int fugue512_Init( hashState_fugue *state, int hashbitlen );
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen );
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
int fugue512_Final( hashState_fugue *state, void *hashval );
#define fugue512_init( state ) \
fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
fugue512_Final
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen);
#endif // AES
#endif // HASH_API_H
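For reference, a minimal caller of the one-shot interface declared above (a sketch, assuming this header is in scope). Note that despite the parameter name databitlen, fugue512_full() multiplies by 8 before forwarding to fugue512_Update(), so callers pass a byte count, exactly as the hmq1725 code later in this diff does with 64:

#include <stdint.h>
// #include "algo/fugue/fugue-aesni.h"  // this header

static void fugue512_hash80( void *hash, const void *header )
{
    hashState_fugue ctx;
    // 80 is a byte count; fugue512_full() converts it to bits internally.
    fugue512_full( &ctx, hash, header, 80 );
}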
View File
@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
casti_m256i( b, 0 ) );
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
casti_m256i( b, 1 ) );
#elif defined(__SSE2__)
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
casti_m128i( b, 0 ) );
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
casti_m128i( b, 1 ) );
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
casti_m128i( b, 2 ) );
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
casti_m128i( b, 3 ) );
#elif defined(__SSE2__) || defined(__ARM_NEON)
casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
casti_v128( b, 0 ) );
casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
casti_v128( b, 1 ) );
casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
casti_v128( b, 2 ) );
casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
casti_v128( b, 3 ) );
#else
const unsigned long long *A=a, *B=b;
unsigned long long *C=c;
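The scalar fallback is cut off by the hunk above; its shape is just a 64-bit-wide XOR across the 512-bit operands, along these lines (a sketch of the truncated branch, continuing the declarations shown):

    for ( int i = 0; i < 8; i++ )
        C[i] = A[i] ^ B[i];   // 8 x 64 bits = 512 bits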
View File
@@ -60,21 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
//static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
//#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
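The NEON change above replaces the blendv/qrev sequence with a single table lookup: read as bytes, gr_mask = { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c } selects the 32-bit words in the order 0, 2, 1, 3, which is exactly what _mm_shuffle_epi32( v, 0xd8 ) produces on x86. A standalone AArch64 check of that equivalence (hypothetical test harness, not part of the source):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint32_t in[4] = { 10, 20, 30, 40 };
    static const uint32_t mask_w[4] __attribute__ ((aligned (16))) =
        { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
    uint8x16_t v    = vreinterpretq_u8_u32( vld1q_u32( in ) );
    uint8x16_t mask = vreinterpretq_u8_u32( vld1q_u32( mask_w ) );
    uint32_t out[4];
    vst1q_u32( out, vreinterpretq_u32_u8( vqtbl1q_u8( v, mask ) ) );
    printf( "%u %u %u %u\n", out[0], out[1], out[2], out[3] );  // 10 30 20 40
    return 0;
}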
#define tos(a) #a
#define tostr(a) tos(a)
@@ -301,17 +297,16 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
*/
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes */\
b0 = v128_xor(b0, b0);\
a0 = v128_aesenclast(a0, b0);\
a1 = v128_aesenclast(a1, b0);\
a2 = v128_aesenclast(a2, b0);\
a3 = v128_aesenclast(a3, b0);\
a4 = v128_aesenclast(a4, b0);\
a5 = v128_aesenclast(a5, b0);\
a6 = v128_aesenclast(a6, b0);\
a7 = v128_aesenclast(a7, b0);\
a0 = v128_aesenclast_nokey( a0 ); \
a1 = v128_aesenclast_nokey( a1 ); \
a2 = v128_aesenclast_nokey( a2 ); \
a3 = v128_aesenclast_nokey( a3 ); \
a4 = v128_aesenclast_nokey( a4 ); \
a5 = v128_aesenclast_nokey( a5 ); \
a6 = v128_aesenclast_nokey( a6 ); \
a7 = v128_aesenclast_nokey( a7 ); \
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
}
#define ROUNDS_P(){\
@@ -329,10 +324,9 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
/* SubBytes + MixBytes */\
/* SubBytes + MixBytes */\
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
\
/* AddRoundConstant P1024 */\
xmm0 = v128_xor( xmm0, \
casti_v128( round_const_p, round_counter+1 ) ); \
@@ -434,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
t1 = v128_unpackhi16(t1, i3);\
i2 = v128_unpacklo16(i2, i3);\
i0 = v128_unpacklo16(i0, i1);\
\
/* shuffle with immediate */\
t0 = gr_shuffle32( t0 ); \
t1 = gr_shuffle32( t1 ); \
@@ -444,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
i2 = gr_shuffle32( i2 ); \
i4 = gr_shuffle32( i4 ); \
i6 = gr_shuffle32( i6 ); \
\
/* continue with unpack */\
t4 = i0;\
i0 = v128_unpacklo32(i0, i2);\
@@ -551,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
/* transpose done */\
}/**/
#if 0
// not used
void INIT( v128_t* chaining )
{
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -580,6 +573,7 @@ void INIT( v128_t* chaining )
chaining[6] = xmm14;
chaining[7] = xmm15;
}
#endif
void TF1024( v128_t* chaining, const v128_t* message )
{
View File
@@ -1,3 +1,6 @@
#if !defined GROESTL256_INTR_AES_H__
#define GROESTL256_INTR_AES_H__
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
#define tos(a) #a
#define tostr(a) tos(a)
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
chaining[3] = xmm11;
}
#endif
View File
@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
const int hash_offset = SIZE512 - hashlen_m128i;
uint64_t blocks = len / SIZE512;
v128_t* in = (v128_t*)input;
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output
View File
@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
int groestl512( hashState_groestl*, void*, const void*, uint64_t );
#define groestl512_full groestl512
#define groestl512_ctx groestl512
#endif /* __hash_h */
View File
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
a1 = _mm256_xor_si256( a1, b1 );\
a2 = _mm256_xor_si256( a2, b1 );\
View File
@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );
myriad_4way_hash( hash, vdata );
pdata[19] = n;
View File
@@ -35,8 +35,6 @@
#include "sph_groestl.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif // !AES
#endif
View File
@@ -42,7 +42,6 @@ extern "C"{
#include <stddef.h>
#include "compat/sph_types.h"
#if !defined(__AES__)
/**
* Output size (in bits) for Groestl-224.
*/
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
}
#endif
#endif // !AES
#endif
View File
@@ -38,7 +38,7 @@
#include <stddef.h>
#include "simd-utils.h"
// SSE2 or NEON Hamsi-512 2x64
#if defined(__SSE4_2__) || defined(__ARM_NEON)
typedef struct
{
@@ -57,6 +57,8 @@ void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
size_t len );
void hamsi512_2x64( void *dst, const void *data, size_t len );
#endif
#if defined (__AVX2__)
// Hamsi-512 4x64
View File
@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_2WAY)
void keccakhash_2x64(void *state, const void *input)
{
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
keccakhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ))
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
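Two details of the 2-way loop above deserve a note. In the 2x64 interleaved layout, 32-bit word w of lane l lives at index (w >> 1)*4 + l*2 + (w & 1), so the two nonces (header word 19) occupy the odd 32-bit halves of vector row 9. That is why noncev points at row 9, why v128_set32( n+1, 0, n, 0 ) is blended in on the odd words, and why adding v128_64( 0x0000000200000000 ) steps both lane nonces by 2 at once. A scalar sketch of the same update (illustrative only):

static inline void set_nonces_2x64( uint32_t *vdata, uint32_t n )
{
    uint32_t *row = vdata + 9*4;   // vector row 9 of the interleaved data
    row[1] = n;                    // lane 0: odd half of its 64-bit word 9
    row[3] = n + 1;                // lane 1: odd half of its 64-bit word 9
}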
View File
@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
bool register_sha3d_algo( algo_gate_t* gate )
{
hard_coded_eb = 6;
// opt_extranonce = false;
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
#if defined (SHA3D_8WAY)
gate->scanhash = (void*)&scanhash_sha3d_8way;
gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
#elif defined (SHA3D_4WAY)
gate->scanhash = (void*)&scanhash_sha3d_4way;
gate->hash = (void*)&sha3d_hash_4way;
#elif defined (SHA3D_2WAY)
gate->scanhash = (void*)&scanhash_sha3d_2x64;
gate->hash = (void*)&sha3d_hash_2x64;
#else
gate->scanhash = (void*)&scanhash_sha3d;
gate->hash = (void*)&sha3d_hash;
View File
@@ -8,6 +8,16 @@
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define KECCAK_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA3D_2WAY 1
#endif
extern int hard_coded_eb;
@@ -16,27 +26,47 @@ extern int hard_coded_eb;
void keccakhash_8way( void *state, const void *input );
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_4WAY)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_2WAY)
void keccakhash_2x64( void *state, const void *input );
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void keccakhash( void *state, const void *input );
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA3D_8WAY)
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64( void *state, const void *input );
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha3d_hash( void *state, const void *input );
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
View File
@@ -4,7 +4,7 @@
#include <stdint.h>
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)
#if defined(SHA3D_8WAY)
void sha3d_hash_8way(void *state, const void *input)
{
@@ -64,7 +64,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_4WAY)
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way(void *state, const void *input)
{
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, buffer );
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, buffer, 32 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
sha3d_hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
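As the 2-way code above makes plain, sha3d is keccak-256 applied twice: once over the 80-byte header, then once over the 32-byte digest. A scalar sketch of that shape, where keccak256() is a hypothetical one-shot wrapper using the same padding as this code (hard_coded_eb = 6):

#include <stdint.h>

static void sha3d_ref( void *out32, const void *in80 )
{
    uint8_t mid[32];
    keccak256( mid, in80, 80 );    // first pass: 80-byte block header
    keccak256( out32, mid, 32 );   // second pass: 32-byte digest
}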
View File
@@ -75,16 +75,16 @@
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
v128_t t = a0; \
a0 = mm128_xoror( a3, a0, a1 ); \
a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm128_xorand( a2, a3, t ); \
a2 = mm128_xorand( a1, a2, a0 ); \
a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = mm128_xnor( a1, a0 ); \
a1 = v128_xnor( a1, a0 ); \
a0 = t; \
}
View File
@@ -465,12 +465,8 @@ typedef union
{
keccak256_2x64_context keccak;
cubehashParam cube;
//#if defined(__x86_64__)
skein256_2x64_context skein;
//#else
// sph_skein512_context skein;
//#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
//#if defined(__x86_64__)
intrlv_2x64( vhashA, hash0, hash1, 256 );
skein256_2x64_init( &ctx.skein );
skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
skein256_2x64_update( &ctx.skein, vhashA, 32 );
skein256_2x64_close( &ctx.skein, vhashA );
dintrlv_2x64( hash2, hash3, vhashA, 256 );
/*
#else
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash0, 32 );
sph_skein256_close( &ctx.skein, hash0 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash1, 32 );
sph_skein256_close( &ctx.skein, hash1 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash2, 32 );
sph_skein256_close( &ctx.skein, hash2 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash3, 32 );
sph_skein256_close( &ctx.skein, hash3 );
#endif
*/
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
groestl256_full( &ctx.groestl, hash2, hash2, 256 );
View File
@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
lyra2h_4way_midstate( vdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
View File
@@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );
View File
@@ -21,7 +21,7 @@
#define EPS1 DBL_EPSILON
#define EPS2 3.0e-11
inline double exp_n( double xt )
static inline double exp_n( double xt )
{
if ( xt < -700.0 )
return 0;
@@ -33,7 +33,7 @@ inline double exp_n( double xt )
return exp( xt );
}
inline double exp_n2( double x1, double x2 )
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
p5 = 37., p6 = 700.;
View File
@@ -1,75 +0,0 @@
// Copyright (c) 2014 The Magi developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include <iostream>
#include <cfloat>
#include <limits>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "magimath.h"
#define EPS1 (std::numeric_limits<double>::epsilon())
#define EPS2 3.0e-11
static void gauleg(double x1, double x2, double x[], double w[], const int n)
{
int m,j,i;
double z1, z, xm, xl, pp, p3, p2, p1;
m=(n+1)/2;
xm=0.5*(x2+x1);
xl=0.5*(x2-x1);
for (i=1;i<=m;i++) {
z=cos(3.141592654*(i-0.25)/(n+0.5));
do {
p1=1.0;
p2=0.0;
for (j=1;j<=n;j++) {
p3=p2;
p2=p1;
p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
}
pp=n*(z*p1-p2)/(z*z-1.0);
z1=z;
z=z1-p1/pp;
} while (fabs(z-z1) > EPS2);
x[i]=xm-xl*z;
x[n+1-i]=xm+xl*z;
w[i]=2.0*xl/((1.0-z*z)*pp*pp);
w[n+1-i]=w[i];
}
}
static double GaussianQuad_N(double func(const double), const double a2, const double b2, const int NptGQ)
{
double s=0.0;
#ifdef _MSC_VER
#define SW_DIVS 23
double x[SW_DIVS+1], w[SW_DIVS+1];
#else
double x[NptGQ+1], w[NptGQ+1];
#endif
gauleg(a2, b2, x, w, NptGQ);
for (int j=1; j<=NptGQ; j++) {
s += w[j]*func(x[j]);
}
return s;
}
static double swit_(double wvnmb)
{
return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5)
/ 1034.66 * pow(sin(wvnmb/65.), 2.);
}
uint32_t sw_(int nnounce, int divs)
{
double wmax = ((sqrt((double)(nnounce))*(1.+EPS1))/450+100);
return ((uint32_t)(GaussianQuad_N(swit_, 0., wmax, divs)*(1.+EPS1)*1.e6));
}
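For the record, the deleted magimath.cpp evaluated sw_() by classic n-point Gauss-Legendre quadrature: gauleg() locates the roots z_j of the Legendre polynomial P_n by Newton iteration and derives the weights, after which GaussianQuad_N() applies

\int_a^b f(x)\,dx \;\approx\; \sum_{j=1}^{n} w_j\, f(x_j),
\qquad x_j = \frac{a+b}{2} \pm \frac{b-a}{2}\, z_j,
\qquad w_j = \frac{b-a}{(1 - z_j^2)\,[P_n'(z_j)]^2},

which matches the w[i] = 2.0*xl/((1.0-z*z)*pp*pp) line above with xl = (b-a)/2 and pp = P_n'(z_j).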
View File
@@ -1,54 +0,0 @@
// Copyright (c) 2014 The Magi developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef MAGI_MATH_H
#define MAGI_MATH_H
#include <math.h>
#ifdef __cplusplus
extern "C" {
#endif
uint32_t sw_(int nnounce, int divs);
#ifdef __cplusplus
}
#endif
inline double exp_n(double xt)
{
double p1 = -700.0, p3 = -0.8e-8, p4 = 0.8e-8, p6 = 700.0;
if(xt < p1)
return 0;
else if(xt > p6)
return 1e200;
else if(xt > p3 && xt < p4)
return (1.0 + xt);
else
return exp(xt);
}
// 1 / (1 + exp(x1-x2))
inline double exp_n2(double x1, double x2)
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
double xt = x1 - x2;
if (xt < p1+1.e-200)
return 1.;
else if (xt > p1 && xt < p2 + 1.e-200)
return ( 1. - exp(xt) );
else if (xt > p2 && xt < p3 + 1.e-200)
return ( 1. / (1. + exp(xt)) );
else if (xt > p3 && xt < p4)
return ( 1. / (2. + xt) );
else if (xt > p4 - 1.e-200 && xt < p5)
return ( exp(-xt) / (1. + exp(-xt)) );
else if (xt > p5 - 1.e-200 && xt < p6)
return ( exp(-xt) );
else //if (xt > p6 - 1.e-200)
return 0.;
}
#endif
View File
@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
init_hmq1725_ctx();
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 65536.0;
return true;
};
View File
@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
void hmq1725hash( void *state, const void *input );
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_hmq1725_ctx();
#endif
View File
@@ -4,367 +4,273 @@
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
typedef struct {
sph_blake512_context blake1, blake2;
sph_bmw512_context bmw1, bmw2, bmw3;
sph_skein512_context skein1, skein2;
sph_jh512_context jh1, jh2;
sph_keccak512_context keccak1, keccak2;
hashState_luffa luffa1, luffa2;
cubehashParam cube;
sph_shavite512_context shavite1, shavite2;
#if defined(__aarch64__)
sph_simd512_context simd1, simd2;
#else
hashState_sd simd1, simd2;
#endif
sph_hamsi512_context hamsi1;
sph_shabal512_context shabal1;
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
sph_sha512_context sha1, sha2;
sph_haval256_5_context haval1, haval2;
#if defined(__AES__)
hashState_echo echo1, echo2;
hashState_groestl groestl1, groestl2;
hashState_fugue fugue1, fugue2;
#else
sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2;
sph_fugue512_context fugue1, fugue2;
#endif
} hmq1725_ctx_holder;
static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
void init_hmq1725_ctx()
union _hmq1725_ctx_holder
{
sph_blake512_init(&hmq1725_ctx.blake1);
sph_blake512_init(&hmq1725_ctx.blake2);
sph_bmw512_init(&hmq1725_ctx.bmw1);
sph_bmw512_init(&hmq1725_ctx.bmw2);
sph_bmw512_init(&hmq1725_ctx.bmw3);
sph_skein512_init(&hmq1725_ctx.skein1);
sph_skein512_init(&hmq1725_ctx.skein2);
sph_jh512_init(&hmq1725_ctx.jh1);
sph_jh512_init(&hmq1725_ctx.jh2);
sph_keccak512_init(&hmq1725_ctx.keccak1);
sph_keccak512_init(&hmq1725_ctx.keccak2);
init_luffa( &hmq1725_ctx.luffa1, 512 );
init_luffa( &hmq1725_ctx.luffa2, 512 );
cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
sph_shavite512_init(&hmq1725_ctx.shavite1);
sph_shavite512_init(&hmq1725_ctx.shavite2);
#if defined(__aarch64__)
sph_simd512_init(&hmq1725_ctx.simd1);
sph_simd512_init(&hmq1725_ctx.simd2);
#else
init_sd( &hmq1725_ctx.simd1, 512 );
init_sd( &hmq1725_ctx.simd2, 512 );
#endif
sph_hamsi512_init(&hmq1725_ctx.hamsi1);
#if defined(__AES__)
fugue512_Init( &hmq1725_ctx.fugue1, 512 );
fugue512_Init( &hmq1725_ctx.fugue2, 512 );
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_init(&hmq1725_ctx.fugue1);
sph_fugue512_init(&hmq1725_ctx.fugue2);
sph_fugue512_context fugue;
#endif
sph_shabal512_init(&hmq1725_ctx.shabal1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
sph_sha512_init( &hmq1725_ctx.sha1 );
sph_sha512_init( &hmq1725_ctx.sha2 );
sph_haval256_5_init(&hmq1725_ctx.haval1);
sph_haval256_5_init(&hmq1725_ctx.haval2);
#if defined(__AES__)
init_echo( &hmq1725_ctx.echo1, 512 );
init_echo( &hmq1725_ctx.echo2, 512 );
init_groestl( &hmq1725_ctx.groestl1, 64 );
init_groestl( &hmq1725_ctx.groestl2, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_init( &hmq1725_ctx.groestl1 );
sph_groestl512_init( &hmq1725_ctx.groestl2 );
sph_echo512_init( &hmq1725_ctx.echo1 );
sph_echo512_init( &hmq1725_ctx.echo2 );
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
}
void hmq_bmw512_midstate( const void* input )
{
memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
sph_bmw512( &hmq_bmw_mid, input, 64 );
}
__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha;
sph_haval256_5_context haval;
};
typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
uint32_t hashA[32] __attribute__((aligned(64)));
uint32_t hashB[32] __attribute__((aligned(64)));
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
uint32_t hashA[32] __attribute__((aligned(32)));
uint32_t hashB[32] __attribute__((aligned(32)));
hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, input, 80 );
sph_bmw512_close( &ctx.bmw, hashA ); //1
memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
sph_bmw512_close(&h_ctx.bmw1, hashA); //1
sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0
sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //0
sph_whirlpool_close( &ctx.whirlpool, hashB ); //1
if ( hashB[0] & mask ) //1
{
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
(const char*)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashA, hashB, 512 );
#else
sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
sph_groestl512_close(&h_ctx.groestl1, hashA); //2
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashB, 64 ); //1
sph_groestl512_close( &ctx.groestl, hashA ); //2
#endif
}
else
{
sph_skein512 (&h_ctx.skein1, hashB, 64); //1
sph_skein512_close(&h_ctx.skein1, hashA); //2
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, hashB, 64 ); //1
sph_skein512_close( &ctx.skein, hashA ); //2
}
sph_jh512 (&h_ctx.jh1, hashA, 64); //3
sph_jh512_close(&h_ctx.jh1, hashB); //4
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashA, 64 ); //3
sph_jh512_close( &ctx.jh, hashB ); //4
sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2
sph_keccak512_close(&h_ctx.keccak1, hashA); //3
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //2
sph_keccak512_close( &ctx.keccak, hashA ); //3
if ( hashA[0] & mask ) //4
{
sph_blake512 (&h_ctx.blake1, hashA, 64); //
sph_blake512_close(&h_ctx.blake1, hashB); //5
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
}
else
{
sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4
sph_bmw512_close(&h_ctx.bmw2, hashB); //5
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashA, 64 ); //4
sph_bmw512_close( &ctx.bmw, hashB ); //5
}
update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );
if ( hashB[0] & mask ) //7
{
sph_keccak512 (&h_ctx.keccak2, hashB, 64); //
sph_keccak512_close(&h_ctx.keccak2, hashA); //8
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //
sph_keccak512_close( &ctx.keccak, hashA ); //8
}
else
{
sph_jh512 (&h_ctx.jh2, hashB, 64); //7
sph_jh512_close(&h_ctx.jh2, hashA); //8
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashB, 64 ); //7
sph_jh512_close( &ctx.jh, hashA ); //8
}
sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3
sph_shavite512_close(&h_ctx.shavite1, hashB); //4
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashA, 64 ); //3
sph_shavite512_close( &ctx.shavite, hashB ); //4
#if defined(__aarch64__)
sph_simd512 (&h_ctx.simd1, hashB, 64); //3
sph_simd512_close(&h_ctx.simd1, hashA); //4
#else
update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#endif
simd512_ctx( &ctx.simd, hashA, hashB, 64 );
if ( hashA[0] & mask ) //4
{
sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); //
sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
else
{
sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4
sph_haval256_5_close(&h_ctx.haval1, hashB); //5
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //4
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset(&hashB[8], 0, 32);
}
#if defined(__AES__)
update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashA, 512, hashB, 64 );
#else
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
sph_echo512_close(&h_ctx.echo1, hashA); //6
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashB, 64 ); //5
sph_echo512_close( &ctx.echo, hashA ); //6
#endif
sph_blake512 (&h_ctx.blake2, hashA, 64); //6
sph_blake512_close(&h_ctx.blake2, hashB); //7
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
if ( hashB[0] & mask ) //7
{
sph_shavite512 (&h_ctx.shavite2, hashB, 64); //
sph_shavite512_close(&h_ctx.shavite2, hashA); //8
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashB, 64 ); //
sph_shavite512_close( &ctx.shavite, hashA ); //8
}
else
{
update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
}
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3
sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
sph_hamsi512_close( &ctx.hamsi, hashB ); //4
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue1, hashB, 512 ); //2 ////
fugue512_Final( &h_ctx.fugue1, hashA ); //3
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 ////
sph_fugue512_close(&h_ctx.fugue1, hashA); //3
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //2 ////
sph_fugue512_close( &ctx.fugue, hashA ); //3
#endif
if ( hashA[0] & mask ) //4
{
#if defined(__AES__)
update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashB, 512, hashA, 64 );
#else
sph_echo512 (&h_ctx.echo2, hashA, 64); //
sph_echo512_close(&h_ctx.echo2, hashB); //5
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashA, 64 ); //
sph_echo512_close( &ctx.echo, hashB ); //5
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&h_ctx.simd2, hashA, 64); //6
sph_simd512_close(&h_ctx.simd2, hashB); //7
#else
update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, 64 );
sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5
sph_shabal512_close(&h_ctx.shabal1, hashA); //6
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hashB, 64 ); //5
sph_shabal512_close( &ctx.shabal, hashA ); //6
sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6
sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //6
sph_whirlpool_close( &ctx.whirlpool, hashB ); //7
if ( hashB[0] & mask ) //7
{
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue2, hashB, 512 ); //
fugue512_Final( &h_ctx.fugue2, hashA ); //8
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue2, hashB, 64); //
sph_fugue512_close(&h_ctx.fugue2, hashA); //8
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //
sph_fugue512_close( &ctx.fugue, hashA ); //8
#endif
}
else
{
sph_sha512( &h_ctx.sha1, hashB, 64 );
sph_sha512_close( &h_ctx.sha1, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
}
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
(const char*)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashB, hashA, 512 );
#else
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashA, 64 ); //3
sph_groestl512_close( &ctx.groestl, hashB ); //4
#endif
sph_sha512( &h_ctx.sha2, hashB, 64 );
sph_sha512_close( &h_ctx.sha2, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
if ( hashA[0] & mask ) //4
{
sph_haval256_5 (&h_ctx.haval2, hashA, 64); //
sph_haval256_5_close(&h_ctx.haval2, hashB); //5
memset(&hashB[8], 0, 32);
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset( &hashB[8], 0, 32 );
}
else
{
sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4
sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //4
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5
sph_bmw512_close(&h_ctx.bmw3, hashA); //6
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashB, 64 ); //5
sph_bmw512_close( &ctx.bmw, hashA ); //6
memcpy(state, hashA, 32);
memcpy( state, hashA, 32 );
}
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
//const uint32_t Htarg = ptarget[7];
//we need bigendian data...
// for (int k = 0; k < 32; k++)
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
hmq_bmw512_midstate( endiandata );
// if (opt_debug)
// {
// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
// }
/* I'm too lazy to put the loop in an inline function... so dirty copy'n'paste.... */
/* I know I could set a variable, but I don't know how the compiler would optimize it, and I'd rather the CPU didn't have to reload the value into a register every time. */
if (ptarget[7]==0) {
do {
pdata[19] = ++n;
View File
@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};
View File
@@ -7,12 +7,12 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
void quark_hash(void *state, const void *input)
{
uint32_t hash[16] __attribute__((aligned(64)));
sph_blake512_context ctx_blake;
blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl ctx_groestl;
#else
sph_groestl512_context ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
sph_keccak512_context ctx_keccak;
uint32_t mask = 8;
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, input, 80 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, input, 80 );
sph_bmw512_init( &ctx_bmw );
sph_bmw512( &ctx_bmw, hash, 64 );
sph_bmw512_close( &ctx_bmw, hash );
if ( hash[0] & mask )
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
sph_skein512_close( &ctx_skein, hash );
}
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)
if ( hash[0] & mask )
{
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, hash, 64 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, hash, 64 );
}
else
{
View File
@@ -83,7 +83,7 @@ int scanhash_deep_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );
View File
@@ -236,7 +236,7 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );
View File
@@ -16,7 +16,8 @@ bool register_qubit_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubit_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};
View File
@@ -8,13 +8,9 @@
#include <stdio.h>
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
@@ -25,12 +21,8 @@ typedef struct
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
#ifdef __AES__
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
@@ -45,12 +37,7 @@ void init_qubit_ctx()
init_luffa(&qubit_ctx.luffa,512);
cubehashInit(&qubit_ctx.cubehash,512,16,32);
sph_shavite512_init(&qubit_ctx.shavite);
#if defined(__aarch64__)
sph_simd512_init( &qubit_ctx.simd );
#else
init_sd( &qubit_ctx.simd, 512 );
#endif
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_echo(&qubit_ctx.echo, 512);
#else
sph_echo512_init(&qubit_ctx.echo);
@@ -81,15 +68,9 @@ void qubit_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
#ifdef __AES__
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, 512 );
#else
View File
@@ -35,20 +35,20 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, mm128_not( y ) ), z )
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, mm128_not( z ) ) )
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
a = _mm_add_epi32( v128_rol32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
_mm_set1_epi64x( k ) ), s ), e ); \
c = mm128_rol_32( c, 10 );\
c = v128_rol32( c, 10 );\
} while (0)
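Per 32-bit lane, the vectorized RR() step above is the standard RIPEMD-160 round function (scalar sketch; x is the message word and k the round constant):

static inline uint32_t rol32( uint32_t v, int s )
{
    return ( v << s ) | ( v >> ( 32 - s ) );
}
// one round step:
//   a = rol32( a + f( b, c, d ) + x + k, s ) + e;
//   c = rol32( c, 10 );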
#define ROUND1(a, b, c, d, e, f, s, r, k) \
View File
@@ -205,7 +205,7 @@ void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input,
#endif
#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA2)
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define sha1_neon_rounds( state_out, data, state_in ) \
{ \
View File
@@ -319,7 +319,7 @@ int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
v128_t W[16]; memcpy_128( W, data, 16 );
v128_t W[16]; v128_memcpy( W, data, 16 );
A = v128_load( state_in );
B = v128_load( state_in+1 );
View File
@@ -1,6 +1,6 @@
#include "sha256-hash.h"
#if ( defined(__x86_64__) && defined(__SHA__) ) || defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#if ( defined(__x86_64__) && defined(__SHA__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) )
static const uint32_t SHA256_IV[8] =
{
@@ -189,7 +189,7 @@ static const uint32_t SHA256_IV[8] =
_mm_store_si128( (__m128i*) &state_out[4], STATE1 ); \
}
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) casti_v128( m, i )
@@ -197,7 +197,7 @@ void sha256_opt_transform_le( uint32_t *state_out, const void *input,
#undef load_msg
}
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
void sha256_x86_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) )
@@ -517,7 +517,7 @@ void sha256_opt_transform_be( uint32_t *state_out, const void *input,
_mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y ); \
}
void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
void sha256_x86_x2sha_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
@@ -526,7 +526,7 @@ void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
#undef load_msg
}
void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
void sha256_x86_x2sha_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
@@ -541,7 +541,7 @@ void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
// The goal is to avoid any redundant processing in the final rounds. The
// prehash covers almost 4 full rounds, missing only the final addition of
// the nonce. The nonce must be set to zero for the prehash.
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
uint32_t *sstate, const uint32_t *istate )
{
__m128i STATE0, STATE1, MSG, TMP;
@@ -569,7 +569,7 @@ void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
casti_m128i( ostate, 1 ) = STATE1;
}
void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y )
@@ -587,8 +587,8 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );

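For context on the prehash/final pair being renamed here: the first rounds of the second SHA-256 block depend only on data that is constant across nonces, so they are computed once per work item with the nonce word zeroed, and only the remaining rounds are redone per nonce, two lanes at a time with the SHA extensions. A hedged usage sketch against the prototypes shown in this diff; the choice of msg word 3 as the nonce and the reuse of one midstate for both lanes are assumptions for illustration:

#include <stdint.h>

void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
                                     uint32_t *sstate, const uint32_t *istate );
void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
                  const void *msg_X, const void *msg_Y,
                  const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
                  const uint32_t *state_save_X, const uint32_t *state_save_Y );

/* Hash two nonces against one prehashed midstate (illustrative only). */
void hash_two_nonces( uint32_t *msg0, uint32_t *msg1, const uint32_t *istate,
                      uint32_t n, uint32_t *out0, uint32_t *out1 )
{
    uint32_t mid[8], save[8];
    msg0[3] = 0;                     /* nonce word must be zero for prehash */
    sha256_x86_sha_prehash_3rounds( mid, msg0, save, istate );
    msg0[3] = n;                     /* per-lane nonces, added during final */
    msg1[3] = n + 1;
    sha256_x86_x2sha_final_rounds( out0, out1, msg0, msg1,
                                   mid, mid, save, save );
}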
View File

@@ -5,27 +5,21 @@
#include "simd-utils.h"
#include "cpuminer-config.h"
// generic interface
static const uint32_t SHA256_IV[8];
#if defined(__x86_64__) && defined(__SHA__)
typedef struct
{
unsigned char buf[64]; /* first field, for alignment */
unsigned char buf[64];
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
static const uint32_t SHA256_IV[8];
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
void sha256_ctx_init( sha256_context *ctx );
void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if defined(__x86_64__) && defined(__SHA__)
void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
@@ -50,14 +44,6 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Temporary during name transition
#define sha256_opt_transform_le sha256_x86_sha_transform_le
#define sha256_opt_transform_be sha256_x86_sha_transform_be
#define sha256_ni2x_transform_le sha256_x86_x2sha_transform_le
#define sha256_ni2x_transform_be sha256_x86_x2sha_transform_be
#define sha256_ni_prehash_3rounds sha256_x86_sha_prehash_3rounds
#define sha256_ni2x_final_rounds sha256_x86_x2sha_final_rounds
// generic API
#define sha256_transform_le sha256_x86_sha_transform_le
#define sha256_transform_be sha256_x86_sha_transform_be
@@ -68,6 +54,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
// SHA-256 AArch64 with NEON & SHA2
typedef struct
{
unsigned char buf[64];
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
void sha256_ctx_init( sha256_context *ctx );
void sha256_neon_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
@@ -89,14 +89,6 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Temporary during name transition
#define sha256_transform_le sha256_neon_sha_transform_le
#define sha256_transform_be sha256_neon_sha_transform_be
#define sha256_2x_transform_le sha256_neon_x2sha_transform_le
#define sha256_2x_transform_be sha256_neon_x2sha_transform_be
#define sha256_prehash_3rounds sha256_neon_sha_prehash_3rounds
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
// generic API
#define sha256_transform_le sha256_neon_sha_transform_le
#define sha256_transform_be sha256_neon_sha_transform_be
@@ -106,9 +98,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
#else
// without HW acceleration...
#include "sph_sha2.h"
#define sha256_context sph_sha256_context
#define sha256_full sph_sha256_full
#define sha256_ctx_init sph_sha256_init
#define sha256_update sph_sha256
@@ -117,12 +111,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
#define sha256_transform_be sph_sha256_transform_be
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-256 16 way
// SHA-256 16 way x86_64
typedef struct
{
@@ -147,7 +140,7 @@ void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target );
#define sha256_16way_context sha256_16x32_context
#define sha256_16way_context sha256_16x32_context
#define sha256_16way_init sha256_16x32_init
#define sha256_16way_update sha256_16x32_update
#define sha256_16way_close sha256_16x32_close
@@ -162,7 +155,7 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
#if defined (__AVX2__)
// SHA-256 8 way
// SHA-256 8 way x86_64
typedef struct
{
@@ -201,7 +194,7 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
#endif // AVX2
// SHA-256 4 way
// SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON
typedef struct
{

View File

@@ -5,11 +5,11 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA512256D_8WAY 1
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1
#define SHA512256D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA512256D_2WAY 1
#define SHA512256D_2WAY 1
#endif
#if defined(SHA512256D_8WAY)
@@ -110,14 +110,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = v256_64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
casti_m256i( vdata,9 ) = mm256_intrlv_blend_32( _mm256_set_epi32(
n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
do
{
sha512256d_4way_init( &ctx );
@@ -138,7 +137,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, four );
casti_m256i( vdata,9 ) = _mm256_add_epi32( casti_m256i( vdata,9 ), four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
@@ -180,11 +179,10 @@ int scanhash_sha512256d_2x64( struct work *work, uint32_t max_nonce,
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u64_t two = v128_64( 0x0000000200000000 );
const v128_t two = v128_64( 0x0000000200000000 );
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_add32( v128_set32( 1, 0, 0, 0 ), *noncev );
// *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
@@ -279,7 +277,7 @@ int scanhash_sha512256d( struct work *work, uint32_t max_nonce,
bool register_sha512256d_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA512256D_8WAY)
gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)

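The noncev pointer is traded for direct casti_m256i( vdata, 9 ) accesses, but the injection pattern itself is unchanged. In 4x64-interleaved data the 10th 64-bit header word of all four lanes sits at index 9, and the 32-bit nonce (header word 19) occupies the high half of each 64-bit element, which is what the intrlv_blend/set_epi32 pair writes. A scalar view of the same operation, with the layout stated as an assumption:

/* Scalar equivalent of the nonce blend above, assuming 4x64 interleaving
   with the nonce in the high 32 bits of each lane's 64-bit word 9. */
uint64_t *w9 = (uint64_t*)vdata + 9*4;            /* word 9 of all 4 lanes   */
for ( int lane = 0; lane < 4; lane++ )
    w9[lane] = ( (uint64_t)( n + lane ) << 32 )   /* write this lane's nonce */
             | ( w9[lane] & 0xffffffffu );        /* keep low word intact    */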
View File

@@ -34,8 +34,6 @@
#include <string.h>
#include "shabal-hash-4way.h"
//#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define DECL_STATE16 \
@@ -47,8 +45,6 @@
C8, C9, CA, CB, CC, CD, CE, CF; \
__m512i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m512i FIVE = v512_32( 5 ); \
const __m512i THREE = v512_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE16(state) do \
@@ -292,11 +288,21 @@ do { \
mm512_swap1024_512( BF, CF ); \
} while (0)
static inline __m512i v512_mult_x3( const __m512i x )
{
return _mm512_add_epi32( x, _mm512_slli_epi32( x, 1 ) );
}
static inline __m512i v512_mult_x5( const __m512i x )
{
return _mm512_add_epi32( x, _mm512_slli_epi32( x, 2 ) );
}
#define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
_mm512_mullo_epi32( mm512_xor3( xa0, xc, \
_mm512_mullo_epi32( mm512_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
v512_mult_x3( mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
} while (0)
@@ -644,8 +650,6 @@ shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
C8, C9, CA, CB, CC, CD, CE, CF; \
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m256i FIVE = v256_32( 5 ); \
const __m256i THREE = v256_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE8(state) do \
@@ -889,11 +893,21 @@ do { \
mm256_swap512_256( BF, CF ); \
} while (0)
static inline __m256i v256_mult_x3( const __m256i x )
{
return _mm256_add_epi32( x, _mm256_slli_epi32( x, 1 ) );
}
static inline __m256i v256_mult_x5( const __m256i x )
{
return _mm256_add_epi32( x, _mm256_slli_epi32( x, 2 ) );
}
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
v256_mult_x3( mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0)
@@ -1226,15 +1240,13 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // AVX2
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__SSE2__) || defined(__ARM_NEON)
#define DECL_STATE \
v128u32_t A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB; \
v128u32_t B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, BA, BB, BC, BD, BE, BF; \
v128u32_t C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \
v128u32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; \
const v128u32_t FIVE = v128_32( 5 ); \
const v128u32_t THREE = v128_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE( state ) \
@@ -1479,12 +1491,22 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
v128_swap256_128( BF, CF ); \
}
static inline v128_t v128_mult_x3( const v128_t x )
{
return v128_add32( x, v128_sl32( x, 1 ) );
}
static inline v128_t v128_mult_x5( const v128_t x )
{
return v128_add32( x, v128_sl32( x, 2 ) );
}
#define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
{ \
xa0 = v128_xor3( xm, xb1, v128_xorandnot( \
v128_mul32( v128_xor3( xa0, xc, \
v128_mul32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
v128_mult_x3( v128_xor3( xa0, xc, \
v128_mult_x5( v128_rol32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \
}

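The FIVE and THREE constants disappear from the state declarations because the new helpers compute the products by shift-and-add: 3x = x + 2x and 5x = x + 4x. That replaces _mm_mullo_epi32, which requires SSE4.1 and is comparatively slow on many cores, with add/shift pairs available on plain SSE2 and NEON — which is also what allows the 128-bit gate to relax from __SSE4_1__ to __SSE2__. The scalar identity, for reference:

/* Scalar equivalents of the v128/v256/v512 _mult_x3 and _mult_x5 helpers. */
static inline uint32_t mult_x3( uint32_t x ) { return x + ( x << 1 ); } /* 3x */
static inline uint32_t mult_x5( uint32_t x ) { return x + ( x << 2 ); } /* 5x */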
View File

@@ -62,7 +62,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
#endif
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct {
v128_t buf[16] __attribute__ ((aligned (64)));

View File

@@ -71,7 +71,7 @@ static const uint32_t IV512[] =
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
{
const __m128i zero = _mm_setzero_si128();
const v128_t zero = v128_zero;
__m256i p0, p1, p2, p3, x;
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
__m256i *m = (__m256i*)msg;
@@ -278,7 +278,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
void shavite512_2way_init( shavite512_2way_context *ctx )
{
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
v128_t *iv = (v128_t*)IV512;
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
@@ -358,7 +358,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
count.u32[3] = ctx->count3;
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -434,7 +434,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
}
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -451,7 +451,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len )
{
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
v128_t *iv = (v128_t*)IV512;
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
@@ -524,7 +524,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
}
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

View File

@@ -303,7 +303,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
count.u32[3] = ctx->count3;
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -379,7 +379,7 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
}
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -470,7 +470,7 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
}
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

View File

@@ -159,4 +159,69 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SKEIN_2WAY)
static __thread skein512_2x64_context skein512_2x64_ctx
__attribute__ ((aligned (64)));
void skeinhash_2x64( void *state, const void *input )
{
uint64_t vhash64[8*2] __attribute__ ((aligned (32)));
uint32_t hash0[16] __attribute__ ((aligned (32)));
uint32_t hash1[16] __attribute__ ((aligned (32)));
skein512_2x64_context ctx_skein;
memcpy( &ctx_skein, &skein512_2x64_ctx, sizeof( ctx_skein ) );
skein512_2x64_final16( &ctx_skein, vhash64, input + (64*2) );
dintrlv_2x64( hash0, hash1, vhash64, 512 );
sha256_full( hash0, hash0, 64 );
sha256_full( hash1, hash1, 64 );
intrlv_2x32( state, hash0, hash1, 256 );
}
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*2] __attribute__ ((aligned (32)));
uint32_t hash[8*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash_d7 = &(hash[7<<1]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128u32_t *noncev = (v128u32_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &skein512_2x64_ctx, vdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
skeinhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( ( hash_d7[ lane ] <= targ_d7 ) && !bench ) )
{
extr_lane_2x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -3,16 +3,20 @@
bool register_skein_algo( algo_gate_t* gate )
{
#if defined (SKEIN_8WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_skein_8way;
gate->hash = (void*)&skeinhash_8way;
#elif defined (SKEIN_4WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#elif defined(SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#elif defined(SKEIN_2WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_2x64;
gate->hash = (void*)&skeinhash_2x64;
#else
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif
@@ -21,16 +25,15 @@ bool register_skein_algo( algo_gate_t* gate )
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined (SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SKEIN_8WAY)
gate->scanhash = (void*)&scanhash_skein2_8way;
gate->hash = (void*)&skein2hash_8way;
#elif defined (SKEIN_4WAY)
#elif defined(SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
#elif defined(SKEIN_2WAY)
gate->scanhash = (void*)&scanhash_skein2_2x64;
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif
return true;
};

View File

@@ -7,6 +7,8 @@
#define SKEIN_8WAY 1
#elif defined(__AVX2__)
#define SKEIN_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SKEIN_2WAY 1
#endif
#if defined(SKEIN_8WAY)
@@ -29,6 +31,16 @@ void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#elif defined(SKEIN_2WAY)
void skeinhash_2x64( void *output, const void *input );
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void skein2hash_2x64( void *output, const void *input );
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#else
void skeinhash( void *output, const void *input );

View File

@@ -675,11 +675,13 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
// Close
unsigned et;
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
}
memset_zero_512( buf, buf_size >> 3 );
bcount = 0;
@@ -970,11 +972,13 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
// Close
unsigned et;
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
}
memset_zero_256( buf, buf_size >> 3 );
bcount = 0;
@@ -1364,11 +1368,13 @@ skein512_2x64_full( skein512_2x64_context *sc, void *out, const void *data,
// Close
unsigned et;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
if ( ptr )
{
unsigned et;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
}
v128_memset_zero( buf, buf_size >> 3 );
bcount = 0;

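The same guard is applied at all three widths: the trailing buffer is padded and run through UBI_BIG only when ptr is non-zero, i.e. when unprocessed bytes actually remain, where previously an empty buffer was still compressed. Reduced to its scalar shape (names as in the diff, memset standing in for the vector memset_zero helpers):

/* Close-path guard: only pad and compress a final block when the buffer
   holds leftover bytes. Illustrative reduction of the hunks above. */
if ( ptr )
{
    memset( buf + ptr, 0, buf_size - ptr );     /* zero-pad the tail        */
    et = 352 + ( ( bcount == 0 ) << 7 );        /* UBI tweak for last block */
    UBI_BIG( et, ptr );                         /* compress padded block    */
}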
View File

@@ -5,19 +5,6 @@
#if defined(SKEIN_8WAY)
static __thread skein512_8way_context skein512_8way_ctx
__attribute__ ((aligned (64)));
void skein2hash_8way( void *output, const void *input )
{
uint64_t hash[16*8] __attribute__ ((aligned (128)));
skein512_8way_context ctx;
memcpy( &ctx, &skein512_8way_ctx, sizeof( ctx ) );
skein512_8way_final16( &ctx, hash, input + (64*8) );
skein512_8way_full( &ctx, output, hash, 64 );
}
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -68,19 +55,6 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
#elif defined(SKEIN_4WAY)
static __thread skein512_4way_context skein512_4way_ctx
__attribute__ ((aligned (64)));
void skein2hash_4way( void *output, const void *input )
{
skein512_4way_context ctx;
memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) );
uint64_t hash[16*4] __attribute__ ((aligned (64)));
skein512_4way_final16( &ctx, hash, input + (64*4) );
skein512_4way_full( &ctx, output, hash, 64 );
}
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -128,4 +102,53 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SKEIN_2WAY)
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
skein512_2x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u64_t two = v128_64( 0x0000000200000000 );
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &ctx, vdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
skein512_2x64_final16( &ctx, hash, vdata + (16*2) );
skein512_2x64_full( &ctx, hash, hash, 64 );
for ( int lane = 0; lane < 2; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, two );
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

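Both new 2x64 scan loops use the same cheap rejection filter: a single high hash word is compared against the corresponding target word (hash_d7/targ_d7 as 32-bit words in skeinhash_2x64, hash_q3/targ_q3 as 64-bit words here), and the full 256-bit valid_hash() check runs only for the rare lanes that pass. A scalar sketch of the idea; the word-7-is-most-significant layout and the valid_hash signature are assumptions taken from the surrounding code:

extern int valid_hash( const uint32_t *hash, const uint32_t *target );

/* Coarse-then-exact share test, as used by the 2x64 scan loops above. */
static inline int share_candidate( const uint32_t *hash,
                                   const uint32_t *target )
{
    if ( hash[7] > target[7] ) return 0;   /* rejects almost everything */
    return valid_hash( hash, target );     /* exact 256-bit comparison  */
}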
View File

@@ -1,369 +0,0 @@
#include "Swifftx_sha3.h"
extern "C" {
#include "SWIFFTX.h"
}
#include <math.h>
#include <stdlib.h>
#include <string.h>
// The default salt value.
// This is the expansion of e (Euler's number) - the 19 digits after 2.71:
// 8281828459045235360.
// The above in base 256, from MSB to LSB:
BitSequence SWIF_saltValueChar[SWIF_HAIFA_SALT_SIZE] = {114, 238, 247, 26, 192, 28, 170, 160};
// All the IVs here below were produced from the decimal digits of e's expansion.
// The code can be found in 'ProduceRandomIV.c'.
// The initial value for 224 digest size.
const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{37, 242, 132, 2, 167, 81, 158, 237, 113, 77, 162, 60, 65, 236, 108, 246,
101, 72, 190, 109, 58, 205, 99, 6, 114, 169, 104, 114, 38, 146, 121, 142,
59, 98, 233, 84, 72, 227, 22, 199, 17, 102, 198, 145, 24, 178, 37, 1,
215, 245, 66, 120, 230, 193, 113, 253, 165, 218, 66, 134, 49, 231, 124, 204,
0};
// The initial value for 256 digest size.
const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{250, 50, 42, 40, 14, 233, 53, 48, 227, 42, 237, 187, 211, 120, 209, 234,
27, 144, 4, 61, 243, 244, 29, 247, 37, 162, 70, 11, 231, 196, 53, 6,
193, 240, 94, 126, 204, 132, 104, 46, 114, 29, 3, 104, 118, 184, 201, 3,
57, 77, 91, 101, 31, 155, 84, 199, 228, 39, 198, 42, 248, 198, 201, 178,
8};
// The initial value for 384 digest size.
const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{40, 145, 193, 100, 205, 171, 47, 76, 254, 10, 196, 41, 165, 207, 200, 79,
109, 13, 75, 201, 17, 172, 64, 162, 217, 22, 88, 39, 51, 30, 220, 151,
133, 73, 216, 233, 184, 203, 77, 0, 248, 13, 28, 199, 30, 147, 232, 242,
227, 124, 169, 174, 14, 45, 27, 87, 254, 73, 68, 136, 135, 159, 83, 152,
0};
// The initial value for 512 digest size.
const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{195, 126, 197, 167, 157, 114, 99, 126, 208, 105, 200, 90, 71, 195, 144, 138,
142, 122, 123, 116, 24, 214, 168, 173, 203, 183, 194, 210, 102, 117, 138, 42,
114, 118, 132, 33, 35, 149, 143, 163, 163, 183, 243, 175, 72, 22, 201, 255,
102, 243, 22, 187, 211, 167, 239, 76, 164, 70, 80, 182, 181, 212, 9, 185,
0};
///////////////////////////////////////////////////////////////////////////////////////////////
// NIST API implementation portion.
///////////////////////////////////////////////////////////////////////////////////////////////
int Swifftx::Init(int hashbitlen)
{
switch(hashbitlen)
{
case 224:
swifftxState.hashbitlen = hashbitlen;
// Initializes h_0 in HAIFA:
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_224, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 256:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_256, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 384:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_384, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 512:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_512, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
default:
return BAD_HASHBITLEN;
}
swifftxState.wasUpdated = false;
swifftxState.remainingSize = 0;
memset(swifftxState.remaining, 0, SWIF_HAIFA_INPUT_BLOCK_SIZE);
memset(swifftxState.numOfBitsChar, 0, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// Initialize the salt with the default value.
memcpy(swifftxState.salt, SWIF_saltValueChar, SWIF_HAIFA_SALT_SIZE);
InitializeSWIFFTX();
return SUCCESS;
}
int Swifftx::Update(const BitSequence *data, DataLength databitlen)
{
// The size of the input in bytes after prepending the remaining data from the previous invocation.
int sizeOfInputAfterRemaining = 0;
// The input block to compression function of SWIFFTX:
BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
// Whether we handled a single block.
bool wasSingleBlockHandled = false;
swifftxState.wasUpdated = true;
// Handle an empty message as required by NIST. Since 'Final()' is oblivious to the input
// (but of course uses the output of the compression function from the previous round,
// which is called h_{i-1} in the HAIFA article), we have to do nothing here.
if (databitlen == 0)
return SUCCESS;
// If a previous input had an unaligned length, return an error
if (swifftxState.remainingSize % 8)
{
return INPUT_DATA_NOT_ALIGNED;
}
// Convert remaining size to bytes.
swifftxState.remainingSize /= 8;
// As long as we have enough data combined from (remaining + data) to fill input block
// ROUND SETTING
while (((databitlen / 8) + swifftxState.remainingSize) >= SWIF_HAIFA_INPUT_BLOCK_SIZE)
{
// Fill the input block with data:
// 1. The output of the previous block:
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2. The input part of the block:
// 2a. The remaining data from the previous 'Update()' call:
if (swifftxState.remainingSize)
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining,
swifftxState.remainingSize);
// 2b. The input data that we have place for after the 'remaining':
sizeOfInputAfterRemaining = SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE
- ((int) swifftxState.remainingSize) - SWIF_HAIFA_NUM_OF_BITS_SIZE
- SWIF_HAIFA_SALT_SIZE;
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize,
data, sizeOfInputAfterRemaining);
// 3. The #bits part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize
+ sizeOfInputAfterRemaining,
swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize
+ sizeOfInputAfterRemaining + SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt, SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, false);
// Update the #bits field with SWIF_HAIFA_INPUT_BLOCK_SIZE.
AddToCurrInBase256(swifftxState.numOfBitsChar, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8);
wasSingleBlockHandled = true;
data += sizeOfInputAfterRemaining;
databitlen -= (sizeOfInputAfterRemaining * 8);
swifftxState.remainingSize = 0;
}
// Update the swifftxState.remaining and swifftxState.remainingSize.
// remainingSize will be in bits after exiting 'Update()'.
if (wasSingleBlockHandled)
{
swifftxState.remainingSize = (unsigned int) databitlen; // now remaining size is in bits.
if (swifftxState.remainingSize)
memcpy(swifftxState.remaining, data, (swifftxState.remainingSize + 7) / 8);
}
else
{
memcpy(swifftxState.remaining + swifftxState.remainingSize, data,
(size_t) (databitlen + 7) / 8);
swifftxState.remainingSize = (swifftxState.remainingSize * 8) + (unsigned short) databitlen;
}
return SUCCESS;
}
int Swifftx::Final(BitSequence *hashval)
{
int i;
// Whether to add one last block. True if the padding appended to the last block overflows
// the block size.
bool toAddFinalBlock = false;
bool toPutOneInFinalBlock = false;
unsigned short oneShift = 0;
// The size of the last input block before the zero padding. We add 1 because
// we include the final '1' bit in the calculation, and 7 to round the length
// up to whole bytes.
unsigned short sizeOfLastInputBlock = (swifftxState.remainingSize + 1 + 7) / 8;
// The number of bytes of zero in the padding part.
// The padding contains:
// 1. A single 1 bit.
// 2. As many zeroes as needed.
// 3. The message length in bits. Occupies SWIF_HAIFA_NUM_OF_BITS_SIZE bytes.
// 4. The digest size. Maximum is 512, so we need 2 bytes.
// If the total number achieved is negative, add an additional block, as HAIFA specifies.
short numOfZeroBytesInPadding = (short) SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE
- sizeOfLastInputBlock - (2 * SWIF_HAIFA_NUM_OF_BITS_SIZE) - 2
- SWIF_HAIFA_SALT_SIZE;
// The input block to compression function of SWIFFTX:
BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
// The message length in base 256.
BitSequence messageLengthChar[SWIF_HAIFA_NUM_OF_BITS_SIZE] = {0};
// The digest size used for padding:
unsigned char digestSizeLSB = swifftxState.hashbitlen % 256;
unsigned char digestSizeMSB = (swifftxState.hashbitlen - digestSizeLSB) / 256;
if (numOfZeroBytesInPadding < 1)
toAddFinalBlock = true;
// Fill the input block with data:
// 1. The output of the previous block:
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2a. The input part of the block, which is the remaining data from the previous 'Update()'
// call, if any, plus an extra '1' bit (maybe all we have is this extra 1):
// Add the last 1 in big-endian convention ...
if (swifftxState.remainingSize % 8 == 0)
{
swifftxState.remaining[sizeOfLastInputBlock - 1] = 0x80;
}
else
{
swifftxState.remaining[sizeOfLastInputBlock - 1] |= (1 << (7 - (swifftxState.remainingSize % 8)));
}
if (sizeOfLastInputBlock)
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining,
sizeOfLastInputBlock);
// Compute the message length in base 256:
for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
messageLengthChar[i] = swifftxState.numOfBitsChar[i];
if (sizeOfLastInputBlock)
AddToCurrInBase256(messageLengthChar, sizeOfLastInputBlock * 8);
if (!toAddFinalBlock)
{
// 2b. Put the zeroes:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
0, numOfZeroBytesInPadding);
// 2c. Pad the message length:
for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + i] = messageLengthChar[i];
// 2d. Pad the digest size:
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE] = digestSizeMSB;
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE + 1] = digestSizeLSB;
}
else
{
// 2b. Put the zeroes, if at all:
if ((SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock) > 0)
{
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
0, SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock);
}
}
// 3. The #bits part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE,
swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
+ SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt,
SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, !toAddFinalBlock);
// If we have to add one more block, it is now:
if (toAddFinalBlock)
{
// 1. The previous output block, as usual.
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2a. Instead of the input, zeroes:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE , 0,
SWIF_HAIFA_INPUT_BLOCK_SIZE - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2);
// 2b. Instead of the input, the message length:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
- SWIF_HAIFA_NUM_OF_BITS_SIZE - 2,
messageLengthChar,
SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 2c. Instead of the input, the digest size:
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 2] = digestSizeMSB;
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 1] = digestSizeLSB;
// 3. The #bits part of the block, which is zero in case of additional block:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE,
0,
SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
+ SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt,
SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, true);
}
// Finally, copy the result into 'hashval'. In case the digest size is not 512bit, copy the
// first hashbitlen of them:
for (i = 0; i < (swifftxState.hashbitlen / 8); ++i)
hashval[i] = swifftxState.currOutputBlock[i];
return SUCCESS;
}
int Swifftx::Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
BitSequence *hashval)
{
int result;
//hashState state;
// The pointer to the current place in the input we take into the compression function.
DataLength currInputIndex = 0;
result = Swifftx::Init(hashbitlen);
if (result != SUCCESS)
return result;
for ( ; (databitlen / 8) > SWIF_HAIFA_INPUT_BLOCK_SIZE;
currInputIndex += SWIF_HAIFA_INPUT_BLOCK_SIZE, databitlen -= (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8))
{
result = Swifftx::Update(data + currInputIndex, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8);
if (result != SUCCESS)
return result;
}
// The length of the last block may be shorter than (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8)
result = Swifftx::Update(data + currInputIndex, databitlen);
if (result != SUCCESS)
{
return result;
}
return Swifftx::Final(hashval);
}
///////////////////////////////////////////////////////////////////////////////////////////////
// Helper function implementation portion.
///////////////////////////////////////////////////////////////////////////////////////////////
void Swifftx::AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE],
unsigned short toAdd)
{
unsigned char remainder = 0;
short i;
BitSequence currValueInBase256[8] = {0};
unsigned short currIndex = 7;
unsigned short temp = 0;
do
{
remainder = toAdd % 256;
currValueInBase256[currIndex--] = remainder;
toAdd -= remainder;
toAdd /= 256;
}
while(toAdd != 0);
for (i = 7; i >= 0; --i)
{
temp = value[i] + currValueInBase256[i];
if (temp > 255)
{
value[i] = temp % 256;
currValueInBase256[i - 1]++;
}
else
value[i] = (unsigned char) temp;
}
}

View File

@@ -1,79 +0,0 @@
#ifndef SWIFFTX_SHA3_H
#define SWIFFTX_SHA3_H
#include "sha3_interface.h"
#include "stdbool.h"
#include "stdint.h"
class Swifftx : public SHA3 {
#define SWIFFTX_INPUT_BLOCK_SIZE 256
#define SWIFFTX_OUTPUT_BLOCK_SIZE 65
#define SWIF_HAIFA_SALT_SIZE 8
#define SWIF_HAIFA_NUM_OF_BITS_SIZE 8
#define SWIF_HAIFA_INPUT_BLOCK_SIZE (SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE \
- SWIF_HAIFA_NUM_OF_BITS_SIZE - SWIF_HAIFA_SALT_SIZE)
typedef unsigned char BitSequence;
//const DataLength SWIF_SALT_VALUE;
#define SWIF_HAIFA_IV 0
/*const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE];*/
typedef enum
{
SUCCESS = 0,
FAIL = 1,
BAD_HASHBITLEN = 2,
BAD_SALT_SIZE = 3,
SET_SALT_VALUE_FAILED = 4,
INPUT_DATA_NOT_ALIGNED = 5
} HashReturn;
typedef struct hashState {
unsigned short hashbitlen;
// The data remaining after the most recent call to 'Update()'.
BitSequence remaining[SWIF_HAIFA_INPUT_BLOCK_SIZE + 1];
// The size of the remaining data in bits.
// Is 0 in case there is no remaining data at all.
unsigned int remainingSize;
// The current output of the compression function. At the end it will contain
// the final digest (which may need to be truncated, depending on hashbitlen).
BitSequence currOutputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE];
// The value of '#bits hashed so far' field in HAIFA, in base 256.
BitSequence numOfBitsChar[SWIF_HAIFA_NUM_OF_BITS_SIZE];
// The salt value currently in use:
BitSequence salt[SWIF_HAIFA_SALT_SIZE];
// Indicates whether a single 'Update()' occurred.
// After a call to 'Update()' the key and the salt values cannot be changed.
bool wasUpdated;
} hashState;
private:
int swifftxNumRounds;
hashState swifftxState;
public:
int Init(int hashbitlen);
int Update(const BitSequence *data, DataLength databitlen);
int Final(BitSequence *hashval);
int Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
BitSequence *hashval);
private:
static void AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], unsigned short toAdd);
};
#endif

View File

@@ -1,21 +0,0 @@
#pragma once
#include <cstdint>
namespace hash {
using BitSequence = unsigned char;
using DataLength = unsigned long long;
struct hash_interface {
virtual ~hash_interface() = default;
virtual int Init(int hash_bitsize) = 0;
virtual int Update(const BitSequence *data, DataLength data_bitsize) = 0;
virtual int Final(BitSequence *hash) = 0;
virtual int
Hash(int hash_bitsize, const BitSequence *data, DataLength data_bitsize, BitSequence *hash) = 0;
};
} // namespace hash

View File

@@ -1,14 +0,0 @@
#pragma once
#include <cstdint>
//#include <streams/hash/hash_interface.h>
#include "hash_interface.h"
namespace sha3 {
using BitSequence = hash::BitSequence;
using DataLength = hash::DataLength;
struct sha3_interface : hash::hash_interface {};
} // namespace sha3

View File

@@ -191,7 +191,7 @@ static void rotate_indexes( uint32_t *p )
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
#define MULXOR \
casti_v128( hash, 0 ) = v128_mul32( v128_xor( \
@@ -251,7 +251,7 @@ void verthash_hash( const void *blob_bytes, const size_t blob_size,
/ VH_BYTE_ALIGNMENT ) + 1;
#if defined (__AVX2__)
const __m256i k = _mm256_set1_epi32( 0x1000193 );
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
const v128u32_t k = v128_32( 0x1000193 );
#endif

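Re-enabling the NEON branch is straightforward here because MULXOR is plain lane-wise arithmetic: xor the hash state with a blob word, then multiply by the 32-bit FNV prime 0x1000193 — the AVX2 path simply does eight such lanes at once, the 128-bit path four. The scalar form of one lane:

/* Scalar equivalent of one MULXOR lane above (FNV-style mix). */
static inline uint32_t mulxor( uint32_t h, uint32_t blob_word )
{
    return ( h ^ blob_word ) * 0x1000193u;   /* xor, then FNV prime multiply */
}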
View File

@@ -129,7 +129,7 @@ bool register_verthash_algo( algo_gate_t* gate )
{
opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT | NEON_OPT;
const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file;

View File

@@ -16,14 +16,14 @@ bool register_timetravel_algo( algo_gate_t* gate )
return true;
};
inline void tt_swap( int *a, int *b )
static inline void tt_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
static inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{

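For context on why this matters (issue #420 in the v24.1 notes): in C99/C11, a function defined with plain inline and no matching extern declaration provides no external definition, so any call the compiler declines to inline — typically at -O0 — fails at link time with an undefined reference. static inline gives each translation unit its own internal-linkage definition, which is always safe for small helpers like these. A minimal illustration:

/* C99/C11 linkage behind the inline -> static inline change. */
inline void swap_bad( int *a, int *b )       /* no external definition is   */
{ int c = *a; *a = *b; *b = c; }             /* emitted; a non-inlined call
                                                can fail to link            */

static inline void swap_ok( int *a, int *b ) /* internal linkage: each      */
{ int c = *a; *a = *b; *b = c; }             /* translation unit gets its
                                                own definition              */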
View File

@@ -16,14 +16,14 @@ bool register_timetravel10_algo( algo_gate_t* gate )
return true;
};
inline void tt10_swap( int *a, int *b )
static inline void tt10_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
static inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{

View File

@@ -506,4 +506,156 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X11GOST_2WAY)
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
union _x11gost_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
jh512_2x64_context jh;
keccak512_2x64_context keccak;
skein512_2x64_context skein;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_gost512_context gost;
};
typedef union _x11gost_context_overlay x11gost_context_overlay;
int x11gost_2x64_hash( void *state, const void *input, int thr_id )
{
uint8_t vhash[80*2] __attribute__((aligned(64)));
uint8_t hash0[64] __attribute__((aligned(64)));
uint8_t hash1[64] __attribute__((aligned(64)));
x11gost_context_overlay ctx;
intrlv_2x64( vhash, input, input+80, 640 );
blake512_2x64_full( &ctx.blake, vhash, vhash, 80 );
bmw512_2x64_init( &ctx.bmw );
bmw512_2x64_update( &ctx.bmw, vhash, 64 );
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, hash0, 512 );
groestl512_full( &ctx.groestl, hash1, hash1, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash0, 64 );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash1, 64 );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
intrlv_2x64( vhash, hash0, hash1, 512 );
skein512_2x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_2x64_ctx( &ctx.jh, vhash, vhash, 64 );
keccak512_2x64_ctx( &ctx.keccak, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
cubehash_full( &ctx.cube, hash0, 512, hash0, 64 );
cubehash_full( &ctx.cube, hash1, 512, hash1, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
echo_full( &ctx.echo, hash1, 512, hash1, 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash0, 64 );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash1, 64 );
sph_echo512_close( &ctx.echo, hash1 );
#endif
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
return 1;
}
int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*2] __attribute__((aligned(64)));
uint32_t edata[20*2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_80( edata, pdata );
memcpy( edata+20, edata, 80 );
do
{
edata[19] = n;
edata[39] = n+1;
if ( likely( x11gost_2x64_hash( hash, edata, thr_id ) ) )
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash, mythr );
}
if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+1 );
submit_solution( work, hash+8, mythr );
}
}
n += 2;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}
#endif

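Unlike the interleaved 4-way and 8-way paths, this 2x64 fallback hashes two complete headers laid end to end: edata holds two byte-swapped copies of the 80-byte header, so lane 0's nonce is 32-bit word 19 and lane 1's is word 39 (19 + 20). The layout, annotated (helper names as in the diff):

/* Two side-by-side 80-byte headers for the 2x64 path (annotated copy of
   the setup lines above). */
uint32_t edata[20*2] __attribute__((aligned(64)));
v128_bswap32_80( edata, pdata );    /* lane 0: byte-swapped header copy   */
memcpy( edata + 20, edata, 80 );    /* lane 1: identical second copy      */
edata[19] = n;                      /* lane 0 nonce = word 19             */
edata[39] = n + 1;                  /* lane 1 nonce = word 19 + 20        */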
View File

@@ -2,20 +2,24 @@
bool register_x11gost_algo( algo_gate_t* gate )
{
#if defined (X11GOST_8WAY)
#if defined(X11GOST_8WAY)
init_x11gost_8way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_8way;
gate->hash = (void*)&x11gost_8way_hash;
#elif defined (X11GOST_4WAY)
#elif defined(X11GOST_4WAY)
init_x11gost_4way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_4way;
gate->hash = (void*)&x11gost_4way_hash;
#elif defined(X11GOST_2WAY)
gate->scanhash = (void*)&scanhash_x11gost_2x64;
gate->hash = (void*)&x11gost_2x64_hash;
#else
init_x11gost_ctx();
gate->scanhash = (void*)&scanhash_x11gost;
gate->hash = (void*)&x11gost_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -8,6 +8,8 @@
#define X11GOST_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X11GOST_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X11GOST_2WAY 1
#endif
bool register_x11gost_algo( algo_gate_t* gate );
@@ -26,6 +28,12 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x11gost_4way_ctx();
#elif defined(X11GOST_2WAY)
int x11gost_2x64_hash( void *state, const void *input, int thr_id );
int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x11gost_hash( void *state, const void *input );

View File

@@ -1,6 +1,8 @@
#include "x11gost-gate.h"
#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY)
// No longer used; it was not working when last used.
#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY) && !defined(X11GOST_2WAY)
#include <stdlib.h>
#include <stdint.h>

View File

@@ -155,13 +155,13 @@ void skunk_4way_hash( void *output, const void *input )
skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 );
cubehashUpdateDigest( &ctx.cube, hash0, hash0, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
cubehashUpdateDigest( &ctx.cube, hash1, hash1, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
cubehashUpdateDigest( &ctx.cube, hash2, hash2, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
cubehashUpdateDigest( &ctx.cube, hash3, hash3, 64 );
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
fugue512_full( &ctx.fugue, hash1, hash1, 64 );

View File

@@ -23,13 +23,12 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
*sptr = '\0';
}
static __thread x16r_context_overlay hex_ctx;
int hex_hash( void* output, const void* input, int thrid )
{
uint32_t _ALIGN(128) hash[16];
x16r_context_overlay ctx;
memcpy( &ctx, &hex_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
void *in = (void*) input;
int size = 80;
@@ -52,7 +51,7 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
groestl512_full( &ctx.groestl, hash, in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
@@ -87,7 +86,7 @@ int hex_hash( void* output, const void* input, int thrid )
case LUFFA:
if ( i == 0 )
{
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
}
else
{
@@ -97,7 +96,7 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case CUBEHASH:
if ( i == 0 )
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
else
{
cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -108,26 +107,15 @@ int hex_hash( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
simd512_ctx( &ctx.simd, hash, in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,
(const BitSequence *)in, size );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash, 512, in, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
@@ -216,32 +204,32 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
switch ( algo )
{
case JH:
sph_jh512_init( &hex_ctx.jh );
sph_jh512( &hex_ctx.jh, edata, 64 );
sph_jh512_init( &x16r_ref_ctx.jh );
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
break;
case SKEIN:
sph_skein512_init( &hex_ctx.skein );
sph_skein512( &hex_ctx.skein, edata, 64 );
sph_skein512_init( &x16r_ref_ctx.skein );
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
break;
case LUFFA:
init_luffa( &hex_ctx.luffa, 512 );
update_luffa( &hex_ctx.luffa, edata, 64 );
init_luffa( &x16r_ref_ctx.luffa, 512 );
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
break;
case CUBEHASH:
cubehashInit( &hex_ctx.cube, 512, 16, 32 );
cubehashUpdate( &hex_ctx.cube, edata, 64 );
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
break;
case HAMSI:
sph_hamsi512_init( &hex_ctx.hamsi );
sph_hamsi512( &hex_ctx.hamsi, edata, 64 );
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 64 );
break;
case SHABAL:
sph_shabal512_init( &hex_ctx.shabal );
sph_shabal512( &hex_ctx.shabal, edata, 64 );
sph_shabal512_init( &x16r_ref_ctx.shabal );
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &hex_ctx.whirlpool );
sph_whirlpool( &hex_ctx.whirlpool, edata, 64 );
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
break;
}

View File

@@ -11,29 +11,29 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/yespower/yespower.h"
//#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
//#else
#else
#include "algo/echo/sph_echo.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
//#endif
#if defined(__AES__)
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/simd/nist.h"
// Config
#define MINOTAUR_ALGO_COUNT 16
@@ -47,14 +47,17 @@ typedef struct TortureGarden TortureGarden;
// Graph of hash algos plus SPH contexts
struct TortureGarden
{
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_echo512_context echo;
sph_groestl512_context groestl;
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -67,11 +70,7 @@ struct TortureGarden
cubehashParam cube;
shavite512_context shavite;
hashState_luffa luffa;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -93,9 +92,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
switch ( algo )
{
case 0:
blake512_init( &garden->blake );
blake512_update( &garden->blake, input, 64 );
blake512_close( &garden->blake, hash );
blake512_full( &garden->blake, hash, input, 64 );
break;
case 1:
sph_bmw512_init( &garden->bmw );
@@ -107,7 +104,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
cubehashUpdateDigest( &garden->cube, hash, input, 64 );
break;
case 3:
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &garden->echo, hash, 512, input, 64 );
#else
sph_echo512_init( &garden->echo );
@@ -116,14 +113,14 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
#endif
break;
case 4:
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &garden->fugue, hash, input, 64 );
#else
sph_fugue512_full( &garden->fugue, hash, input, 64 );
#endif
break;
case 5:
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &garden->groestl, hash, input, 512 );
#else
sph_groestl512_init( &garden->groestl) ;
@@ -165,13 +162,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
sph_shavite512_close( &garden->shavite, hash );
break;
case 13:
#if defined(__aarch64__)
sph_simd512_init( &garden->simd );
sph_simd512( &garden->simd, input, 64);
sph_simd512_close( &garden->simd, hash );
#else
simd_full( &garden->simd, (BitSequence *)hash, input, 512 );
#endif
simd512_ctx( &garden->simd, hash, input, 64 );
break;
case 14:
sph_skein512_init( &garden->skein );

View File

@@ -19,12 +19,12 @@
// Perform midstate prehash of hash functions with block size <= 72 bytes,
// or 76 bytes for hash functions that operate on 32-bit data.
void x16r_8way_prehash( void *vdata, void *pdata )
void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
{
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
const char elem = x16r_hash_order[0];
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -110,7 +110,8 @@ void x16r_8way_prehash( void *vdata, void *pdata )
// Called by wrapper hash function to optionally continue hashing and
// convert to final hash.
int x16r_8way_hash_generic( void* output, const void* input, int thrid )
int x16r_8way_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t vhash[20*8] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (16)));
@@ -136,9 +137,9 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
input, 640 );
for ( int i = 0; i < 16; i++ )
for ( int i = 0; i < func_count; i++ )
{
const char elem = x16r_hash_order[i];
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -474,7 +475,8 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
int x16r_8way_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*8] __attribute__ ((aligned (128)));
if ( !x16r_8way_hash_generic( hash, input, thrid ) )
if ( !x16r_8way_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
@@ -495,7 +497,6 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -508,21 +509,18 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_8way_prehash( vdata, pdata );
x16r_8way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
@@ -546,12 +544,12 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
#elif defined (X16R_4WAY)
void x16r_4way_prehash( void *vdata, void *pdata )
void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
{
uint32_t vdata2[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
const char elem = x16r_hash_order[0];
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -627,7 +625,8 @@ void x16r_4way_prehash( void *vdata, void *pdata )
}
}
int x16r_4way_hash_generic( void* output, const void* input, int thrid )
int x16r_4way_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t vhash[20*4] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
@@ -644,9 +643,9 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
for ( int i = 0; i < 16; i++ )
for ( int i = 0; i < func_count; i++ )
{
const char elem = x16r_hash_order[i];
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -908,7 +907,8 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
int x16r_4way_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*4] __attribute__ ((aligned (64)));
if ( !x16r_4way_hash_generic( hash, input, thrid ) )
if ( !x16r_4way_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
@@ -924,7 +924,6 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -937,20 +936,18 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_4way_prehash( vdata, pdata );
x16r_4way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
@@ -971,4 +968,404 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_2WAY)
void x16r_2x64_prehash( void *vdata, void *pdata, const char *hash_order )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
v128_bswap32_intrlv80_2x64( vdata, pdata );
jh512_2x64_init( &x16r_ctx.jh );
jh512_2x64_update( &x16r_ctx.jh, vdata, 64 );
break;
case KECCAK:
v128_bswap32_intrlv80_2x64( vdata, pdata );
keccak512_2x64_init( &x16r_ctx.keccak );
keccak512_2x64_update( &x16r_ctx.keccak, vdata, 72 );
break;
case SKEIN:
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &x16r_ctx.skein, vdata );
break;
case LUFFA:
{
v128_bswap32_80( edata, pdata );
init_luffa( &x16r_ctx.luffa, 512 );
update_luffa( &x16r_ctx.luffa, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
}
break;
case CUBEHASH:
{
v128_bswap32_80( edata, pdata );
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ctx.cube, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
}
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
v128_bswap32_intrlv80_2x64( vdata, pdata );
hamsi512_2x64_init( &x16r_ctx.hamsi );
hamsi512_2x64_update( &x16r_ctx.hamsi, vdata, 72 );
#else
v128_bswap32_80( edata, pdata );
sph_hamsi512_init( &x16r_ctx.hamsi );
sph_hamsi512( &x16r_ctx.hamsi, edata, 72 );
intrlv_2x64( vdata, edata, edata, 640 );
#endif
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_init( &x16r_ctx.fugue );
fugue512_update( &x16r_ctx.fugue, edata, 76 );
#else
sph_fugue512_init( &x16r_ctx.fugue );
sph_fugue512( &x16r_ctx.fugue, edata, 76 );
#endif
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_80( edata, pdata );
sph_shabal512_init( &x16r_ctx.shabal );
sph_shabal512( &x16r_ctx.shabal, edata, 64);
intrlv_2x64( vdata, edata, edata, 640 );
break;
case WHIRLPOOL:
v128_bswap32_80( edata, pdata );
sph_whirlpool_init( &x16r_ctx.whirlpool );
sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
default:
v128_bswap32_intrlv80_2x64( vdata, pdata );
}
}
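The prehash function above exploits the fact that only the last bytes of the 80-byte block header change while scanning nonces (the 4-byte nonce sits at offset 76), so any hash whose internal block boundary falls at or before the nonce can absorb the constant prefix once per work unit. A minimal sketch of the pattern, using hypothetical hash_init/hash_update/hash_close names rather than any API from this codebase:
/* Midstate prehash sketch -- hash_* names are illustrative only.
   The first 64 bytes are absorbed once; each nonce then only pays for the
   16-byte tail, mirroring the memcpy( &ctx, &x16r_ctx, ... ) plus
   "input + 64, 16" pattern used by the hash functions in this file. */
uint32_t header[20];                      /* 80-byte block header */
hash_ctx midstate, ctx;
hash_init( &midstate );
hash_update( &midstate, header, 64 );     /* constant prefix, done once */
for ( uint32_t n = first_nonce; n < last_nonce; n++ )
{
   header[19] = n;                        /* nonce lives at offset 76 */
   ctx = midstate;                        /* cheap context copy per nonce */
   hash_update( &ctx, &header[16], 16 );  /* only the 16-byte tail */
   hash_close( &ctx, digest );
}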
int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
x16r_2x64_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
int size = 80;
dintrlv_2x64( hash0, hash1, input, 640 );
for ( int i = 0; i < func_count; i++ )
{
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
if ( i == 0 )
blake512_2x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case BMW:
bmw512_2x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_2x64_update( &ctx.bmw, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
bmw512_2x64_update( &ctx.bmw, vhash, size );
}
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case GROESTL:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in0, size );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in1, size );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
break;
case JH:
if ( i == 0 )
jh512_2x64_update( &ctx.jh, input + (64*2), 16 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
jh512_2x64_init( &ctx.jh );
jh512_2x64_update( &ctx.jh, vhash, size );
}
jh512_2x64_close( &ctx.jh, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case KECCAK:
if ( i == 0 )
keccak512_2x64_update( &ctx.keccak, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
keccak512_2x64_init( &ctx.keccak );
keccak512_2x64_update( &ctx.keccak, vhash, size );
}
keccak512_2x64_close( &ctx.keccak, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case LUFFA:
if ( i == 0 )
{
update_and_final_luffa( &ctx.luffa, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
update_and_final_luffa( &ctx.luffa, hash1, in1 + 64, 16 );
}
else
{
luffa_full( &ctx.luffa, hash0, 512, hash0, size );
luffa_full( &ctx.luffa, hash1, 512, hash1, size );
}
break;
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
}
else
{
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
}
break;
case SHAVITE:
shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size );
break;
case SIMD:
simd512_ctx( &ctx.simd, hash0, in0, size );
simd512_ctx( &ctx.simd, hash1, in1, size );
break;
case ECHO:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, in0, size );
echo_full( &ctx.echo, hash1, 512, in1, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in0, size );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in1, size );
sph_echo512_close( &ctx.echo, hash1 );
#endif
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
if ( i == 0 )
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, hash0, hash1, size<<3 );
hamsi512_2x64_init( &ctx.hamsi );
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_2x64_close( &ctx.hamsi, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
if ( i == 0 )
{
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
else
{
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash0, size );
sph_hamsi512_close( &ctx.hamsi, hash0 );
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash1, size );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
#endif
break;
case FUGUE:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#else
if ( i == 0 )
{
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash1 );
}
else
{
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#endif
break;
case SHABAL:
if ( i == 0 )
{
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash1 );
}
else
{
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash0, size );
sph_shabal512_close( &ctx.shabal, hash0 );
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash1, size );
sph_shabal512_close( &ctx.shabal, hash1 );
}
break;
case WHIRLPOOL:
if ( i == 0 )
{
sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
}
else
{
sph_whirlpool512_full( &ctx.whirlpool, hash0, hash0, size );
sph_whirlpool512_full( &ctx.whirlpool, hash1, hash1, size );
}
break;
case SHA_512:
sha512_2x64_init( &ctx.sha512 );
if ( i == 0 )
sha512_2x64_update( &ctx.sha512, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
sha512_2x64_init( &ctx.sha512 );
sha512_2x64_update( &ctx.sha512, vhash, size );
}
sha512_2x64_close( &ctx.sha512, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
}
if ( work_restart[thrid].restart ) return 0;
size = 64;
}
memcpy( output, hash0, 64 );
memcpy( output+64, hash1, 64 );
return 1;
}
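x16r_2x64_hash_generic above moves between interleaved and linear layouts with intrlv_2x64/dintrlv_2x64. The intended semantics, shown as a plain-C sketch (the real implementations in this codebase are vectorized; this loop is only illustrative):
/* 2x64 interleave sketch: 64-bit words of two independent messages are
   packed alternately so one 128-bit SIMD op advances both lanes at once.
   bit_len is the per-lane length in bits (640 for an 80-byte header). */
static void intrlv_2x64_sketch( uint64_t *v, const uint64_t *a,
                                const uint64_t *b, int bit_len )
{
   for ( int i = 0; i < bit_len / 64; i++ )
   {
      v[2*i]   = a[i];     /* lane 0 */
      v[2*i+1] = b[i];     /* lane 1 */
   }
}
/* dintrlv_2x64 is the inverse, splitting the lane results back out. */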
int x16r_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*2] __attribute__ ((aligned (64)));
if ( !x16r_2x64_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
return 1;
}
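The per-round dispatch decodes one hex character of x16r_hash_order into an index of the x16r_Algo enum: '0'-'9' map to BLAKE(0) through SIMD(9), and 'A'-'F' to ECHO(10) through SHA_512(15). For example, an order string beginning "B4F..." runs HAMSI, then KECCAK, then SHA_512. A hypothetical helper making the inlined expression explicit:
/* Hypothetical helper -- the miner inlines this expression instead. */
static inline uint8_t x16r_decode_elem( char elem )
{
   return elem >= 'A' ? elem - 'A' + 10   /* 'A'..'F' -> ECHO..SHA_512 */
                      : elem - '0';       /* '0'..'9' -> BLAKE..SIMD   */
}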
int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128_t *noncev = (v128_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
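A note on the nonce vector arithmetic in the scan loop above, deduced from the interleaved layout (illustrative, not a separate API):
/* v128 element 9 of vdata holds 64-bit header word 9 of both lanes,
   i.e. { nbits, nonce } with the nonce in the high 32 bits of each lane.
   v128_set32( n+1, 0, n, 0 ) seeds lane 0 with nonce n and lane 1 with
   n+1; adding the broadcast constant 0x0000000200000000 then bumps both
   nonces by 2, so lane 0 tests n, n+2, n+4, ... and lane 1 tests
   n+1, n+3, n+5, ... until last_nonce. */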
#endif

View File

@@ -5,18 +5,21 @@ __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };
void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL;
#if defined (X16R_8WAY)
#if defined(X16R_8WAY)
__thread x16r_8way_context_overlay x16r_ctx;
#elif defined (X16R_4WAY)
#elif defined(X16R_4WAY)
__thread x16r_4way_context_overlay x16r_ctx;
#elif defined(X16R_2WAY)
__thread x16r_2x64_context_overlay x16r_ctx;
#endif
__thread x16r_context_overlay x16_ctx;
__thread x16r_context_overlay x16r_ref_ctx;
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
{
@@ -52,17 +55,21 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
bool register_x16r_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16R_2WAY)
gate->scanhash = (void*)&scanhash_x16r_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -70,17 +77,21 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16rv2_algo( algo_gate_t* gate )
{
#if defined (X16RV2_8WAY)
#if defined(X16RV2_8WAY)
gate->scanhash = (void*)&scanhash_x16rv2_8way;
gate->hash = (void*)&x16rv2_8way_hash;
#elif defined (X16RV2_4WAY)
#elif defined(X16RV2_4WAY)
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->hash = (void*)&x16rv2_4way_hash;
#elif defined(X16RV2_2WAY)
gate->scanhash = (void*)&scanhash_x16rv2_2x64;
gate->hash = (void*)&x16rv2_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rv2;
gate->hash = (void*)&x16rv2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -88,17 +99,21 @@ bool register_x16rv2_algo( algo_gate_t* gate )
bool register_x16s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16R_2WAY)
gate->scanhash = (void*)&scanhash_x16r_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -108,7 +123,6 @@ bool register_x16s_algo( algo_gate_t* gate )
//
// X16RT
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
{
int32_t maskedTime = timeStamp & 0xffffff80;
@@ -221,34 +235,42 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16RT_2WAY)
gate->scanhash = (void*)&scanhash_x16rt_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 256.0;
return true;
};
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X16RT_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
#elif defined(X16RT_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#elif defined(X16RT_2WAY)
gate->scanhash = (void*)&scanhash_x16rt_2x64;
gate->hash = (void*)&x16r_2x64_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
gate->build_extraheader = (void*)&veil_build_extraheader;
opt_target_factor = 256.0;
return true;
@@ -262,7 +284,7 @@ bool register_hex_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_hex;
gate->hash = (void*)&x16r_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 128.0;
return true;
@@ -274,20 +296,25 @@ bool register_hex_algo( algo_gate_t* gate )
bool register_x21s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
#if defined(X21S_8WAY)
gate->scanhash = (void*)&scanhash_x21s_8way;
gate->hash = (void*)&x21s_8way_hash;
gate->miner_thread_init = (void*)&x21s_8way_thread_init;
#elif defined (X16R_4WAY)
#elif defined(X21S_4WAY)
gate->scanhash = (void*)&scanhash_x21s_4way;
gate->hash = (void*)&x21s_4way_hash;
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
#elif defined(X21S_2WAY)
gate->scanhash = (void*)&scanhash_x21s_2x64;
gate->hash = (void*)&x21s_2x64_hash;
gate->miner_thread_init = (void*)&x21s_2x64_thread_init;
#else
gate->scanhash = (void*)&scanhash_x21s;
gate->hash = (void*)&x21s_hash;
gate->miner_thread_init = (void*)&x21s_thread_init;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;

View File

@@ -7,13 +7,15 @@
#include <unistd.h>
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
@@ -21,13 +23,13 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha512-hash.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/fugue/fugue-aesni.h"
#endif
#if defined (__AVX2__)
//#if defined (__AVX2__)
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
@@ -39,7 +41,7 @@
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#endif
//#endif
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
@@ -48,28 +50,41 @@
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
// X16R, X16S
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16R_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#define X16RV2_8WAY 1
#define X16RT_8WAY 1
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RV2_4WAY 1
#define X16RT_4WAY 1
#define X21S_4WAY 1
#define X16R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16RV2_2WAY 1
#endif
// X16RT, VEIL
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16RT_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RT_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X16RT_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X21S_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X21S_2WAY 1
#endif
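These gates select one implementation tier per algorithm at compile time; a rough summary of the mapping encoded by the macros above (illustrative):
/* Compile-time dispatch, per algorithm:
     AVX512 F/VL/DQ/BW   -> *_8WAY  (8 lanes of 64-bit SIMD)
     AVX2 + AES-NI       -> *_4WAY
     SSE2 or ARM NEON    -> *_2WAY
     anything else       -> scalar reference path */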
enum x16r_Algo {
BLAKE = 0,
BMW,
@@ -134,18 +149,23 @@ union _x16r_8way_context_overlay
hashState_echo echo;
#endif
} __attribute__ ((aligned (64)));
#define _x16r_8x64_context_overlay _x16r_8way_context_overlay
typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
#define x16r_8x64_context_overlay x16r_8way_context_overlay
extern __thread x16r_8way_context_overlay x16r_ctx;
void x16r_8way_prehash( void *, void * );
int x16r_8way_hash_generic( void *, const void *, int );
void x16r_8way_prehash( void *, void *, const char * );
int x16r_8way_hash_generic( void *, const void *, int, const char*, const int );
int x16r_8way_hash( void *, const void *, int );
int scanhash_x16r_8way( struct work *, uint32_t ,
uint64_t *, struct thr_info * );
extern __thread x16r_8way_context_overlay x16r_ctx;
#define x16r_8x64_prehash x16r_8way_prehash
#define x16r_8x64_hash_generic x16r_8way_hash_generic
#define x16r_8x64_hash x16r_8way_hash
#define scanhash_x16r_8x64 scanhash_x16r_8way
#elif defined(X16R_4WAY)
@@ -167,7 +187,6 @@ union _x16r_4way_context_overlay
keccak512_4way_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
hashState_luffa luffa1;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
@@ -175,46 +194,102 @@ union _x16r_4way_context_overlay
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
} __attribute__ ((aligned (64)));
#define _x16r_4x64_context_overlay _x16r_4way_context_overlay
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
#define x16r_4x64_context_overlay x16r_4way_context_overlay
extern __thread x16r_4way_context_overlay x16r_ctx;
void x16r_4way_prehash( void *, void * );
int x16r_4way_hash_generic( void *, const void *, int );
void x16r_4way_prehash( void *, void *, const char * );
int x16r_4way_hash_generic( void *, const void *, int, const char*, const int );
int x16r_4way_hash( void *, const void *, int );
int scanhash_x16r_4way( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_4way_context_overlay x16r_ctx;
#define x16r_4x64_prehash x16r_4way_prehash
#define x16r_4x64_hash_generic x16r_4way_hash_generic
#define x16r_4x64_hash x16r_4way_hash
#define scanhash_x16r_4x64 scanhash_x16r_4way
#elif defined(X16R_2WAY)
union _x16r_2x64_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
skein512_2x64_context skein;
jh512_2x64_context jh;
keccak512_2x64_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sha512_2x64_context sha512;
} __attribute__ ((aligned (64)));
typedef union _x16r_2x64_context_overlay x16r_2x64_context_overlay;
void x16r_2x64_prehash( void *, void *, const char * );
int x16r_2x64_hash_generic( void *, const void *, int, const char*, const int );
int x16r_2x64_hash( void *, const void *, int );
int scanhash_x16r_2x64( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_2x64_context_overlay x16r_ctx;
#endif
// need a reference, add hooks for SSE2.
// needed for hex
union _x16r_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
hashState_sd simd;
sph_echo512_context echo;
#endif
sph_hamsi512_context hamsi;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;
@@ -222,10 +297,10 @@ union _x16r_context_overlay
typedef union _x16r_context_overlay x16r_context_overlay;
extern __thread x16r_context_overlay x16_ctx;
extern __thread x16r_context_overlay x16r_ref_ctx;
void x16r_prehash( void *, void * );
int x16r_hash_generic( void *, const void *, int );
void x16r_prehash( void *, void *, const char * );
int x16r_hash_generic( void *, const void *, int, const char*, const int );
int x16r_hash( void *, const void *, int );
int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * );
@@ -242,6 +317,12 @@ int x16rv2_4way_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RV2_2WAY)
int x16rv2_2x64_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
int x16rv2_hash( void *state, const void *input, int thr_id );
@@ -251,18 +332,24 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
#endif
// x16rt, veil
#if defined(X16R_8WAY)
#if defined(X16RT_8WAY)
//void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16R_4WAY)
#elif defined(X16RT_4WAY)
//void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RT_2WAY)
//void x16rt_2x64_hash( void *state, const void *input );
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
//void x16rt_hash( void *state, const void *input );
@@ -272,20 +359,27 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
#endif
// x21s
#if defined(X16R_8WAY)
#if defined(X21S_8WAY)
int x21s_8way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_8way_thread_init();
#elif defined(X16R_4WAY)
#elif defined(X21S_4WAY)
int x21s_4way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_4way_thread_init();
#elif defined(X21S_2WAY)
int x21s_2x64_hash( void *state, const void *input, int thrid );
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_2x64_thread_init();
#else
int x21s_hash( void *state, const void *input, int thr_id );

View File

@@ -10,55 +10,60 @@
#include <stdlib.h>
#include <string.h>
void x16r_prehash( void *edata, void *pdata )
void x16r_prehash( void *edata, void *pdata, const char *hash_order )
{
const char elem = x16r_hash_order[0];
const char elem = hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
sph_jh512_init( &x16_ctx.jh );
sph_jh512( &x16_ctx.jh, edata, 64 );
sph_jh512_init( &x16r_ref_ctx.jh );
sph_jh512( &x16r_ref_ctx.jh, edata, 64 );
break;
case SKEIN:
sph_skein512_init( &x16_ctx.skein );
sph_skein512( &x16_ctx.skein, edata, 64 );
sph_skein512_init( &x16r_ref_ctx.skein );
sph_skein512( &x16r_ref_ctx.skein, edata, 64 );
break;
case KECCAK:
sph_keccak512_init( &x16r_ref_ctx.keccak );
sph_keccak512( &x16r_ref_ctx.keccak, edata, 72 );
break;
case LUFFA:
init_luffa( &x16_ctx.luffa, 512 );
update_luffa( &x16_ctx.luffa, edata, 64 );
init_luffa( &x16r_ref_ctx.luffa, 512 );
update_luffa( &x16r_ref_ctx.luffa, edata, 64 );
break;
case CUBEHASH:
cubehashInit( &x16_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16_ctx.cube, edata, 64 );
cubehashInit( &x16r_ref_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16r_ref_ctx.cube, edata, 64 );
break;
case HAMSI:
sph_hamsi512_init( &x16_ctx.hamsi );
sph_hamsi512( &x16_ctx.hamsi, edata, 64 );
break;
sph_hamsi512_init( &x16r_ref_ctx.hamsi );
sph_hamsi512( &x16r_ref_ctx.hamsi, edata, 72 );
break;
case SHABAL:
sph_shabal512_init( &x16_ctx.shabal );
sph_shabal512( &x16_ctx.shabal, edata, 64 );
sph_shabal512_init( &x16r_ref_ctx.shabal );
sph_shabal512( &x16r_ref_ctx.shabal, edata, 64 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &x16_ctx.whirlpool );
sph_whirlpool( &x16_ctx.whirlpool, edata, 64 );
sph_whirlpool_init( &x16r_ref_ctx.whirlpool );
sph_whirlpool( &x16r_ref_ctx.whirlpool, edata, 64 );
break;
}
}
int x16r_hash_generic( void* output, const void* input, int thrid )
int x16r_hash_generic( void* output, const void* input, int thrid,
const char *hash_order, const int func_count )
{
uint32_t _ALIGN(128) hash[16];
uint32_t _ALIGN(32) hash[16];
x16r_context_overlay ctx;
memcpy( &ctx, &x16_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ref_ctx, sizeof(ctx) );
void *in = (void*) input;
int size = 80;
for ( int i = 0; i < 16; i++ )
for ( int i = 0; i < func_count; i++ )
{
const char elem = x16r_hash_order[i];
const char elem = hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
@@ -70,36 +75,41 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case BMW:
sph_bmw512_init( &ctx.bmw );
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
sph_bmw512( &ctx.bmw, in, size );
sph_bmw512_close( &ctx.bmw, hash );
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash, in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
sph_groestl512_close( &ctx.groestl, hash );
#endif
break;
case JH:
if ( i == 0 )
sph_jh512(&ctx.jh, in+64, 16 );
sph_jh512( &ctx.jh, in+64, 16 );
else
{
sph_jh512_init( &ctx.jh );
sph_jh512(&ctx.jh, in, size );
sph_jh512( &ctx.jh, in, size );
}
sph_jh512_close(&ctx.jh, hash );
sph_jh512_close( &ctx.jh, hash );
break;
case KECCAK:
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
if ( i == 0 )
sph_keccak512( &ctx.keccak, in+72, 8 );
else
{
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
}
sph_keccak512_close( &ctx.keccak, hash );
break;
case SKEIN:
if ( i == 0 )
sph_skein512(&ctx.skein, in+64, 16 );
sph_skein512( &ctx.skein, in+64, 16 );
else
{
sph_skein512_init( &ctx.skein );
@@ -109,13 +119,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case LUFFA:
if ( i == 0 )
update_and_final_luffa( &ctx.luffa, hash, (const void*)in+64, 16 );
update_and_final_luffa( &ctx.luffa, hash, in+64, 16 );
else
luffa_full( &ctx.luffa, hash, 512, in, size );
break;
case CUBEHASH:
if ( i == 0 )
cubehashUpdateDigest( &ctx.cube, hash, (const void*)in+64, 16 );
cubehashUpdateDigest( &ctx.cube, hash, in+64, 16 );
else
cubehash_full( &ctx.cube, hash, 512, in, size );
break;
@@ -123,19 +133,13 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
simd512_ctx( &ctx.simd, hash, in, size );
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence*)hash, 512,
(const BitSequence*)in, size );
echo_full( &ctx.echo, hash, 512, in, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
@@ -144,7 +148,7 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
sph_hamsi512( &ctx.hamsi, in+64, 16 );
sph_hamsi512( &ctx.hamsi, in+72, 8 );
else
{
sph_hamsi512_init( &ctx.hamsi );
@@ -153,12 +157,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
#if defined(__AES__)
fugue512_full( &ctx.fugue, hash, in, size );
#else
sph_fugue512_full( &ctx.fugue, hash, in, size );
#endif
break;
sph_fugue512_full( &ctx.fugue, hash, in, size );
break;
case SHABAL:
if ( i == 0 )
sph_shabal512( &ctx.shabal, in+64, 16 );
@@ -197,7 +197,8 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
int x16r_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64] __attribute__ ((aligned (64)));
if ( !x16r_hash_generic( hash, input, thrid ) )
if ( !x16r_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
@@ -207,8 +208,8 @@ int x16r_hash( void* output, const void* input, int thrid )
int scanhash_x16r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) edata[20];
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -230,7 +231,7 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_prehash( edata, pdata );
x16r_prehash( edata, pdata, x16r_hash_order );
do
{

View File

@@ -3,7 +3,7 @@
#include <stdlib.h>
#include <string.h>
#if defined (X16R_8WAY)
#if defined (X16RT_8WAY)
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
@@ -30,12 +30,12 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x",
x16r_hash_order, bswap_32( pdata[17] ) );
}
x16r_8way_prehash( vdata, pdata );
x16r_8way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
@@ -57,7 +57,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16R_4WAY)
#elif defined (X16RT_4WAY)
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
@@ -84,12 +84,12 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x",
x16r_hash_order, bswap_32( pdata[17] ) );
}
x16r_4way_prehash( vdata, pdata );
x16r_4way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
@@ -110,4 +110,55 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RT_2WAY)
int scanhash_x16rt_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[2*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) timeHash[4*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x",
x16r_hash_order, bswap_32( pdata[17] ) );
}
x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( ( n < last_nonce ) && !(*restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
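Unlike x16r, which derives the function order from the previous block hash, x16rt masks the low 7 bits of ntime before hashing it, so the order only changes every 128 seconds. For example (timestamp values illustrative):
/* masked_ntime = bswap_32( pdata[17] ) & 0xffffff80
     ntime 0x65f1c3a5 -> masked 0x65f1c380
     ntime 0x65f1c3ff -> masked 0x65f1c380   same 128 s window, same order
     ntime 0x65f1c400 -> masked 0x65f1c400   new window, order recomputed */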
#endif

View File

@@ -1,6 +1,6 @@
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X16RT_8WAY) && !defined(X16RT_4WAY) && !defined(X16RT_2WAY)
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -31,7 +31,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
x16r_hash_order, swab32( pdata[17] ), timeHash );
}
x16r_prehash( edata, pdata );
x16r_prehash( edata, pdata, x16r_hash_order );
do
{

View File

@@ -395,7 +395,7 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -409,14 +409,43 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
hash7, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
fugue512_full( &ctx.fugue, hash4, in4, size );
fugue512_full( &ctx.fugue, hash5, in5, size );
fugue512_full( &ctx.fugue, hash6, in6, size );
fugue512_full( &ctx.fugue, hash7, in7, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in4 + 76, 4 );
fugue512_final( &ctx.fugue, hash4 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in5 + 76, 4 );
fugue512_final( &ctx.fugue, hash5 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in6 + 76, 4 );
fugue512_final( &ctx.fugue, hash6 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in7 + 76, 4 );
fugue512_final( &ctx.fugue, hash7 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
fugue512_full( &ctx.fugue, hash2, hash2, size );
fugue512_full( &ctx.fugue, hash3, hash3, size );
fugue512_full( &ctx.fugue, hash4, hash4, size );
fugue512_full( &ctx.fugue, hash5, hash5, size );
fugue512_full( &ctx.fugue, hash6, hash6, size );
fugue512_full( &ctx.fugue, hash7, hash7, size );
}
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -564,7 +593,6 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -577,19 +605,15 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
@@ -626,7 +650,14 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
@@ -699,7 +730,7 @@ typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
static __thread x16rv2_4way_context_overlay x16rv2_ctx;
// Pad the 24-byte Tiger hash to 64 bytes
inline void padtiger512( uint32_t* hash )
static inline void padtiger512( uint32_t* hash )
{
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
}
@@ -824,8 +855,8 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, size );
skein512_4way_close( &ctx.skein, vhash );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case LUFFA:
@@ -945,7 +976,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -956,10 +987,27 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
fugue512_full( &ctx.fugue, hash2, hash2, size );
fugue512_full( &ctx.fugue, hash3, hash3, size );
}
break;
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
@@ -1055,7 +1103,6 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20];
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -1068,17 +1115,15 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0fff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
@@ -1101,7 +1146,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
@@ -1112,7 +1157,13 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_intrlv80_4x32( vdata32, pdata );
@@ -1151,4 +1202,450 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X16RV2_2WAY)
union _x16rv2_2x64_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
skein512_2x64_context skein;
jh512_2x64_context jh;
keccak512_2x64_context keccak;
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sha512_2x64_context sha512;
sph_tiger_context tiger;
} __attribute__ ((aligned (64)));
typedef union _x16rv2_2x64_context_overlay x16rv2_2x64_context_overlay;
static __thread x16rv2_2x64_context_overlay x16rv2_ctx;
// Pad the 24-byte Tiger hash to 64 bytes
static inline void padtiger512( uint32_t* hash )
{
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
}
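x16rv2 differs from x16r by interposing Tiger in front of selected functions (the KECCAK and LUFFA cases below show it inline); Tiger's 192-bit digest is zero-extended so the next function always consumes a full 64-byte message. The pattern in isolation, as a sketch using the sph_tiger API plus padtiger512 above (ctx/in/size as in the surrounding code):
/* Tiger interposition sketch (illustrative). */
uint32_t thash[16];
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in, size );
sph_tiger_close( &ctx.tiger, thash );   /* 24-byte digest in thash[0..5] */
padtiger512( thash );                   /* zero thash[6..15] -> 64 bytes */
/* keccak512 / luffa then hash the padded 64 bytes instead of the raw input */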
int x16rv2_2x64_hash( void* output, const void* input, int thrid )
{
uint32_t vhash[20*2] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
x16rv2_2x64_context_overlay ctx;
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
int size = 80;
dintrlv_2x64( hash0, hash1, input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = x16r_hash_order[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
if ( i == 0 )
blake512_2x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
blake512_2x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case BMW:
bmw512_2x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_2x64_update( &ctx.bmw, input, size );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
bmw512_2x64_update( &ctx.bmw, vhash, size );
}
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case GROESTL:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, in0, size<<3 );
groestl512_full( &ctx.groestl, hash1, in1, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in0, size );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in1, size );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
break;
case JH:
if ( i == 0 )
jh512_2x64_update( &ctx.jh, input + (64<<1), 16 );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
jh512_2x64_init( &ctx.jh );
jh512_2x64_update( &ctx.jh, vhash, size );
}
jh512_2x64_close( &ctx.jh, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case KECCAK:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
intrlv_2x64( vhash, hash0, hash1, 512 );
keccak512_2x64_init( &ctx.keccak );
keccak512_2x64_update( &ctx.keccak, vhash, 64 );
keccak512_2x64_close( &ctx.keccak, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_2x64_final16( &ctx.skein, vhash, input + (64*2) );
else
{
intrlv_2x64( vhash, in0, in1, size<<3 );
skein512_2x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
case LUFFA:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
break;
case CUBEHASH:
if ( i == 0 )
{
cubehashUpdateDigest( &ctx.cube, hash0, in0 + 64, 16 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
cubehashUpdateDigest( &ctx.cube, hash1, in1 + 64, 16 );
}
else
{
cubehash_full( &ctx.cube, hash0, 512, hash0, size );
cubehash_full( &ctx.cube, hash1, 512, hash1, size );
}
break;
case SHAVITE:
shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size );
break;
case SIMD:
simd512_ctx( &ctx.simd, hash0, in0, size );
simd512_ctx( &ctx.simd, hash1, in1, size );
break;
case ECHO:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, in0, size );
echo_full( &ctx.echo, hash1, 512, in1, size );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in0, size );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in1, size );
sph_echo512_close( &ctx.echo, hash1 );
#endif
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
if ( i == 0 )
hamsi512_2x64_update( &ctx.hamsi, input + (72*2), 8 );
else
{
intrlv_2x64( vhash, hash0, hash1, size<<3 );
hamsi512_2x64_init( &ctx.hamsi );
hamsi512_2x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_2x64_close( &ctx.hamsi, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
if ( i == 0 )
{
sph_hamsi512( &ctx.hamsi, in0 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_hamsi512( &ctx.hamsi, in1 + 72, 8 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
else
{
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash0, size );
sph_hamsi512_close( &ctx.hamsi, hash0 );
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash1, size );
sph_hamsi512_close( &ctx.hamsi, hash1 );
}
#endif
break;
case FUGUE:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
}
else
{
fugue512_full( &ctx.fugue, hash0, hash0, size );
fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#else
if ( i == 0 )
{
sph_fugue512( &ctx.fugue, in0 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, in1 + 76, 4 );
sph_fugue512_close( &ctx.fugue, hash1 );
}
else
{
sph_fugue512_full( &ctx.fugue, hash0, hash0, size );
sph_fugue512_full( &ctx.fugue, hash1, hash1, size );
}
#endif
break;
case SHABAL:
if ( i == 0 )
{
sph_shabal512( &ctx.shabal, in0 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_shabal512( &ctx.shabal, in1 + 64, 16 );
sph_shabal512_close( &ctx.shabal, hash1 );
}
else
{
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash0, size );
sph_shabal512_close( &ctx.shabal, hash0 );
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hash1, size );
sph_shabal512_close( &ctx.shabal, hash1 );
}
break;
case WHIRLPOOL:
sph_whirlpool512_full( &ctx.whirlpool, hash0, in0, size );
sph_whirlpool512_full( &ctx.whirlpool, hash1, in1, size );
break;
case SHA_512:
if ( i == 0 )
{
sph_tiger( &ctx.tiger, in0 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash0 );
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
sph_tiger( &ctx.tiger, in1 + 64, 16 );
sph_tiger_close( &ctx.tiger, hash1 );
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
}
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
intrlv_2x64( vhash, hash0, hash1, 512 );
sha512_2x64_init( &ctx.sha512 );
sha512_2x64_update( &ctx.sha512, vhash, 64 );
sha512_2x64_close( &ctx.sha512, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
break;
}
if ( work_restart[thrid].restart ) return 0;
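// Every round after the first hashes the previous round's fixed
// 64-byte digest instead of the 80-byte block header.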
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
return 1;
}
int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[2*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0fff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
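// The first 64 bytes of the 80-byte header are constant for a given
// job, so their midstate is computed once here and only the tail is
// absorbed per nonce: 16 bytes for the 64-byte prehashes, 8 bytes for
// Hamsi (72-byte prehash) and 4 bytes for Fugue (76-byte prehash).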
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
v128_bswap32_intrlv80_2x64( vdata, pdata );
jh512_2x64_init( &x16rv2_ctx.jh );
jh512_2x64_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
case SHA_512:
v128_bswap32_80( edata, pdata );
sph_tiger_init( &x16rv2_ctx.tiger );
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SKEIN:
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16rv2_ctx.cube, edata, 64 );
intrlv_2x64( vdata, edata, edata, 640 );
break;
case HAMSI:
#if defined(__SSE4_2__) || defined(__ARM_NEON)
v128_bswap32_intrlv80_2x64( vdata, pdata );
hamsi512_2x64_init( &x16rv2_ctx.hamsi );
hamsi512_2x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
#else
v128_bswap32_80( edata, pdata );
sph_hamsi512_init( &x16rv2_ctx.hamsi );
sph_hamsi512( &x16rv2_ctx.hamsi, edata, 72 );
intrlv_2x64( vdata, edata, edata, 640 );
#endif
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
#else
sph_fugue512_init( &x16rv2_ctx.fugue );
sph_fugue512( &x16rv2_ctx.fugue, edata, 76 );
#endif
intrlv_2x64( vdata, edata, edata, 640 );
break;
case SHABAL:
v128_bswap32_80( edata, pdata );
sph_shabal512_init( &x16rv2_ctx.shabal );
sph_shabal512( &x16rv2_ctx.shabal, edata, 64);
intrlv_2x64( vdata, edata, edata, 640 );
break;
default:
v128_bswap32_intrlv80_2x64( vdata, pdata );
}
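// vdata is 2x64 interleaved: 64-bit word 9 of each lane holds header
// words 18 (nBits) and 19 (nonce). The blend below writes nonce n into
// lane 0 and n+1 into lane 1 while leaving word 18 untouched.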
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16rv2_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

@@ -6,21 +6,15 @@
*/
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X16RV2_8WAY) && !defined(X16RV2_4WAY) && !defined(X16RV2_2WAY)
#include "algo/tiger/sph_tiger.h"
union _x16rv2_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
@@ -29,11 +23,7 @@ union _x16rv2_context_overlay
hashState_luffa luffa;
cubehashParam cube;
shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -43,7 +33,7 @@ union _x16rv2_context_overlay
typedef union _x16rv2_context_overlay x16rv2_context_overlay;
// Pad the 24 bytes tiger hash to 64 bytes
inline void padtiger512(uint32_t* hash) {
static inline void padtiger512(uint32_t* hash) {
for (int i = (24/4); i < (64/4); i++) hash[i] = 0;
}
@@ -72,15 +62,9 @@ int x16rv2_hash( void* output, const void* input, int thrid )
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init( &ctx.skein );
@@ -117,25 +101,14 @@ int x16rv2_hash( void* output, const void* input, int thrid )
shavite512_full( &ctx.shavite, hash, in, size );
break;
case SIMD:
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512(&ctx.simd, hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#endif
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
sph_hamsi512_init( &ctx.hamsi );
@@ -143,11 +116,7 @@ int x16rv2_hash( void* output, const void* input, int thrid )
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
#if defined(__AES__)
fugue512_full( &ctx.fugue, hash, in, size );
#else
sph_fugue512_full( &ctx.fugue, hash, in, size );
#endif
break;
case SHABAL:
sph_shabal512_init( &ctx.shabal );

algo/x16/x20r.c (new file, +362 lines)

@@ -0,0 +1,362 @@
#include "miner.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "x16r-gate.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X20R_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X20R_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X20R_2WAY 1
#endif
// X20R is not what it seems. It does not permute 20 functions over 20
// rounds; it only permutes 16 of them. The last 4 functions are victims
// of trying to fit 20 elements in the space for only 16: a single hex
// digit can only select one of 16 functions, so the last 4 are never
// chosen. Otherwise it's identical to X16R.
// Welcome to the real X20R.
#define X20R_HASH_FUNC_COUNT 20
/*
enum x20r_algo
{
BLAKE = 0,
BMW,
GROESTL,
JH,
KECCAK,
SKEIN,
LUFFA,
CUBEHASH,
SHAVITE,
SIMD,
ECHO,
HAMSI,
FUGUE,
SHABAL,
WHIRLPOOL,
SHA512,
HAVAL, // Last 4 names are meaningless and not used
GOST,
RADIOGATUN,
PANAMA,
X20R_HASH_FUNC_COUNT
};
*/
static __thread char x20r_hash_order[ X20R_HASH_FUNC_COUNT + 1 ] = {0};
static void x20r_getAlgoString(const uint8_t* prevblock, char *output)
{
char *sptr = output;
for (int j = 0; j < X20R_HASH_FUNC_COUNT; j++) {
uint8_t b = (19 - j) >> 1; // 20 ascii hex chars from 10 bytes, reversed
uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
}
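// Worked example (hypothetical prevhash bytes, for illustration only):
//
//   uint8_t prev[10] = { 0x01, 0x23, 0x45, 0x67, 0x89,
//                        0xAB, 0xCD, 0xEF, 0x01, 0x23 };
//   char order[ X20R_HASH_FUNC_COUNT + 1 ];
//   x20r_getAlgoString( prev, order );  // order == "2301EFCDAB8967452301"
//
// The loop walks bytes 9 down to 0, high nibble first; every digit is
// 0..F, so the last four slots (16-19) can never be selected.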
#if defined(X20R_8WAY)
int x20r_8x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*8] __attribute__ ((aligned (128)));
if ( !x16r_8x64_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
memcpy( output+64, hash+128, 32 );
memcpy( output+96, hash+192, 32 );
memcpy( output+128, hash+256, 32 );
memcpy( output+160, hash+320, 32 );
memcpy( output+192, hash+384, 32 );
memcpy( output+224, hash+448, 32 );
return 1;
}
int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
vdata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_8x64_prehash( vdata, pdata, x20r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_8x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X20R_4WAY)
int x20r_4x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*4] __attribute__ ((aligned (64)));
if ( !x16r_4x64_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
memcpy( output+64, hash+128, 32 );
memcpy( output+96, hash+192, 32 );
return 1;
}
int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
vdata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_4x64_prehash( vdata, pdata, x20r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_4x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X20R_2WAY)
int x20r_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*2] __attribute__ ((aligned (64)));
if ( !x16r_2x64_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
memcpy( output+32, hash+64, 32 );
return 1;
}
int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128_t *noncev = (v128_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
vdata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&vdata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_2x64_prehash( vdata, pdata, x20r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#else
int x20r_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64] __attribute__ ((aligned (64)));
if ( !x16r_hash_generic( hash, input, thrid, x20r_hash_order,
X20R_HASH_FUNC_COUNT ) )
return 0;
memcpy( output, hash, 32 );
return 1;
}
int scanhash_x20r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int thr_id = mythr->id;
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &( work_restart[thr_id].restart );
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
edata[1] = bswap_32( pdata[1] );
edata[2] = bswap_32( pdata[2] );
edata[3] = bswap_32( pdata[3] );
saved_height = work->height;
x20r_getAlgoString( (const uint8_t*)(&edata[1]), x20r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x20r_hash_order );
}
x16r_prehash( edata, pdata, x20r_hash_order );
do
{
edata[19] = nonce;
if ( x20r_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( nonce );
submit_solution( work, hash32, mythr );
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
#endif
bool register_x20r_algo( algo_gate_t* gate )
{
#if defined (X20R_8WAY)
gate->scanhash = (void*)&scanhash_x20r_8x64;
#elif defined (X20R_4WAY)
gate->scanhash = (void*)&scanhash_x20r_4x64;
#elif defined (X20R_2WAY)
gate->scanhash = (void*)&scanhash_x20r_2x64;
#else
gate->scanhash = (void*)&scanhash_x20r;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 256.0;
return true;
}

@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include "algo/haval/haval-hash-4way.h"
#include "algo/haval/sph-haval.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
@@ -42,7 +43,8 @@ int x21s_8way_hash( void* output, const void* input, int thrid )
uint32_t *hash7 = (uint32_t*)( shash+448 );
x21s_8way_context_overlay ctx;
if ( !x16r_8way_hash_generic( shash, input, thrid ) )
if ( !x16r_8way_hash_generic( shash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
@@ -134,7 +136,6 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &hash[7<<3];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -148,20 +149,18 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_8way_prehash( vdata, pdata );
x16r_8way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
@@ -223,7 +222,8 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
uint32_t *hash2 = (uint32_t*)( shash+128 );
uint32_t *hash3 = (uint32_t*)( shash+192 );
if ( !x16r_4way_hash_generic( shash, input, thrid ) )
if ( !x16r_4way_hash_generic( shash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
@@ -294,7 +294,6 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -307,20 +306,18 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_4way_prehash( vdata, pdata );
x16r_4way_prehash( vdata, pdata, x16r_hash_order );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
@@ -351,4 +348,117 @@ bool x21s_4way_thread_init()
return x21s_4way_matrix;
}
#elif defined (X21S_2WAY)
static __thread uint64_t* x21s_2x64_matrix;
union _x21s_2x64_context_overlay
{
sph_haval256_5_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
} __attribute__ ((aligned (64)));
typedef union _x21s_2x64_context_overlay x21s_2x64_context_overlay;
int x21s_2x64_hash( void* output, const void* input, int thrid )
{
uint8_t shash[64*2] __attribute__ ((aligned (64)));
x21s_2x64_context_overlay ctx;
uint32_t *hash0 = (uint32_t*) shash;
uint32_t *hash1 = (uint32_t*)( shash+64 );
if ( !x16r_2x64_hash_generic( shash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hash0, 64 );
sph_haval256_5_close( &ctx.haval, hash0 );
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hash1, 64 );
sph_haval256_5_close( &ctx.haval, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash1 );
LYRA2REV2( x21s_2x64_matrix, (void*) hash0, 32, (const void*) hash0, 32,
(const void*) hash0, 32, 1, 4, 4 );
LYRA2REV2( x21s_2x64_matrix, (void*) hash1, 32, (const void*) hash1, 32,
(const void*) hash1, 32, 1, 4, 4 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
sph_gost512_close( &ctx.gost, (void*) hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
sph_gost512_close( &ctx.gost, (void*) hash1 );
sha256_full( output, hash0, 64 );
sha256_full( output+32, hash1, 64 );
return 1;
}
int scanhash_x21s_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_t *noncev = (v128_t*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t saved_height = UINT32_MAX;
if ( work->height != saved_height )
{
vdata[1] = bswap_32( pdata[1] );
vdata[2] = bswap_32( pdata[2] );
saved_height = work->height;
x16_r_s_getAlgoString( (const uint8_t*)(&vdata[1]), x16r_hash_order );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s", x16r_hash_order );
}
x16r_2x64_prehash( vdata, pdata, x16r_hash_order );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x21s_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
bool x21s_2x64_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows
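// LYRA2REV2 is called with timeCost 1, nRows 4, nCols 4, so each
// per-thread scratch matrix holds 4 rows of ROW_LEN_BYTES, 64-byte
// aligned.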
x21s_2x64_matrix = mm_malloc( size, 64 );
return x21s_2x64_matrix;
}
#endif

@@ -15,7 +15,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
#if !defined(X21S_8WAY) && !defined(X21S_4WAY)
static __thread uint64_t* x21s_matrix;
@@ -33,7 +33,8 @@ int x21s_hash( void* output, const void* input, int thrid )
uint32_t _ALIGN(128) hash[16];
x21s_context_overlay ctx;
if ( !x16r_hash_generic( hash, input, thrid ) )
if ( !x16r_hash_generic( hash, input, thrid, x16r_hash_order,
X16R_HASH_FUNC_COUNT ) )
return 0;
sph_haval256_5_init( &ctx.haval );
@@ -84,7 +85,7 @@ int scanhash_x21s( struct work *work, uint32_t max_nonce,
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_prehash( edata, pdata );
x16r_prehash( edata, pdata, x16r_hash_order );
do
{

@@ -928,25 +928,24 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
#elif defined(X17_2X64)
// Need sph reference implementations as fallbacks in some build configurations
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/hamsi/sph_hamsi.h"
#if !( defined(__SSE4_2__) || defined(__ARM_NEON) )
#include "algo/hamsi/sph_hamsi.h"
#endif
#include "algo/shabal/sph_shabal.h"
#include "algo/haval/sph-haval.h"
//#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
//#endif
#include "algo/fugue/sph_fugue.h"
#include "algo/fugue/sph_fugue.h"
#endif
union _x17_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
@@ -956,7 +955,7 @@ union _x17_context_overlay
#else
sph_echo512_context echo;
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -967,12 +966,8 @@ union _x17_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__x86_64__)
simd512_context simd;
#else
sph_simd512_context simd;
#endif
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
#if defined(__SSE4_2__) || defined(__ARM_NEON)
hamsi_2x64_context hamsi;
#else
sph_hamsi512_context hamsi;
@@ -1000,7 +995,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
dintrlv_2x64( hash0, hash1, vhash, 512 );
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, hash0, 512 );
groestl512_full( &ctx.groestl, hash1, hash1, 512 );
#else
@@ -1033,17 +1028,8 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
#if defined(__x86_64__)
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
#else
sph_simd512_init( &ctx.simd );
sph_simd512( &ctx.simd, hash0, 64 );
sph_simd512_close( &ctx.simd, hash0 );
sph_simd512_init( &ctx.simd );
sph_simd512( &ctx.simd, hash1, 64 );
sph_simd512_close( &ctx.simd, hash1 );
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
@@ -1057,7 +1043,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_echo512_close( &ctx.echo, hash1 );
#endif
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
#if defined(__SSE4_2__) || defined(__ARM_NEON)
intrlv_2x64( vhash, hash0, hash1, 512 );
hamsi512_2x64_ctx( &ctx.hamsi, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
@@ -1070,7 +1056,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_hamsi512_close( &ctx.hamsi, hash1 );
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
fugue512_full( &ctx.fugue, hash1, hash1, 64 );
#else
@@ -1142,14 +1128,12 @@ int scanhash_x17_2x64( struct work *work, uint32_t max_nonce,
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,0);
pdata[19] = bswap_32( n );
// pdata[19] = n;
submit_solution( work, hash, mythr );
}
if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
{
applog(LOG_INFO,"Submitted Thread %d, lane %d",thr_id,1);
pdata[19] = bswap_32( n+1 );
submit_solution( work, hash+8, mythr );
}

@@ -4,25 +4,24 @@
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -39,14 +38,17 @@ union _x22i_context_overlay
{
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
@@ -54,11 +56,7 @@ union _x22i_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -83,10 +81,8 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_bmw512(&ctx.bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash, hash, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash, 64 );
@@ -109,26 +105,16 @@ int x22i_hash( void *output, const void *input, int thrid )
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init(&ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash, 512, hash, 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash, 64 );
@@ -141,7 +127,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_hamsi512(&ctx.hamsi, (const void*) hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hash, hash, 64 );
#else
sph_fugue512_init(&ctx.fugue);
@@ -161,7 +147,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_sha512( &ctx.sha512, &hash[128], 64 );
sph_sha512_close( &ctx.sha512, &hash[192] );
ComputeSingleSWIFFTX((unsigned char*)hash, (unsigned char*)hash2);
ComputeSingleSWIFFTX( (unsigned char*)hash, (unsigned char*)hash2 );
if ( work_restart[thrid].restart ) return 0;
@@ -176,7 +162,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_tiger_close(&ctx.tiger, (void*) hash2);
memset(hash, 0, 64);
LYRA2RE((void*) hash, 32, (const void*) hash2, 32, (const void*) hash2, 32, 1, 4, 4);
LYRA2RE( (void*)hash, 32, (const void*)hash2, 32, (const void*)hash2, 32, 1, 4, 4 );
sph_gost512_init(&ctx.gost);
sph_gost512 (&ctx.gost, (const void*) hash, 64);
@@ -192,8 +178,8 @@ int x22i_hash( void *output, const void *input, int thrid )
int scanhash_x22i( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];

@@ -4,25 +4,24 @@
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -42,14 +41,17 @@ union _x25x_context_overlay
{
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
@@ -57,11 +59,7 @@ union _x25x_context_overlay
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -88,10 +86,8 @@ int x25x_hash( void *output, const void *input, int thrid )
sph_bmw512(&ctx.bmw, (const void*) &hash[0], 64);
sph_bmw512_close(&ctx.bmw, &hash[1]);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)&hash[2],
(const char*)&hash[1], 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, (void*)&hash[2], (const void*)&hash[1], 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, &hash[1], 64 );
@@ -112,28 +108,18 @@ int x25x_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return 0;
init_luffa( &ctx.luffa, 512 );
luffa_full( &ctx.luffa, &hash[6], 512, &hash[5], 64 );
luffa_full( &ctx.luffa, (void*)&hash[6], 512, (const void*)&hash[5], 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, &hash[7], &hash[6], 64 );
cubehash_full( &ctx.cube, (void*)&hash[7], 512, (const void*)&hash[6], 64 );
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) &hash[7], 64);
sph_shavite512_close(&ctx.shavite, &hash[8]);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) &hash[8], 64);
sph_simd512_close(&ctx.simd, &hash[9] );
#else
update_final_sd( &ctx.simd, (BitSequence *)&hash[9],
(const BitSequence *)&hash[8], 512 );
#endif
simd512_ctx( &ctx.simd, (void*)&hash[9], (const void*)&hash[8], 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)&hash[10],
(const BitSequence*)&hash[9], 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, (void*)&hash[10], 512, (const void*)&hash[9], 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, &hash[9], 64 );
@@ -146,7 +132,7 @@ int x25x_hash( void *output, const void *input, int thrid )
sph_hamsi512(&ctx.hamsi, (const void*) &hash[10], 64);
sph_hamsi512_close(&ctx.hamsi, &hash[11]);
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, &hash[12], &hash[11], 64 );
#else
sph_fugue512_init(&ctx.fugue);
@@ -227,8 +213,8 @@ int x25x_hash( void *output, const void *input, int thrid )
int scanhash_x25x( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t edata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t edata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -245,7 +231,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce,
do
{
edata[19] = n;
if ( x25x_hash( hash64, edata, thr_id ) )
if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );

@@ -9,6 +9,6 @@ rm -f config.status
CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl
make -j $nproc
make -j $(nproc)
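# $nproc was an unset shell variable that expanded to nothing, leaving a
# bare "make -j" (unlimited jobs); $(nproc) is command substitution for
# the CPU count.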
strip -s cpuminer

@@ -10,33 +10,33 @@ make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-aes-sha2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto+sha2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-sha2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto+aes -Wall -flax-vector-conversions" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8-aes
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a -Wall -flax-vector-conversions" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-armv8
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=native -Wall -flax-vector-conversions" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer

@@ -13,7 +13,7 @@ rm -f config.status
CFLAGS="-O3 -march=icelake-client -Wall" ./configure --with-curl
# Rocketlake needs gcc-11
#CFLAGS="-O3 -march=rocketlake -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes
@@ -34,7 +34,7 @@ rm -f config.status
# Incomplete list of Zen4 AVX512 extensions, but includes all extensions used by cpuminer.
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-zen4
@@ -43,7 +43,7 @@ make clean || echo clean
rm -f config.status
#CFLAGS="-O3 -march=znver2 -mvaes" ./configure --with-curl
CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-zen3
@@ -51,7 +51,7 @@ mv cpuminer cpuminer-zen3
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=skylake-avx512 -maes -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx512
@@ -60,7 +60,7 @@ make clean || echo done
rm -f config.status
# vaes doesn't include aes
CFLAGS="-O3 -maes -mavx2 -msha -mvaes -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx2-sha-vaes
@@ -69,7 +69,7 @@ make clean || echo done
rm -f config.status
#CFLAGS="-O3 -march=znver1 -maes -Wall" ./configure --with-curl
CFLAGS="-O3 -maes -mavx2 -msha -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx2-sha
@@ -78,7 +78,7 @@ make clean || echo clean
rm -f config.status
# GCC 9 doesn't include AES with core-avx2
CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx2
@@ -86,7 +86,7 @@ mv cpuminer cpuminer-avx2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx
@@ -94,7 +94,7 @@ mv cpuminer cpuminer-avx
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-aes-sse42
@@ -102,7 +102,7 @@ mv cpuminer cpuminer-aes-sse42
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-sse42
@@ -110,7 +110,7 @@ mv cpuminer cpuminer-sse42
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-ssse3
@@ -118,7 +118,7 @@ mv cpuminer cpuminer-ssse3
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-sse2
@@ -126,7 +126,7 @@ mv cpuminer cpuminer-sse2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=x86-64 -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-x64
@@ -134,6 +134,6 @@ mv cpuminer cpuminer-x64
make clean || echo done
rm -f config.status
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer

@@ -10,6 +10,6 @@ rm -f config.status
CFLAGS="-O2 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl --host=aarch64-cortexa76-elf --build=x86_64-pc-linux-gnu --target=aarch64-cortexa76-elf
#CFLAGS="-O2 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer

@@ -22,6 +22,6 @@ rm -f config.status
CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j $nproc
make -j $(nproc)
strip -s cpuminer

@@ -6,5 +6,5 @@ make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
make -j 4
make -j $(nproc)
strip -s cpuminer

Some files were not shown because too many files have changed in this diff.