Compare commits

...

8 Commits

Author     SHA1         Message   Date
Jay D Dee  9d3a46c355   v23.15    2023-11-30 14:36:47 -05:00
Jay D Dee  4e3f1b926f   v23.14    2023-11-28 00:58:43 -05:00
Jay D Dee  045b42babf   v23.13    2023-11-21 14:18:15 -05:00
Jay D Dee  fc696dbbe5   v23.12    2023-11-20 11:51:57 -05:00
Jay D Dee  f3fde95f27   v23.10    2023-11-15 11:05:41 -05:00
Jay D Dee  0a78013cbe   v23.9     2023-11-12 18:48:50 -05:00
Jay D Dee  26b9429589   v23.8     2023-11-11 16:48:57 -05:00
Jay D Dee  e043698442   v23.7     2023-11-07 04:59:44 -05:00
138 changed files with 7282 additions and 18182 deletions

View File

@@ -79,11 +79,6 @@ cpuminer_SOURCES = \
algo/hamsi/hamsi-hash-4way.c \
algo/haval/haval.c \
algo/haval/haval-hash-4way.c \
algo/hodl/aes.c \
algo/hodl/hodl-gate.c \
algo/hodl/hodl-wolf.c \
algo/hodl/sha512_avx.c \
algo/hodl/sha512_avx2.c \
algo/jh/sph_jh.c \
algo/jh/jh-hash-4way.c \
algo/jh/jha-gate.c \
@@ -148,6 +143,8 @@ cpuminer_SOURCES = \
algo/scrypt/scrypt.c \
algo/scrypt/scrypt-core-4way.c \
algo/scrypt/neoscrypt.c \
algo/sha/sha1.c \
algo/sha/sha1-hash.c \
algo/sha/sha256-hash.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
@@ -156,7 +153,6 @@ cpuminer_SOURCES = \
algo/sha/hmac-sha256-hash.c \
algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \
algo/sha/sha2.c \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
@@ -254,6 +250,7 @@ cpuminer_SOURCES = \
algo/x16/x16rt.c \
algo/x16/x16rt-4way.c \
algo/x16/hex.c \
algo/x16/x20r.c \
algo/x16/x21s-4way.c \
algo/x16/x21s.c \
algo/x16/minotaur.c \
@@ -279,20 +276,10 @@ cpuminer_SOURCES = \
algo/yespower/yespower-ref.c \
algo/yespower/yespower-blake2b-ref.c
disable_flags =
if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
if ARCH_x86
cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S
endif
if ARCH_x86_64
cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S
endif
if ARCH_ARM
cpuminer_SOURCES += asm/sha2-arm.S asm/scrypt-arm.S
endif
else
disable_flags += -DNOASM
endif
@@ -302,7 +289,7 @@ if HAVE_WINDOWS
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lssl -lcrypto -lgmp
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)

View File

@@ -87,7 +87,6 @@ Supported Algorithms
groestl Groestl coin
hex x16r-hex
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
@@ -115,9 +114,11 @@ Supported Algorithms
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256dt
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
sha512256d
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -145,6 +146,7 @@ Supported Algorithms
x16rt-veil veil
x16s
x17
x20r
x21s
x22i
x25x

View File

@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
- An x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
- Arm CPU supporting AArch64 and NEON.
64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.
32 bit CPUs are not supported.
ARM requirements (Beta):
Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.
CPU: Armv8 and NEON, SHA2 & AES are optional
OS: Linux distribution built for AArch64.
Packages: source code only.
Mining on mobile devices that meet the requirements is not recommended due to the risk of
overheating and damaging the battery. Mining has unlimited demand; it will push any device
to or beyond its limits. There is also a fire risk with overheated lithium batteries.
Beware of apps claiming "mobile only mining". There is no such thing; they aren't miners.
If a mobile CPU can mine it, any CPU can.
See wiki for details.
@@ -73,6 +75,63 @@ If not what makes it happen or not happen?
Change Log
----------
v23.15
Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
ARM: Fugue AES optimizations enabled.
ARM: quark, qubit, x11gost algos optimized with NEON & AES.
v23.14
ARM: Groestl AES optimizations enabled.
All: Small optimization to Shabal 4way.
x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
All: deleted some unused files.
v23.13
Added x20r algo.
Eliminated redundant hash order calculations for x16r family.
v23.12
Several bug fixes and speed improvements for the x16r family for all CPU architectures.
v23.11
This is a release candidate for full AArch64 support, marking the end of the Beta phase.
Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.
v23.10
x86_64: Fixed scrypt, scryptn2 algos SSE2.
Fixed sha512256d algo AVX2, SSE2, NEON.
Fixed a bug in Skein N-way that reduced performance.
ARM: Skein optimized for NEON, SHA2 & SSE2.
Skein2 algo 2-way optimized for NEON & SSE2.
v23.9
x86_64: fixed minotaurx crash, broken in 23.7.
ARM: #407 fix compile error due to incorrect type casting for vrev instruction argument.
v23.8
Cpuminer-opt is no longer dependent on OpenSSL.
Removed Hodl algo.
Removed legacy Sha256 & Scrypt ASM code.
ARM: Echo AES is working and enabled for x17.
v23.7
Fixed blake2s, broken in v3.23.4.
ARM: SHA2 extension tested and working.
ARM: sha512256d fully optimized.
ARM: X17 more optimizations.
ARM: AES extension working for Shavite.
ARM errata: CPU features AES & SHA256 are not reported when available.
v23.6
ARM: Sha256dt, Sha256t, Sha256d 4-way now working and fully optimized for NEON, SHA also enabled but untested.

View File

@@ -310,7 +310,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_GROESTL: rc = register_groestl_algo ( gate ); break;
case ALGO_HEX: rc = register_hex_algo ( gate ); break;
case ALGO_HMQ1725: rc = register_hmq1725_algo ( gate ); break;
case ALGO_HODL: rc = register_hodl_algo ( gate ); break;
case ALGO_JHA: rc = register_jha_algo ( gate ); break;
case ALGO_KECCAK: rc = register_keccak_algo ( gate ); break;
case ALGO_KECCAKC: rc = register_keccakc_algo ( gate ); break;
@@ -369,6 +368,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break;
case ALGO_X16S: rc = register_x16s_algo ( gate ); break;
case ALGO_X17: rc = register_x17_algo ( gate ); break;
case ALGO_X20R: rc = register_x20r_algo ( gate ); break;
case ALGO_X21S: rc = register_x21s_algo ( gate ); break;
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;

View File

@@ -99,7 +99,7 @@ typedef uint32_t set_t;
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
// AVX10 does not have explicit algo features:
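The *_OPT values are one-bit flags packed into a set_t bitmask so an algo can advertise which CPU extensions it benefits from. A minimal sketch of how such a bitmask is typically built and tested follows; only the flag names are taken from the header above, while the gate_features variable and the checks themselves are illustrative assumptions.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t set_t;

/* Flag names from the header above; parentheses added for safe macro expansion. */
#define AES_OPT     (1 << 7)
#define SHA_OPT     (1 << 9)
#define SHA512_OPT  (1 << 10)
#define NEON_OPT    (1 << 11)

int main(void)
{
   /* Hypothetical: an algo gate advertising AES + NEON acceleration. */
   set_t gate_features = AES_OPT | NEON_OPT;

   if ( gate_features & AES_OPT )                 /* test a single flag */
      printf( "AES path available\n" );

   if ( ( gate_features & ( SHA_OPT | SHA512_OPT ) )
        == ( SHA_OPT | SHA512_OPT ) )             /* require both flags */
      printf( "SHA-256 and SHA-512 extensions present\n" );

   return 0;
}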

View File

@@ -242,7 +242,7 @@ void fill_segment(const argon2_instance_t *instance,
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];
#else
v128_t state[ARGON2_OWORDS_IN_BLOCK];
v128u64_t state[ARGON2_OWORDS_IN_BLOCK];
#endif
// int data_independent_addressing;

View File

@@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
blakehash_4way( hash, vdata );

View File

@@ -429,7 +429,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
@@ -931,14 +931,14 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
#else

View File

@@ -465,6 +465,7 @@ void blake512_update(blake512_context *sc, const void *data, size_t len)
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 += 1;
blake512_transform( sc->H, (uint64_t*)sc->buf, sc->T0, sc->T1 );
sc->ptr = 0;
}
@@ -479,6 +480,7 @@ void blake512_close( blake512_context *sc, void *dst )
uint64_t th, tl;
ptr = sc->ptr;
memcpy( buf, sc->buf, ptr );
bit_len = ((unsigned)ptr << 3);
buf[ptr] = 0x80;
tl = sc->T0 + bit_len;
@@ -517,8 +519,6 @@ void blake512_close( blake512_context *sc, void *dst )
*(uint64_t*)(buf + 120) = bswap_64( tl );
blake512_update( sc, buf, 128 );
}
//TODO vectored bswap
for ( k = 0; k < 8; k ++ )
((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
@@ -1779,13 +1779,11 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
v256_64( 0x0100000000000000ULL ) );
buf[112>>3] = v256_64( bswap_64( th ) );
buf[120>>3] = v256_64( bswap_64( tl ) );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
@@ -1793,9 +1791,9 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
buf[104>>3] = v256_64( 0x0100000000000000ULL );
buf[112>>3] = v256_64( bswap_64( th ) );
buf[120>>3] = v256_64( bswap_64( tl ) );
blake64_4way( sc, buf, 128 );
}
mm256_block_bswap_64( (__m256i*)dst, sc->H );
}
@@ -1960,21 +1958,21 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
#else // SSE2 & NEON
M0 = v128_bswap64( sc->buf[ 0] );
M1 = v128_bswap64( sc->buf[ 0] );
M2 = v128_bswap64( sc->buf[ 0] );
M3 = v128_bswap64( sc->buf[ 0] );
M4 = v128_bswap64( sc->buf[ 0] );
M5 = v128_bswap64( sc->buf[ 0] );
M6 = v128_bswap64( sc->buf[ 0] );
M7 = v128_bswap64( sc->buf[ 0] );
M8 = v128_bswap64( sc->buf[ 0] );
M9 = v128_bswap64( sc->buf[ 0] );
MA = v128_bswap64( sc->buf[ 0] );
MB = v128_bswap64( sc->buf[ 0] );
MC = v128_bswap64( sc->buf[ 0] );
MD = v128_bswap64( sc->buf[ 0] );
ME = v128_bswap64( sc->buf[ 0] );
MF = v128_bswap64( sc->buf[ 0] );
M1 = v128_bswap64( sc->buf[ 1] );
M2 = v128_bswap64( sc->buf[ 2] );
M3 = v128_bswap64( sc->buf[ 3] );
M4 = v128_bswap64( sc->buf[ 4] );
M5 = v128_bswap64( sc->buf[ 5] );
M6 = v128_bswap64( sc->buf[ 6] );
M7 = v128_bswap64( sc->buf[ 7] );
M8 = v128_bswap64( sc->buf[ 8] );
M9 = v128_bswap64( sc->buf[ 9] );
MA = v128_bswap64( sc->buf[10] );
MB = v128_bswap64( sc->buf[11] );
MC = v128_bswap64( sc->buf[12] );
MD = v128_bswap64( sc->buf[13] );
ME = v128_bswap64( sc->buf[14] );
MF = v128_bswap64( sc->buf[15] );
#endif
@@ -2235,7 +2233,6 @@ blake64_2x64( blake_2x64_big_context *sc, const void *data, size_t len)
v128u64_t *buf;
size_t ptr;
const int buf_size = 128; // sizeof/8
DECL_STATE_2X64
buf = sc->buf;
ptr = sc->ptr;
@@ -2247,7 +2244,6 @@ blake64_2x64( blake_2x64_big_context *sc, const void *data, size_t len)
return;
}
READ_STATE64(sc);
while ( len > 0 )
{
size_t clen;
@@ -2260,13 +2256,12 @@ blake64_2x64( blake_2x64_big_context *sc, const void *data, size_t len)
len -= clen;
if ( ptr == buf_size )
{
if ( (T0 = T0 + 1024 ) < 1024 )
T1 = T1 + 1;
if ( (sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_2x64_compress( sc );
ptr = 0;
}
}
WRITE_STATE64(sc);
sc->ptr = ptr;
}
@@ -2280,37 +2275,35 @@ blake64_2x64_close( blake_2x64_big_context *sc, void *dst )
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>3] = v128_64( 0x80 );
sc->buf[ptr>>3] = v128_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
{
sc->T0 -= 1024 - bit_len;
}
sc->T0 -= 1024 - bit_len;
if ( ptr <= 104 )
{
v128_memset_zero( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
buf[104>>3] = v128_or( buf[104>>3], v128_64( 0x0100000000000000ULL ) );
buf[112>>3] = v128_64( bswap_64( th ) );
buf[120>>3] = v128_64( bswap_64( tl ) );
blake64_2x64( sc, buf + (ptr>>3), 128 - ptr );
v128_memset_zero( sc->buf + (ptr>>3) + 1, (104-ptr) >> 3 );
sc->buf[104>>3] = v128_or( sc->buf[104>>3],
v128_64( 0x0100000000000000ULL ) );
sc->buf[112>>3] = v128_64( bswap_64( th ) );
sc->buf[120>>3] = v128_64( bswap_64( tl ) );
blake64_2x64( sc, sc->buf + (ptr>>3), 128 - ptr );
}
else
{
v128_memset_zero( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_2x64( sc, buf + (ptr>>3), 128 - ptr );
v128_memset_zero( sc->buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_2x64( sc, sc->buf + (ptr>>3), 128 - ptr );
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
v128_memset_zero( buf, 112>>3 );
@@ -2319,6 +2312,7 @@ blake64_2x64_close( blake_2x64_big_context *sc, void *dst )
buf[120>>3] = v128_64( bswap_64( tl ) );
blake64_2x64( sc, buf, 128 );
}
v128_block_bswap64( (v128u64_t*)dst, sc->H );
}
@@ -2326,7 +2320,6 @@ blake64_2x64_close( blake_2x64_big_context *sc, void *dst )
void blake512_2x64_full( blake_2x64_big_context *sc, void * dst,
const void *data, size_t len )
{
// init
casti_v128u64( sc->H, 0 ) = v128_64( 0x6A09E667F3BCC908 );

View File

@@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
blakecoin_4way_hash( hash, vdata );

View File

@@ -131,47 +131,7 @@
V[7] = v128_alignr64( V6, V7, 1 ); \
}
/*
#elif defined(__SSE2__)
// always true
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
v128_t *V = (v128_t*)v; \
v128_t V2, V3, V6, V7; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3, 1 ); \
V[3] = mm128_alignr_64( V3, V2, 1 ); \
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
*/
#else
// never used, SSE2 is always available
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))

View File

@@ -62,78 +62,78 @@ static const uint32_t IV256[] = {
*/
#define ss0(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 3) ), \
_mm_xor_si128( mm128_rol_32( (x), 4), \
mm128_rol_32( (x), 19) ) )
v128_xor( v128_xor( v128_sr32( (x), 1), \
v128_sl32( (x), 3) ), \
v128_xor( v128_rol32( (x), 4), \
v128_rol32( (x), 19) ) )
#define ss1(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm128_rol_32( (x), 8), \
mm128_rol_32( (x), 23) ) )
v128_xor( v128_xor( v128_sr32( (x), 1), \
v128_sl32( (x), 2) ), \
v128_xor( v128_rol32( (x), 8), \
v128_rol32( (x), 23) ) )
#define ss2(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 1) ), \
_mm_xor_si128( mm128_rol_32( (x), 12), \
mm128_rol_32( (x), 25) ) )
v128_xor( v128_xor( v128_sr32( (x), 2), \
v128_sl32( (x), 1) ), \
v128_xor( v128_rol32( (x), 12), \
v128_rol32( (x), 25) ) )
#define ss3(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm128_rol_32( (x), 15), \
mm128_rol_32( (x), 29) ) )
v128_xor( v128_xor( v128_sr32( (x), 2), \
v128_sl32( (x), 2) ), \
v128_xor( v128_rol32( (x), 15), \
v128_rol32( (x), 29) ) )
#define ss4(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
v128_xor( (x), v128_sr32( (x), 1 ) )
#define ss5(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
v128_xor( (x), v128_sr32( (x), 2 ) )
#define rs1(x) mm128_rol_32( x, 3 )
#define rs2(x) mm128_rol_32( x, 7 )
#define rs3(x) mm128_rol_32( x, 13 )
#define rs4(x) mm128_rol_32( x, 16 )
#define rs5(x) mm128_rol_32( x, 19 )
#define rs6(x) mm128_rol_32( x, 23 )
#define rs7(x) mm128_rol_32( x, 27 )
#define rs1(x) v128_rol32( x, 3 )
#define rs2(x) v128_rol32( x, 7 )
#define rs3(x) v128_rol32( x, 13 )
#define rs4(x) v128_rol32( x, 16 )
#define rs5(x) v128_rol32( x, 19 )
#define rs6(x) v128_rol32( x, 23 )
#define rs7(x) v128_rol32( x, 27 )
#define rol_off_32( M, j, off ) \
mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
v128_rol32( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_s( M, H, j ) \
_mm_xor_si128( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
v128_xor( \
v128_add32( \
v128_sub32( v128_add32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
v128_32( ( (j)+16 ) * 0x05555555UL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1s( qt, M, H, i ) \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
v128_add32( v128_add4_32( \
v128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
v128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
v128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
v128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
#define expand2s( qt, M, H, i) \
_mm_add_epi32( mm128_add4_32( \
mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
v128_add32( v128_add4_32( \
v128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
v128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
v128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
v128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
add_elt_s( M, H, (i)-16 ) )
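Each of the ss0-ss5 and rs1-rs7 macros above applies the same scalar BMW-256 s- or r-function to every 32-bit lane of the vector; the conversion only swaps intrinsic spellings, not the math. For reference, a plain-C sketch of a few of them, read straight off the macro bodies (the scalar_* names are illustrative, not part of the source):

#include <stdint.h>

/* Illustrative scalar versions of the vector macros above, one 32-bit lane. */
static inline uint32_t rol32( uint32_t x, int c )
{
   return ( x << c ) | ( x >> ( 32 - c ) );
}

/* ss0: (x >> 1) ^ (x << 3) ^ rol(x,4) ^ rol(x,19) */
static inline uint32_t scalar_ss0( uint32_t x )
{
   return ( x >> 1 ) ^ ( x << 3 ) ^ rol32( x, 4 ) ^ rol32( x, 19 );
}

/* ss4: x ^ (x >> 1) */
static inline uint32_t scalar_ss4( uint32_t x )
{
   return x ^ ( x >> 1 );
}

/* rs1: rol(x,3) */
static inline uint32_t scalar_rs1( uint32_t x )
{
   return rol32( x, 3 );
}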
@@ -141,169 +141,169 @@ static const uint32_t IV256[] = {
// resulting in some sign changes compared to the reference code.
#define Ws0 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_add32( \
v128_add32( \
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_xor( M[10], H[10] ) ), \
v128_add32( v128_xor( M[13], H[13] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws1 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_add32( \
v128_add32( \
v128_sub32( v128_xor( M[ 6], H[ 6] ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_xor( M[11], H[11] ) ), \
v128_sub32( v128_xor( M[14], H[14] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws2 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_add32( \
v128_add32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_xor( M[ 9], H[ 9] ) ), \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws3 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 1], H[ 1] ) ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_sub32( v128_xor( M[10], H[10] ), \
v128_xor( M[13], H[13] ) ) )
#define Ws4 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_add32( \
v128_add32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_xor( M[ 9], H[ 9] ) ), \
v128_add32( v128_xor( M[11], H[11] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws5 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_xor( M[10], H[10] ) ), \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws6 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 4], H[ 4] ), \
v128_xor( M[ 0], H[ 0] ) ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_sub32( v128_xor( M[11], H[11] ), \
v128_xor( M[13], H[13] ) ) )
#define Ws7 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_add32( v128_xor( M[12], H[12] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws8 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_add32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 2], H[ 2] ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[13], H[13] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws9 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[14], H[14] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
v128_xor( M[14], H[14] ) ) )
#define Ws10 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
_mm_xor_si128( M[15], H[15] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
v128_xor( M[ 1], H[ 1] ) ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
v128_xor( M[15], H[15] ) ) )
#define Ws11 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 9], H[ 9] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
v128_xor( M[ 0], H[ 0] ) ), \
v128_xor( M[ 2], H[ 2] ) ), \
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
v128_xor( M[ 9], H[ 9] ) ) )
#define Ws12 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[10], H[10] ) ) )
v128_sub32( \
v128_sub32( \
v128_add32( v128_xor( M[ 1], H[ 1] ), \
v128_xor( M[ 3], H[ 3] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
v128_xor( M[10], H[10] ) ) )
#define Ws13 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
_mm_xor_si128( M[11], H[11] ) ) )
v128_add32( \
v128_add32( \
v128_add32( v128_xor( M[ 2], H[ 2] ), \
v128_xor( M[ 4], H[ 4] ) ), \
v128_xor( M[ 7], H[ 7] ) ), \
v128_add32( v128_xor( M[10], H[10] ), \
v128_xor( M[11], H[11] ) ) )
#define Ws14 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
_mm_xor_si128( M[12], H[12] ) ) )
v128_sub32( \
v128_add32( \
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
v128_xor( M[ 5], H[ 5] ) ), \
v128_xor( M[ 8], H[ 8] ) ), \
v128_add32( v128_xor( M[11], H[11] ), \
v128_xor( M[12], H[12] ) ) )
#define Ws15 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[ 4], H[4] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
_mm_xor_si128( M[13], H[13] ) ) )
v128_sub32( \
v128_sub32( \
v128_sub32( v128_xor( M[12], H[12] ), \
v128_xor( M[ 4], H[4] ) ), \
v128_xor( M[ 6], H[ 6] ) ), \
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
v128_xor( M[13], H[13] ) ) )
void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
{
__m128i qt[32], xl, xh; \
v128u64_t qt[32], xl, xh; \
qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
qt[ 2] = v128_add32( ss2( Ws2 ), H[ 3] );
qt[ 3] = v128_add32( ss3( Ws3 ), H[ 4] );
qt[ 4] = v128_add32( ss4( Ws4 ), H[ 5] );
qt[ 5] = v128_add32( ss0( Ws5 ), H[ 6] );
qt[ 6] = v128_add32( ss1( Ws6 ), H[ 7] );
qt[ 7] = v128_add32( ss2( Ws7 ), H[ 8] );
qt[ 8] = v128_add32( ss3( Ws8 ), H[ 9] );
qt[ 9] = v128_add32( ss4( Ws9 ), H[10] );
qt[10] = v128_add32( ss0( Ws10), H[11] );
qt[11] = v128_add32( ss1( Ws11), H[12] );
qt[12] = v128_add32( ss2( Ws12), H[13] );
qt[13] = v128_add32( ss3( Ws13), H[14] );
qt[14] = v128_add32( ss4( Ws14), H[15] );
qt[15] = v128_add32( ss0( Ws15), H[ 0] );
qt[16] = expand1s( qt, M, H, 16 );
qt[17] = expand1s( qt, M, H, 17 );
qt[18] = expand2s( qt, M, H, 18 );
@@ -321,92 +321,92 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
qt[30] = expand2s( qt, M, H, 30 );
qt[31] = expand2s( qt, M, H, 31 );
xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = _mm_xor_si128( xl, _mm_xor_si128(
mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
xl = v128_xor( v128_xor4( qt[16], qt[17], qt[18], qt[19] ),
v128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
xh = v128_xor( xl, v128_xor(
v128_xor4( qt[24], qt[25], qt[26], qt[27] ),
v128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
dH[ 0] = _mm_add_epi32(
_mm_xor_si128( M[0],
_mm_xor_si128( _mm_slli_epi32( xh, 5 ),
_mm_srli_epi32( qt[16], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
dH[ 1] = _mm_add_epi32(
_mm_xor_si128( M[1],
_mm_xor_si128( _mm_srli_epi32( xh, 7 ),
_mm_slli_epi32( qt[17], 8 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
dH[ 2] = _mm_add_epi32(
_mm_xor_si128( M[2],
_mm_xor_si128( _mm_srli_epi32( xh, 5 ),
_mm_slli_epi32( qt[18], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
dH[ 3] = _mm_add_epi32(
_mm_xor_si128( M[3],
_mm_xor_si128( _mm_srli_epi32( xh, 1 ),
_mm_slli_epi32( qt[19], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
dH[ 4] = _mm_add_epi32(
_mm_xor_si128( M[4],
_mm_xor_si128( _mm_srli_epi32( xh, 3 ),
_mm_slli_epi32( qt[20], 0 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
dH[ 5] = _mm_add_epi32(
_mm_xor_si128( M[5],
_mm_xor_si128( _mm_slli_epi32( xh, 6 ),
_mm_srli_epi32( qt[21], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
dH[ 6] = _mm_add_epi32(
_mm_xor_si128( M[6],
_mm_xor_si128( _mm_srli_epi32( xh, 4 ),
_mm_slli_epi32( qt[22], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
dH[ 7] = _mm_add_epi32(
_mm_xor_si128( M[7],
_mm_xor_si128( _mm_srli_epi32( xh, 11 ),
_mm_slli_epi32( qt[23], 2 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[4], 9 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
_mm_xor_si128( _mm_slli_epi32( xl, 8 ),
_mm_xor_si128( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[5], 10 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
_mm_xor_si128( _mm_srli_epi32( xl, 6 ),
_mm_xor_si128( qt[16], qt[ 9] ) ) );
dH[10] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[6], 11 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
_mm_xor_si128( _mm_slli_epi32( xl, 6 ),
_mm_xor_si128( qt[17], qt[10] ) ) );
dH[11] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[7], 12 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
_mm_xor_si128( _mm_slli_epi32( xl, 4 ),
_mm_xor_si128( qt[18], qt[11] ) ) );
dH[12] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[0], 13 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
_mm_xor_si128( _mm_srli_epi32( xl, 3 ),
_mm_xor_si128( qt[19], qt[12] ) ) );
dH[13] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[1], 14 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
_mm_xor_si128( _mm_srli_epi32( xl, 4 ),
_mm_xor_si128( qt[20], qt[13] ) ) );
dH[14] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[2], 15 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
_mm_xor_si128( _mm_srli_epi32( xl, 7 ),
_mm_xor_si128( qt[21], qt[14] ) ) );
dH[15] = _mm_add_epi32( _mm_add_epi32(
mm128_rol_32( dH[3], 16 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
_mm_xor_si128( _mm_srli_epi32( xl, 2 ),
_mm_xor_si128( qt[22], qt[15] ) ) );
dH[ 0] = v128_add32(
v128_xor( M[0],
v128_xor( v128_sl32( xh, 5 ),
v128_sr32( qt[16], 5 ) ) ),
v128_xor( v128_xor( xl, qt[24] ), qt[ 0] ));
dH[ 1] = v128_add32(
v128_xor( M[1],
v128_xor( v128_sr32( xh, 7 ),
v128_sl32( qt[17], 8 ) ) ),
v128_xor( v128_xor( xl, qt[25] ), qt[ 1] ));
dH[ 2] = v128_add32(
v128_xor( M[2],
v128_xor( v128_sr32( xh, 5 ),
v128_sl32( qt[18], 5 ) ) ),
v128_xor( v128_xor( xl, qt[26] ), qt[ 2] ));
dH[ 3] = v128_add32(
v128_xor( M[3],
v128_xor( v128_sr32( xh, 1 ),
v128_sl32( qt[19], 5 ) ) ),
v128_xor( v128_xor( xl, qt[27] ), qt[ 3] ));
dH[ 4] = v128_add32(
v128_xor( M[4],
v128_xor( v128_sr32( xh, 3 ),
v128_sl32( qt[20], 0 ) ) ),
v128_xor( v128_xor( xl, qt[28] ), qt[ 4] ));
dH[ 5] = v128_add32(
v128_xor( M[5],
v128_xor( v128_sl32( xh, 6 ),
v128_sr32( qt[21], 6 ) ) ),
v128_xor( v128_xor( xl, qt[29] ), qt[ 5] ));
dH[ 6] = v128_add32(
v128_xor( M[6],
v128_xor( v128_sr32( xh, 4 ),
v128_sl32( qt[22], 6 ) ) ),
v128_xor( v128_xor( xl, qt[30] ), qt[ 6] ));
dH[ 7] = v128_add32(
v128_xor( M[7],
v128_xor( v128_sr32( xh, 11 ),
v128_sl32( qt[23], 2 ) ) ),
v128_xor( v128_xor( xl, qt[31] ), qt[ 7] ));
dH[ 8] = v128_add32( v128_add32(
v128_rol32( dH[4], 9 ),
v128_xor( v128_xor( xh, qt[24] ), M[ 8] )),
v128_xor( v128_sl32( xl, 8 ),
v128_xor( qt[23], qt[ 8] ) ) );
dH[ 9] = v128_add32( v128_add32(
v128_rol32( dH[5], 10 ),
v128_xor( v128_xor( xh, qt[25] ), M[ 9] )),
v128_xor( v128_sr32( xl, 6 ),
v128_xor( qt[16], qt[ 9] ) ) );
dH[10] = v128_add32( v128_add32(
v128_rol32( dH[6], 11 ),
v128_xor( v128_xor( xh, qt[26] ), M[10] )),
v128_xor( v128_sl32( xl, 6 ),
v128_xor( qt[17], qt[10] ) ) );
dH[11] = v128_add32( v128_add32(
v128_rol32( dH[7], 12 ),
v128_xor( v128_xor( xh, qt[27] ), M[11] )),
v128_xor( v128_sl32( xl, 4 ),
v128_xor( qt[18], qt[11] ) ) );
dH[12] = v128_add32( v128_add32(
v128_rol32( dH[0], 13 ),
v128_xor( v128_xor( xh, qt[28] ), M[12] )),
v128_xor( v128_sr32( xl, 3 ),
v128_xor( qt[19], qt[12] ) ) );
dH[13] = v128_add32( v128_add32(
v128_rol32( dH[1], 14 ),
v128_xor( v128_xor( xh, qt[29] ), M[13] )),
v128_xor( v128_sr32( xl, 4 ),
v128_xor( qt[20], qt[13] ) ) );
dH[14] = v128_add32( v128_add32(
v128_rol32( dH[2], 15 ),
v128_xor( v128_xor( xh, qt[30] ), M[14] )),
v128_xor( v128_sr32( xl, 7 ),
v128_xor( qt[21], qt[14] ) ) );
dH[15] = v128_add32( v128_add32(
v128_rol32( dH[3], 16 ),
v128_xor( v128_xor( xh, qt[31] ), M[15] )),
v128_xor( v128_sr32( xl, 2 ),
v128_xor( qt[22], qt[15] ) ) );
}
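The conversion above is mechanical: raw _mm_* SSE2 intrinsics are replaced by generic v128_* names so the same compression code builds for x86_64 and AArch64. A minimal sketch of how such a wrapper layer can be arranged; these mappings are assumptions for illustration, the project's real definitions live in simd-utils.h.

/* Illustrative portability layer: one name, two instruction sets.
   Not the project's actual simd-utils.h definitions. */
#if defined(__SSE2__)
  #include <emmintrin.h>
  typedef __m128i v128_t;
  #define v128_add32( a, b )  _mm_add_epi32( a, b )
  #define v128_xor( a, b )    _mm_xor_si128( a, b )
  #define v128_sl32( a, c )   _mm_slli_epi32( a, c )
  #define v128_sr32( a, c )   _mm_srli_epi32( a, c )
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  typedef uint32x4_t v128_t;
  #define v128_add32( a, b )  vaddq_u32( a, b )
  #define v128_xor( a, b )    veorq_u32( a, b )
  #define v128_sl32( a, c )   vshlq_n_u32( a, c )
  #define v128_sr32( a, c )   vshrq_n_u32( a, c )
#endif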
static const uint32_t final_s[16][4] =
@@ -429,7 +429,7 @@ static const uint32_t final_s[16][4] =
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
/*
static const __m128i final_s[16] =
static const v128u64_t final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
@@ -451,26 +451,26 @@ static const __m128i final_s[16] =
*/
void bmw256_4way_init( bmw256_4way_context *ctx )
{
ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
ctx->H[ 0] = v128_64( 0x4041424340414243 );
ctx->H[ 1] = v128_64( 0x4445464744454647 );
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = v128_64( 0x5051525350515253 );
ctx->H[ 5] = v128_64( 0x5455565754555657 );
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = v128_64( 0x6061626360616263 );
ctx->H[ 9] = v128_64( 0x6465666764656667 );
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = v128_64( 0x7071727370717273 );
ctx->H[13] = v128_64( 0x7475767774757677 );
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
// for ( int i = 0; i < 16; i++ )
// sc->H[i] = _mm_set1_epi32( iv[i] );
// sc->H[i] = v128_32( iv[i] );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -478,10 +478,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
__m128i *vdata = (__m128i*)data;
__m128i *buf;
__m128i htmp[16];
__m128i *h1, *h2;
v128u64_t *vdata = (v128u64_t*)data;
v128u64_t *buf;
v128u64_t htmp[16];
v128u64_t *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
@@ -497,13 +497,13 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
v128_memcpy( buf + (ptr>>2), vdata, clen >> 2 );
vdata += ( clen >> 2 );
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m128i *ht;
v128u64_t *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
@@ -513,46 +513,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
}
sc->ptr = ptr;
if ( h1 != sc->H )
memcpy_128( sc->H, h1, 16 );
v128_memcpy( sc->H, h1, 16 );
}
static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
__m128i *buf;
__m128i h1[16], h2[16], *h;
v128u64_t *buf;
v128u64_t h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
ptr += 4;
h = sc->H;
// assume bit_count fits in 32 bits
if ( ptr > buf_size - 4 )
{
memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
v128_memset_zero( buf + (ptr>>2), (buf_size - ptr) >> 2 );
compress_small( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = m128_zero;
v128_memset_zero( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = v128_32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = v128_zero;
compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, (__m128i*)final_s, h1 );
compress_small( buf, (v128u64_t*)final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_m128i( dst, u ) = h1[v];
casti_v128( dst, u ) = h1[v];
}
/*

View File

@@ -2,12 +2,11 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
//#include "sph_keccak.h"
#include "bmw-hash-4way.h"
#if defined(BMW512_8WAY)
void bmw512hash_8way(void *state, const void *input)
void bmw512hash_8way( void *state, const void *input )
{
bmw512_8way_context ctx;
bmw512_8way_init( &ctx );
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
__m512i *noncev = (__m512i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
const int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
#elif defined(BMW512_4WAY)
//#ifdef BMW512_4WAY
void bmw512hash_4way(void *state, const void *input)
void bmw512hash_4way( void *state, const void *input )
{
bmw512_4way_context ctx;
bmw512_4way_init( &ctx );
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(BMW512_2WAY)
void bmw512hash_2x64( void *state, const void *input )
{
bmw512_2x64_context ctx;
bmw512_2x64_init( &ctx );
bmw512_2x64_update( &ctx, input, 80 );
bmw512_2x64_close( &ctx, state );
}
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
v128_bswap32_intrlv80_2x64( vdata, pdata );
do {
*noncev = v128_intrlv_blend_32( v128_bswap32(
v128_set32( n+1, 0, n, 0 ) ), *noncev );
bmw512hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -2,7 +2,7 @@
bool register_bmw512_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 256.0;
#if defined (BMW512_8WAY)
gate->scanhash = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
#elif defined (BMW512_4WAY)
gate->scanhash = (void*)&scanhash_bmw512_4way;
gate->hash = (void*)&bmw512hash_4way;
#elif defined (BMW512_2WAY)
gate->scanhash = (void*)&scanhash_bmw512_2x64;
gate->hash = (void*)&bmw512hash_2x64;
#else
gate->scanhash = (void*)&scanhash_bmw512;
gate->hash = (void*)&bmw512hash;

View File

@@ -8,19 +8,27 @@
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define BMW512_2WAY 1
#endif
#if defined(BMW512_8WAY)
void bmw512hash_8way( void *state, const void *input );
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_4WAY)
void bmw512hash_4way( void *state, const void *input );
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_2WAY)
void bmw512hash_2x64( void *state, const void *input );
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else

View File

@@ -21,112 +21,92 @@
#include "hash_api.h"
#include "simd-utils.h"
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
const uint32_t const1[] __attribute__ ((aligned (32))) =
{ 0x00000001, 0x00000000, 0x00000000, 0x00000000 };
const uint32_t mul2mask[] __attribute__ ((aligned (16))) =
{ 0x00001b00, 0x00000000, 0x00000000, 0x00000000 };
const uint32_t lsbmask[] __attribute__ ((aligned (16))) =
{ 0x01010101, 0x01010101, 0x01010101, 0x01010101 };
const uint32_t invshiftrows[] __attribute__ ((aligned (16))) =
{ 0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c };
#define ECHO_SUBBYTES4( state, j ) \
state[0][j] = v128_aesenc( state[0][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[1][j] = v128_aesenc( state[1][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[2][j] = v128_aesenc( state[2][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[3][j] = v128_aesenc( state[3][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[0][j] = v128_aesenc_nokey( state[0][j] ); \
state[1][j] = v128_aesenc_nokey( state[1][j] ); \
state[2][j] = v128_aesenc_nokey( state[2][j] ); \
state[3][j] = v128_aesenc_nokey( state[3][j] )
MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101};
MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
#define ECHO_SUBBYTES( state, i, j ) \
state[i][j] = v128_aesenc( state[i][j], k1 ); \
k1 = v128_add32( k1, cast_v128(const1) ); \
state[i][j] = v128_aesenc_nokey( state[i][j] )
#define ECHO_SUBBYTES4(state, j) \
state[0][j] = v128_aesenc(state[0][j], k1);\
k1 = v128_add32(k1, cast_v128(const1));\
state[1][j] = v128_aesenc(state[1][j], k1);\
k1 = v128_add32(k1, cast_v128(const1));\
state[2][j] = v128_aesenc(state[2][j], k1);\
k1 = v128_add32(k1, cast_v128(const1));\
state[3][j] = v128_aesenc(state[3][j], k1);\
k1 = v128_add32(k1, cast_v128(const1));\
state[0][j] = v128_aesenc(state[0][j], v128_zero ); \
state[1][j] = v128_aesenc(state[1][j], v128_zero ); \
state[2][j] = v128_aesenc(state[2][j], v128_zero ); \
state[3][j] = v128_aesenc(state[3][j], v128_zero )
#define ECHO_SUBBYTES(state, i, j) \
state[i][j] = v128_aesenc(state[i][j], k1);\
k1 = v128_add32(k1, cast_v128(const1));\
state[i][j] = v128_aesenc(state[i][j], cast_v128(zero))
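Both versions of the SUBBYTES macros rest on one primitive: an AES round applied with an all-zero round key, spelled v128_aesenc(x, zero) in the old code and v128_aesenc_nokey(x) in the new one. A hedged sketch of how such a primitive can map onto the two instruction sets follows; the function name and definitions are assumptions, not the project's simd-utils implementation. Note the differing round structure: x86 aesenc XORs the key after MixColumns, while NEON's vaese XORs it before SubBytes, so the zero key is fed into vaeseq_u8.

#if defined(__AES__)
  #include <wmmintrin.h>
  /* MixColumns(ShiftRows(SubBytes(x))) ^ 0 */
  static inline __m128i aesenc_nokey_sketch( __m128i x )
  {
     return _mm_aesenc_si128( x, _mm_setzero_si128() );
  }
#elif defined(__ARM_FEATURE_AES)
  #include <arm_neon.h>
  /* vaese: ShiftRows(SubBytes(x ^ 0)); vaesmc: MixColumns */
  static inline uint8x16_t aesenc_nokey_sketch( uint8x16_t x )
  {
     return vaesmcq_u8( vaeseq_u8( x, vdupq_n_u8( 0 ) ) );
  }
#endif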
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
s2 = v128_add8(state1[0][j], state1[0][j]);\
t1 = v128_sr16(state1[0][j], 7);\
t1 = v128_and(t1, cast_v128(lsbmask));\
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
s2 = v128_xor(s2, t2);\
state2[0][j] = s2;\
state2[1][j] = state1[0][j];\
state2[2][j] = state1[0][j];\
state2[3][j] = v128_xor(s2, state1[0][j]);\
s2 = v128_add8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
t1 = v128_sr16(state1[1][(j + 1) & 3], 7);\
t1 = v128_and(t1, cast_v128(lsbmask));\
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
s2 = v128_xor(s2, t2);\
state2[0][j] = v128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
state2[1][j] = v128_xor(state2[1][j], s2);\
state2[2][j] = v128_xor(state2[2][j], state1[1][(j + 1) & 3]);\
state2[3][j] = v128_xor(state2[3][j], state1[1][(j + 1) & 3]);\
s2 = v128_add8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
t1 = v128_sr16(state1[2][(j + 2) & 3], 7);\
t1 = v128_and(t1, cast_v128(lsbmask));\
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
s2 = v128_xor(s2, t2);\
state2[0][j] = v128_xor(state2[0][j], state1[2][(j + 2) & 3]);\
state2[1][j] = v128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
state2[2][j] = v128_xor(state2[2][j], s2);\
state2[3][j] = v128_xor(state2[3][j], state1[2][(j + 2) & 3]);\
s2 = v128_add8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
t1 = v128_sr16(state1[3][(j + 3) & 3], 7);\
t1 = v128_and(t1, cast_v128(lsbmask));\
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
s2 = v128_xor(s2, t2);\
state2[0][j] = v128_xor(state2[0][j], state1[3][(j + 3) & 3]);\
state2[1][j] = v128_xor(state2[1][j], state1[3][(j + 3) & 3]);\
state2[2][j] = v128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
state2[3][j] = v128_xor(state2[3][j], s2)
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) \
s2 = v128_add8( state1[0][j], state1[0][j] ); \
t1 = v128_sr16( state1[0][j], 7 ); \
t1 = v128_and( t1, cast_v128(lsbmask) ); \
t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
s2 = v128_xor( s2, t2 ); \
state2[0][j] = s2; \
state2[1][j] = state1[0][j]; \
state2[2][j] = state1[0][j]; \
state2[3][j] = v128_xor(s2, state1[0][j] ); \
s2 = v128_add8( state1[1][(j + 1) & 3], state1[1][(j + 1) & 3] ); \
t1 = v128_sr16( state1[1][(j + 1) & 3], 7 ); \
t1 = v128_and( t1, cast_v128(lsbmask) ); \
t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
s2 = v128_xor( s2, t2 ); \
state2[0][j] = v128_xor3( state2[0][j], s2, state1[1][(j + 1) & 3] );\
state2[1][j] = v128_xor( state2[1][j], s2 ); \
state2[2][j] = v128_xor( state2[2][j], state1[1][(j + 1) & 3] ); \
state2[3][j] = v128_xor( state2[3][j], state1[1][(j + 1) & 3] ); \
s2 = v128_add8( state1[2][(j + 2) & 3], state1[2][(j + 2) & 3] ); \
t1 = v128_sr16( state1[2][(j + 2) & 3], 7 ); \
t1 = v128_and( t1, cast_v128(lsbmask) ); \
t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
s2 = v128_xor( s2, t2 ); \
state2[0][j] = v128_xor( state2[0][j], state1[2][(j + 2) & 3] ); \
state2[1][j] = v128_xor3( state2[1][j], s2, state1[2][(j + 2) & 3] ); \
state2[2][j] = v128_xor( state2[2][j], s2 ); \
state2[3][j] = v128_xor( state2[3][j], state1[2][(j + 2) & 3] ); \
s2 = v128_add8( state1[3][(j + 3) & 3], state1[3][(j + 3) & 3] ); \
t1 = v128_sr16( state1[3][(j + 3) & 3], 7 ); \
t1 = v128_and( t1, cast_v128(lsbmask) ); \
t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
s2 = v128_xor( s2, t2 ); \
state2[0][j] = v128_xor( state2[0][j], state1[3][(j + 3) & 3] ); \
state2[1][j] = v128_xor( state2[1][j], state1[3][(j + 3) & 3] ); \
state2[2][j] = v128_xor3( state2[2][j], s2, state1[3][(j + 3) & 3] ); \
state2[3][j] = v128_xor( state2[3][j], s2 )
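Each ECHO_MIXBYTES step begins by doubling every byte of a state word in GF(2^8): v128_add8(x, x) provides the left shift, and the lsbmask/mul2mask shuffle XORs in 0x1b for every byte whose high bit was set. The scalar equivalent is the standard AES xtime operation; a sketch follows, with an illustrative function name.

#include <stdint.h>

/* Multiply one byte by 2 in GF(2^8) with the AES polynomial. */
static inline uint8_t xtime_sketch( uint8_t x )
{
   /* Shift left; if the high bit was set, reduce by 0x1b. */
   return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
}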
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES4(_state, 0);\
ECHO_SUBBYTES4(_state, 1);\
ECHO_SUBBYTES4(_state, 2);\
ECHO_SUBBYTES4(_state, 3);\
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES4(_state2, 0);\
ECHO_SUBBYTES4(_state2, 1);\
ECHO_SUBBYTES4(_state2, 2);\
ECHO_SUBBYTES4(_state2, 3);\
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
{ \
ECHO_SUBBYTES4( _state, 0 ); \
ECHO_SUBBYTES4( _state, 1 ); \
ECHO_SUBBYTES4( _state, 2 ); \
ECHO_SUBBYTES4( _state, 3 ); \
ECHO_MIXBYTES( _state, _state2, 0, t1, t2, s2 ); \
ECHO_MIXBYTES( _state, _state2, 1, t1, t2, s2 ); \
ECHO_MIXBYTES( _state, _state2, 2, t1, t2, s2 ); \
ECHO_MIXBYTES( _state, _state2, 3, t1, t2, s2 ); \
ECHO_SUBBYTES4( _state2, 0 ); \
ECHO_SUBBYTES4( _state2, 1 ); \
ECHO_SUBBYTES4( _state2, 2 ); \
ECHO_SUBBYTES4( _state2, 3 ); \
ECHO_MIXBYTES( _state2, _state, 0, t1, t2, s2 ); \
ECHO_MIXBYTES( _state2, _state, 1, t1, t2, s2 ); \
ECHO_MIXBYTES( _state2, _state, 2, t1, t2, s2 ); \
ECHO_MIXBYTES( _state2, _state, 3, t1, t2, s2 ); \
}
/*
#define ECHO_ROUND_UNROLL2 \
@@ -256,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
}
HashReturn init_echo(hashState_echo *ctx, int nHashSize)
HashReturn init_echo( hashState_echo *ctx, int nHashSize )
{
int i, j;
@@ -300,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
return SUCCESS;
}
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
HashReturn update_echo( hashState_echo *state, const void *data,
uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -350,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
return SUCCESS;
}
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
HashReturn final_echo( hashState_echo *state, void *hashval)
{
v128_t remainingbits;
@@ -427,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
return SUCCESS;
}
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -550,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
return SUCCESS;
}
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength datalen )
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t datalen )
{
int i, j;
@@ -598,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
data, state->uBlockLength - state->uBufferBytes );
// Process buffer
Compress( state, state->buffer, 1 );
@@ -621,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
if( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
memcpy(state->buffer, data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
@@ -709,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
#if 0
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
HashReturn hRet;
@@ -766,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
return SUCCESS;
}
#endif
#endif

View File

@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);
HashReturn reinit_echo(hashState_echo *state);
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);
HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
HashReturn final_echo(hashState_echo *state, void *hashval);
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength databitlen );
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen );
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t databitlen );
#endif // HASH_API_H

View File

@@ -36,7 +36,6 @@
#include "sph_echo.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif
#endif // !AES

View File

@@ -36,8 +36,6 @@
#ifndef SPH_ECHO_H__
#define SPH_ECHO_H__
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
#ifdef __cplusplus
}
#endif
#endif // !AES
#endif

View File

@@ -15,237 +15,176 @@
*
*/
#if defined(__AES__)
#include <x86intrin.h>
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#include <memory.h>
#include "fugue-aesni.h"
static const v128u64_t _supermix1a __attribute__ ((aligned (16))) =
{ 0x0202010807020100, 0x0a05000f06010c0b };
MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
static const v128u64_t _supermix1b __attribute__ ((aligned (16))) =
{ 0x0b0d080703060504, 0x0e0a090c050e0f0a };
static const v128u64_t _supermix1c __attribute__ ((aligned (16))) =
{ 0x0402060c070d0003, 0x090a060580808080 };
MYALIGN const unsigned int _IV512[] = {
0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
static const v128u64_t _supermix1d __attribute__ ((aligned (16))) =
{ 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
static const v128u64_t _supermix2a __attribute__ ((aligned (16))) =
{ 0x07020d0880808080, 0x0b06010c050e0f0a };
static const v128u64_t _supermix4a __attribute__ ((aligned (16))) =
{ 0x000f0a050c0b0601, 0x0302020404030e09 };
static const v128u64_t _supermix4b __attribute__ ((aligned (16))) =
{ 0x07020d08080e0d0d, 0x07070908050e0f0a };
static const v128u64_t _supermix4c __attribute__ ((aligned (16))) =
{ 0x0706050403020000, 0x0302000007060504 };
static const v128u64_t _supermix7a __attribute__ ((aligned (16))) =
{ 0x010c0b060d080702, 0x0904030e03000104 };
static const v128u64_t _supermix7b __attribute__ ((aligned (16))) =
{ 0x8080808080808080, 0x0504070605040f06 };
static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
{ 0x0b0e0104070a0d00, 0x0306090c0f020508 };
static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
{ 0x000000001b1b0000, 0x0000000000000000 };
static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
{ 0x000000002d361b00, 0x0000000000000000 };
static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
{ 0x0303030303030303, 0x0303030303030303 };
static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
{ 0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
};
#if defined(__SSE4_1__)
#if defined(__ARM_NEON)
#define PACK_S0(s0, s1, t1)\
s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
#define mask_1000(v) v128_put32( v, 0, 3 )
#define UNPACK_S0(s0, s1, t1)\
s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
s0 = mm128_mask_32( s0, 8 )
static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = s1;\
t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1);
static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };
#else // SSE2
static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };
#define PACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
s0 = _mm_xor_si128(s0, t1);
static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };
#define UNPACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
s0 = mm128_mask_32( s0, 8 )
#define shuffle_3303(v) vqtbl1q_u8( v, MASK_3303 )
#define shuffle_0321(v) vqtbl1q_u8( v, MASK_0321 )
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = _mm_shuffle_epi32(s1, 0xf9);\
t2 = _mm_shuffle_epi32(s2, 0xcf);\
t1 = _mm_xor_si128(t1, t2);\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1)
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = vqtbl1q_u8( s1, MASK_3321 ); \
t2 = vqtbl1q_u8( s2, MASK_3033 ); \
t1 = v128_xor( t1, t2 ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#elif defined(__SSE4_1__)
#define mask_1000(v) v128_mask32( v, 8 )
#define shuffle_3303(v) _mm_shuffle_epi32( v, 0xf3 )
#define shuffle_0321(v) _mm_shuffle_epi32( v, 0x39 )
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = s1; \
t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#endif
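As a rough scalar model of the two lane helpers introduced above, shuffle_0321 rotates the four 32-bit lanes down by one position (_mm_shuffle_epi32 with imm 0x39) and mask_1000 clears the top lane. The sketch below is illustrative only; the _ref names are not part of the source.
#include <stdint.h>
#include <stdio.h>
typedef struct { uint32_t w[4]; } v128_ref;      // lanes 0..3, low to high
static v128_ref shuffle_0321_ref( v128_ref v )   // r[i] = v[(i+1) & 3]
{
    v128_ref r = { { v.w[1], v.w[2], v.w[3], v.w[0] } };
    return r;
}
static v128_ref mask_1000_ref( v128_ref v )      // zero the top lane
{
    v.w[3] = 0;
    return v;
}
int main(void)
{
    v128_ref v = { { 0x11111111, 0x22222222, 0x33333333, 0x44444444 } };
    v128_ref r = mask_1000_ref( shuffle_0321_ref( v ) );
    printf( "%08x %08x %08x %08x\n", r.w[0], r.w[1], r.w[2], r.w[3] );
    return 0;   // prints: 22222222 33333333 44444444 00000000
}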
#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s10 = _mm_xor_si128(s10, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1)
#define PACK_S0( s0, s1, t1 ) \
s0 = v128_movlane32( s0, 3, s1, 0 )
#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s16 = _mm_xor_si128(s16, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1)
#define UNPACK_S0( s0, s1, t1 ) \
s1 = v128_movlane32( s1, 0, s0, 3 ); \
s0 = mask_1000( s0 )
#define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s22 = _mm_xor_si128(s22, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s7 = _mm_xor_si128(s7, t1)
t1 = shuffle_3303( s0 ); \
s22 = v128_xor(s22, t1);\
t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
s0 = v128_movlane32( s0, 0, t1, 0 ); \
t1 = v128_alignr64( t1, v128_zero, 1 ); \
s8 = v128_xor(s8, t1);\
t1 = shuffle_3303( s24 ); \
s0 = v128_xor(s0, t1);\
t1 = shuffle_3303( s27 ); \
s4 = v128_xor(s4, t1);\
t1 = shuffle_3303( s30 ); \
s7 = v128_xor(s7, t1)
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\
s1 = x;\
s2 = _mm_add_epi8(x, x);\
t2 = _mm_add_epi8(s2, s2);\
t1 = _mm_srli_epi16(x, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/
#define SUBSTITUTE(r0, _t2 )\
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
#define SUBSTITUTE( r0, _t2 ) \
_t2 = v128_shuffle8( r0, _inv_shift_rows ); \
_t2 = v128_aesenclast_nokey( _t2 )
#define SUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = mm128_xor3( t4, t1, t2 ); \
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t4 = mm128_xor3( t4, t2, t1 ); \
t0 = _mm_xor_si128(t0, t3);\
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
/*
#define SUPERMIX(t0, t1, t2, t3, t4)\
PRESUPERMIX(t0, t1, t2, t3, t4);\
POSTSUPERMIX(t0, t1, t2, t3, t4)
*/
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t4 = t1;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t4 = _mm_xor_si128(t4, t1);\
t2 = mm128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t4 = _mm_xor_si128(t4, t2);\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t4 = _mm_xor_si128(t4, t1);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t0 = _mm_xor_si128(t0, t3);\
t4 = _mm_xor_si128(t4, t0);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
t4 = _mm_xor_si128(t4, t0)
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
UNPACK_S0(r3c, r3a, _t3)
t3 = v128_add8( t0, t0 ); \
t4 = v128_add8( t3, t3 ); \
t1 = v128_sr16( t0, 6 ); \
t1 = v128_and( t1, _lsbmask2 ); \
t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
t4 = v128_shuffle8( t2, _supermix1b ); \
t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
t1 = v128_shuffle8( t4, _supermix1c ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t4, _supermix1d ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t2, _supermix1a ); \
t2 = v128_xor3( t2, t3, t0 ); \
t2 = v128_shuffle8( t2, _supermix7a ); \
t4 = v128_xor3( t4, t1, t2 ); \
t2 = v128_shuffle8( t2, _supermix7b ); \
t3 = v128_shuffle8( t3, _supermix2a ); \
t1 = v128_shuffle8( t0, _supermix4a ); \
t0 = v128_shuffle8( t0, _supermix4b ); \
t4 = v128_xor3( t4, t2, t1 ); \
t0 = v128_xor( t0, t3 ); \
t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );
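The byte-add and shuffle sequence above is a branch-free GF(2^8) doubling and quadrupling: the top two bits of each byte index the reduction constants held in _mul2mask and _mul4mask. A scalar reference of the same arithmetic, for illustration only:
#include <stdint.h>
// Multiply a GF(2^8) element (AES polynomial 0x11b) by 2 and by 4.
// The four-entry reduction tables mirror _mul2mask and _mul4mask,
// indexed by the top two bits of the byte.
static inline uint8_t gf256_mul2_ref( uint8_t x )
{
    static const uint8_t red2[4] = { 0x00, 0x00, 0x1b, 0x1b };
    return (uint8_t)( ( x << 1 ) ^ red2[ x >> 6 ] );
}
static inline uint8_t gf256_mul4_ref( uint8_t x )
{
    static const uint8_t red4[4] = { 0x00, 0x1b, 0x36, 0x2d };
    return (uint8_t)( ( x << 2 ) ^ red4[ x >> 6 ] );
}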
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE( r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
_t0 = shuffle_0321( r1c ); \
r2c = v128_xor(r2c, _t0);\
_t0 = mask_1000( _t0 ); \
r2d = v128_xor(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
_t0 = shuffle_0321( r2c ); \
r3c = v128_xor(r3c, _t0);\
_t0 = mask_1000( _t0 ); \
r3d = v128_xor(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE( r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
r4c = _mm_xor_si128(r4c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r4d = _mm_xor_si128(r4d, _t0);\
_t0 = shuffle_0321( r3c ); \
r4c = v128_xor(r4c, _t0);\
_t0 = mask_1000( _t0 ); \
r4d = v128_xor(r4d, _t0);\
UNPACK_S0(r3c, r3a, _t3);\
SUBSTITUTE( r4c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
block[1] = col[(base + a + 1) % s];\
block[2] = col[(base + a + 2) % s];\
block[3] = col[(base + a + 3) % s];\
x = _mm_load_si128((__m128i*)block)
x = v128_load( (v128_t*)block )
#define STORECOLUMN(x, s)\
_mm_store_si128((__m128i*)block, x);\
v128_store((v128_t*)block, x );\
col[(base + 0) % s] = block[0];\
col[(base + 1) % s] = block[1];\
col[(base + 2) % s] = block[2];\
col[(base + 3) % s] = block[3]
void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
unsigned int uBlockCount )
{
__m128i _t0, _t1, _t2, _t3;
v128_t _t0, _t1, _t2, _t3;
switch(ctx->base)
{
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
pmsg += 4;
uBlockCount--;
}
}
void Final512(hashState_fugue *ctx, BitSequence *hashval)
void Final512( hashState_fugue *ctx, uint8_t *hashval )
{
unsigned int block[4] __attribute__ ((aligned (32)));
unsigned int col[36] __attribute__ ((aligned (16)));
unsigned int i, base;
__m128i r0, _t0, _t1, _t2, _t3;
v128_t r0, _t0, _t1, _t2, _t3;
for(i = 0; i < 12; i++)
for( i = 0; i < 12; i++ )
{
_mm_store_si128((__m128i*)block, ctx->state[i]);
v128_store( (v128_t*)block, ctx->state[i] );
col[3 * i + 0] = block[0];
col[3 * i + 1] = block[1];
col[3 * i + 2] = block[2];
}
base = (36 - (12 * ctx->base)) % 36;
base = ( 36 - (12 * ctx->base) ) % 36;
for(i = 0; i < 32; i++)
for( i = 0; i < 32; i++ )
{
// ROR3
base = (base + 33) % 36;
// CMIX
col[(base + 0) % 36] ^= col[(base + 4) % 36];
col[(base + 1) % 36] ^= col[(base + 5) % 36];
col[(base + 2) % 36] ^= col[(base + 6) % 36];
col[(base + 18) % 36] ^= col[(base + 4) % 36];
col[(base + 19) % 36] ^= col[(base + 5) % 36];
col[(base + 20) % 36] ^= col[(base + 6) % 36];
col[ (base + 0) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 1) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 2) % 36 ] ^= col[ (base + 6) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
for(i = 0; i < 13; i++)
for( i = 0; i < 13; i++ )
{
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 28) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR8
base = (base + 28) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
LOADCOLUMN(r0, 36, 1);
_mm_store_si128((__m128i*)hashval, r0);
LOADCOLUMN( r0, 36, 1 );
v128_store( (v128_t*)hashval, r0 );
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
LOADCOLUMN(r0, 36, 9);
_mm_store_si128((__m128i*)hashval + 1, r0);
LOADCOLUMN( r0, 36, 9 );
v128_store( (v128_t*)hashval + 1, r0 );
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
LOADCOLUMN(r0, 36, 18);
_mm_store_si128((__m128i*)hashval + 2, r0);
LOADCOLUMN( r0, 36, 18 );
v128_store( (v128_t*)hashval + 2, r0 );
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
LOADCOLUMN(r0, 36, 27);
_mm_store_si128((__m128i*)hashval + 3, r0);
LOADCOLUMN( r0, 36, 27 );
v128_store( (v128_t*)hashval + 3, r0 );
}
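The ROR3/ROR9/ROR8 steps above never move data; they advance the base index of the 36-column ring by 36 minus the rotation amount. A minimal check of that identity, illustrative only:
#include <stdio.h>
int main(void)
{
    unsigned base = 0;
    base = ( base + 33 ) % 36;   // ROR3:  33 == 36 - 3
    base = ( base + 27 ) % 36;   // ROR9:  27 == 36 - 9
    base = ( base + 28 ) % 36;   // ROR8:  28 == 36 - 8
    printf( "%u\n", base );      // prints 16, i.e. 36 - (3 + 9 + 8)
    return 0;
}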
HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
int fugue512_Init( hashState_fugue *ctx, int nHashSize )
{
int i;
ctx->processed_bits = 0;
@@ -485,20 +424,20 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
ctx->uBlockLength = 4;
for(i = 0; i < 6; i++)
ctx->state[i] = m128_zero;
ctx->state[i] = v128_zero;
ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
ctx->state[6] = casti_v128( _IV512, 0 );
ctx->state[7] = casti_v128( _IV512, 1 );
ctx->state[8] = casti_v128( _IV512, 2 );
ctx->state[9] = casti_v128( _IV512, 3 );
ctx->state[10] = casti_v128( _IV512, 4 );
ctx->state[11] = casti_v128( _IV512, 5 );
return SUCCESS;
return 0;
}
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
if(state->uBufferBytes != 0)
{
// Fill the buffer
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
memcpy( state->buffer + state->uBufferBytes, (void*)data,
state->uBlockLength - state->uBufferBytes );
// Process the buffer
Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
state->uBufferBytes += uByteLength;
}
return SUCCESS;
return 0;
}
HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
int fugue512_Final( hashState_fugue *state, void *hashval )
{
unsigned int i;
BitSequence lengthbuf[8] __attribute__((aligned(64)));
uint8_t lengthbuf[8] __attribute__((aligned(64)));
// Update message bit count
state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
// Finalization
Final512(state, hashval);
return SUCCESS;
return 0;
}
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen )
{
fugue512_Init(hs, 512);
fugue512_Update(hs, data, databitlen*8);
fugue512_Final(hs, hashval);
return SUCCESS;
fugue512_Init( hs, 512 );
fugue512_Update( hs, data, databitlen*8 );
fugue512_Final( hs, hashval );
return 0;
}
#endif // AES

View File

@@ -14,37 +14,31 @@
#ifndef FUGUE_HASH_API_H
#define FUGUE_HASH_API_H
#if defined(__AES__)
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#if !defined(__SSE4_1__)
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif
#include "compat/sha3_common.h"
#include "simd-utils.h"
typedef struct
{
__m128i state[12];
v128_t state[12];
unsigned int base;
unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
DataLength processed_bits;
BitSequence buffer[4];
uint64_t processed_bits;
uint8_t buffer[4];
} hashState_fugue __attribute__ ((aligned (64)));
// These functions are deprecated, use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
int fugue512_Init( hashState_fugue *state, int hashbitlen );
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen );
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
int fugue512_Final( hashState_fugue *state, void *hashval );
#define fugue512_init( state ) \
fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
fugue512_Final
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen);
#endif // AES
#endif // HASH_API_H

View File

@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
casti_m256i( b, 0 ) );
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
casti_m256i( b, 1 ) );
#elif defined(__SSE2__)
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
casti_m128i( b, 0 ) );
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
casti_m128i( b, 1 ) );
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
casti_m128i( b, 2 ) );
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
casti_m128i( b, 3 ) );
#elif defined(__SSE2__) || defined(__ARM_NEON)
casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
casti_v128( b, 0 ) );
casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
casti_v128( b, 1 ) );
casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
casti_v128( b, 2 ) );
casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
casti_v128( b, 3 ) );
#else
const unsigned long long *A=a, *B=b;
unsigned long long *C=c;
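The scalar fallback continues past the hunk shown here; its effect is a plain 512-bit XOR. A portable sketch (not the exact source code) would be:
#include <stdint.h>
// c = a ^ b over 512 bits, processed as eight 64-bit words.
static void AddXor512_ref( const void *a, const void *b, void *c )
{
    const uint64_t *A = (const uint64_t*)a;
    const uint64_t *B = (const uint64_t*)b;
    uint64_t *C = (uint64_t*)c;
    for ( int i = 0; i < 8; i++ )
        C[i] = A[i] ^ B[i];
}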

View File

@@ -60,18 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
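On NEON there is no immediate 32-bit lane shuffle, so the 0xd8 permute is done with a byte-index table lookup instead. In scalar terms gr_shuffle32 simply swaps the two middle 32-bit lanes; a reference sketch (names illustrative only):
#include <stdint.h>
static inline void gr_shuffle32_ref( uint32_t r[4], const uint32_t v[4] )
{
    r[0] = v[0];  r[1] = v[2];  r[2] = v[1];  r[3] = v[3];
}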
#define tos(a) #a
#define tostr(a) tos(a)
@@ -298,17 +297,16 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
*/
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes */\
b0 = v128_xor(b0, b0);\
a0 = v128_aesenclast(a0, b0);\
a1 = v128_aesenclast(a1, b0);\
a2 = v128_aesenclast(a2, b0);\
a3 = v128_aesenclast(a3, b0);\
a4 = v128_aesenclast(a4, b0);\
a5 = v128_aesenclast(a5, b0);\
a6 = v128_aesenclast(a6, b0);\
a7 = v128_aesenclast(a7, b0);\
a0 = v128_aesenclast_nokey( a0 ); \
a1 = v128_aesenclast_nokey( a1 ); \
a2 = v128_aesenclast_nokey( a2 ); \
a3 = v128_aesenclast_nokey( a3 ); \
a4 = v128_aesenclast_nokey( a4 ); \
a5 = v128_aesenclast_nokey( a5 ); \
a6 = v128_aesenclast_nokey( a6 ); \
a7 = v128_aesenclast_nokey( a7 ); \
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
}
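The _nokey change above removes the explicit zeroed register: the old lines built an all-zero round key with b0 ^ b0 and passed it to every aesenclast. On x86 a keyless last round could be expressed as in this sketch (the wrapper name is illustrative; the project's own helper lives in its simd-utils layer):
#include <immintrin.h>   // build with -maes
// Last AES round (ShiftRows + SubBytes) with an all-zero round key.
static inline __m128i aesenclast_nokey_ref( __m128i x )
{
    return _mm_aesenclast_si128( x, _mm_setzero_si128() );
}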
#define ROUNDS_P(){\
@@ -326,10 +324,9 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
/* SubBytes + MixBytes */\
/* SubBytes + MixBytes */\
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
\
/* AddRoundConstant P1024 */\
xmm0 = v128_xor( xmm0, \
casti_v128( round_const_p, round_counter+1 ) ); \
@@ -431,7 +428,6 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
t1 = v128_unpackhi16(t1, i3);\
i2 = v128_unpacklo16(i2, i3);\
i0 = v128_unpacklo16(i0, i1);\
\
/* shuffle with immediate */\
t0 = gr_shuffle32( t0 ); \
t1 = gr_shuffle32( t1 ); \
@@ -441,7 +437,6 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
i2 = gr_shuffle32( i2 ); \
i4 = gr_shuffle32( i4 ); \
i6 = gr_shuffle32( i6 ); \
\
/* continue with unpack */\
t4 = i0;\
i0 = v128_unpacklo32(i0, i2);\
@@ -548,7 +543,8 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
/* transpose done */\
}/**/
#if 0
// not used
void INIT( v128_t* chaining )
{
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -577,6 +573,7 @@ void INIT( v128_t* chaining )
chaining[6] = xmm14;
chaining[7] = xmm15;
}
#endif
void TF1024( v128_t* chaining, const v128_t* message )
{

View File

@@ -1,3 +1,6 @@
#if !defined GROESTL256_INTR_AES_H__
#define GROESTL256_INTR_AES_H__
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
#define tos(a) #a
#define tostr(a) tos(a)
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
chaining[3] = xmm11;
}
#endif

View File

@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
const int hash_offset = SIZE512 - hashlen_m128i;
uint64_t blocks = len / SIZE512;
v128_t* in = (v128_t*)input;
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output

View File

@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
int groestl512( hashState_groestl*, void*, const void*, uint64_t );
#define groestl512_full groestl512
#define groestl512_ctx groestl512
#endif /* __hash_h */

View File

@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
a1 = _mm256_xor_si256( a1, b1 );\
a2 = _mm256_xor_si256( a2, b1 );\

View File

@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );
myriad_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -35,8 +35,6 @@
#include "sph_groestl.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif // !AES
#endif

View File

@@ -42,7 +42,6 @@ extern "C"{
#include <stddef.h>
#include "compat/sph_types.h"
#if !defined(__AES__)
/**
* Output size (in bits) for Groestl-224.
*/
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
}
#endif
#endif // !AES
#endif

View File

@@ -35,7 +35,7 @@
#include <stdio.h>
#include "hamsi-hash-4way.h"
static const uint32_t HAMSI_IV512[] =
static const uint32_t HAMSI_IV512[] __attribute__ ((aligned (32))) =
{
0x73746565, 0x6c706172, 0x6b204172, 0x656e6265,
0x72672031, 0x302c2062, 0x75732032, 0x3434362c,
@@ -43,7 +43,8 @@ static const uint32_t HAMSI_IV512[] =
0x65766572, 0x6c65652c, 0x2042656c, 0x6769756d
};
static const uint32_t alpha_n[] = {
static const uint32_t alpha_n[] __attribute__ ((aligned (32))) =
{
0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa,
0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00,
0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0,
@@ -54,7 +55,8 @@ static const uint32_t alpha_n[] = {
0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0
};
static const uint32_t alpha_f[] = {
static const uint32_t alpha_f[] __attribute__ ((aligned (32))) =
{
0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0,
0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9,
0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c,
@@ -69,7 +71,8 @@ static const uint32_t alpha_f[] = {
/* Note: this table lists bits within each byte from least
significant to most significant. */
static const uint32_t T512[64][16] = {
static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
{
{ 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000,
0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a,
0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000,
@@ -1936,7 +1939,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
#if defined(__SSE4_2__) || defined(__ARM_NEON)
#define DECL_STATE_2x64 \
v128_t c0, c1, c2, c3, c4, c5, c6, c7; \
v128u64_t c0, c1, c2, c3, c4, c5, c6, c7; \
#define READ_STATE_2x64(sc) \
c0 = sc->h[0]; \
@@ -1960,13 +1963,13 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
#define INPUT_2x64 \
{ \
v128_t db = *buf; \
const v128_t zero = v128_zero; \
v128u64_t db = *buf; \
const v128u64_t zero = v128_64( 0ull ); \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int i = 63; i >= 0; i-- ) \
{ \
v128_t dm = v128_cmpgt64( zero, v128_sl64( db, i ) ); \
v128u64_t dm = v128_cmpgt64( zero, v128_sl64( db, i ) ); \
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
@@ -1982,7 +1985,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX_2x64( a, b, c, d ) \
{ \
v128_t tb, td; \
v128u64_t tb, td; \
td = v128_xorand( d, a, c ); \
tb = v128_xoror( b, d, a ); \
c = v128_xor3( c, td, b ); \
@@ -2010,7 +2013,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
#define ROUND_2x64( alpha ) \
{ \
v128_t t0, t1, t2, t3, t4, t5; \
v128u64_t t0, t1, t2, t3, t4, t5; \
const v128_t mask = v128_64( 0x00000000ffffffff ); \
s0 = v128_xor( s0, alpha[ 0] ); \
s1 = v128_xor( s1, alpha[ 1] ); \
@@ -2107,7 +2110,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
#define P_2x64 \
{ \
v128_t alpha[16]; \
v128u64_t alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = v128_64( ( (uint64_t*)alpha_n )[i] ); \
@@ -2126,7 +2129,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
#define PF_2x64 \
{ \
v128_t alpha[16]; \
v128u64_t alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = v128_64( ( (uint64_t*)alpha_f )[i] ); \
@@ -2193,7 +2196,7 @@ void hamsi64_big( hamsi_2x64_context *sc, v128_t *buf, size_t num )
void hamsi64_big_final( hamsi_2x64_context *sc, v128_t *buf )
{
v128_t m0, m1, m2, m3, m4, m5, m6, m7;
v128u64_t m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_2x64;
READ_STATE_2x64( sc );
INPUT_2x64;
@@ -2231,15 +2234,15 @@ void hamsi512_2x64_update( hamsi_2x64_context *sc, const void *data,
void hamsi512_2x64_close( hamsi_2x64_context *sc, void *dst )
{
v128_t pad[1];
v128u32_t pad;
uint32_t ch, cl;
ch = bswap_32( sc->count_high );
cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = v128_64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
pad = v128_64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
sc->buf[0] = v128_64( 0x80 );
hamsi64_big( sc, sc->buf, 1 );
hamsi64_big_final( sc, pad );
hamsi64_big_final( sc, &pad );
v128_block_bswap32( (v128_t*)dst, sc->h );
}
@@ -2260,4 +2263,4 @@ void hamsi512_2x64( void *dst, const void *data, size_t len )
hamsi512_2x64_close( &sc, dst );
}
#endif // SSE4.1 or NEON
#endif // SSE4.2 or NEON
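For reference, the INPUT_2x64 expansion earlier in this file amounts to a per-bit table lookup: each bit of the 64-bit message word selects one row of T512, viewed as 64 rows of eight 64-bit words, and XORs it into the state. A scalar sketch, assuming only that row layout:
#include <stdint.h>
static void hamsi_input_ref( const uint64_t T512x64[64][8], uint64_t db,
                             uint64_t m[8] )
{
    for ( int i = 0; i < 8; i++ )
        m[i] = 0;
    for ( int b = 0; b < 64; b++ )
        if ( ( db >> b ) & 1 )
            for ( int i = 0; i < 8; i++ )
                m[i] ^= T512x64[b][i];   // row b is enabled by bit b
}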

View File

@@ -38,7 +38,7 @@
#include <stddef.h>
#include "simd-utils.h"
// SSE2 or NEON Hamsi-512 2x64
#if defined(__SSE4_2__) || defined(__ARM_NEON)
typedef struct
{
@@ -57,6 +57,8 @@ void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
size_t len );
void hamsi512_2x64( void *dst, const void *data, size_t len );
#endif
#if defined (__AVX2__)
// Hamsi-512 4x64

View File

@@ -1,183 +0,0 @@
#include <stdint.h>
#include "miner.h"
#if defined(__AES__)
#include <x86intrin.h>
#include "wolf-aes.h"
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
__m128i tmp4;
*tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
tmp4 = _mm_slli_si128(*tmp1, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
*tmp1 = _mm_xor_si128(*tmp1, *tmp2);
}
static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
{
__m128i tmp2, tmp4;
tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
tmp4 = _mm_slli_si128(*tmp3, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
*tmp3 = _mm_xor_si128(*tmp3, tmp2);
}
// Special thanks to Intel for helping me
// with ExpandAESKey256() and its subroutines
void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
{
__m128i tmp1, tmp2, tmp3;
tmp1 = keys[0] = KeyBuf[0];
tmp3 = keys[1] = KeyBuf[1];
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[2] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[3] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[4] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[5] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[6] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[7] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[8] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[9] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[10] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[11] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[12] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[13] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[14] = tmp1;
}
#if defined(__SSE4_2__)
//#ifdef __AVX__
#define AESENC(i,j) \
State[j] = _mm_aesenc_si128(State[j], ExpandedKey[j][i]);
#define AESENC_N(i) \
AESENC(i,0) \
AESENC(i,1) \
AESENC(i,2) \
AESENC(i,3) \
AESENC(i,4) \
AESENC(i,5) \
AESENC(i,6) \
AESENC(i,7) \
static inline void AES256Core(__m128i* State, __m128i ExpandedKey[][16])
{
const uint32_t N = AES_PARALLEL_N;
for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128(State[j], ExpandedKey[j][0]);
}
AESENC_N(1)
AESENC_N(2)
AESENC_N(3)
AESENC_N(4)
AESENC_N(5)
AESENC_N(6)
AESENC_N(7)
AESENC_N(8)
AESENC_N(9)
AESENC_N(10)
AESENC_N(11)
AESENC_N(12)
AESENC_N(13)
for(int j=0; j<N; ++j) {
State[j] = _mm_aesenclast_si128(State[j], ExpandedKey[j][14]);
}
}
void AES256CBC(__m128i** data, const __m128i** next, __m128i ExpandedKey[][16], __m128i* IV)
{
const uint32_t N = AES_PARALLEL_N;
__m128i State[N];
for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128( _mm_xor_si128(data[j][0], next[j][0]), IV[j]);
}
AES256Core(State, ExpandedKey);
for(int j=0; j<N; ++j) {
data[j][0] = State[j];
}
for(int i = 1; i < BLOCK_COUNT; ++i) {
for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128( _mm_xor_si128(data[j][i], next[j][i]), data[j][i - 1]);
}
AES256Core(State, ExpandedKey);
for(int j=0; j<N; ++j) {
data[j][i] = State[j];
}
}
}
#else // NO AVX
static inline __m128i AES256Core(__m128i State, const __m128i *ExpandedKey)
{
State = _mm_xor_si128(State, ExpandedKey[0]);
for(int i = 1; i < 14; ++i) State = _mm_aesenc_si128(State, ExpandedKey[i]);
return(_mm_aesenclast_si128(State, ExpandedKey[14]));
}
void AES256CBC(__m128i *Ciphertext, const __m128i *Plaintext, const __m128i *ExpandedKey, __m128i IV, uint32_t BlockCount)
{
__m128i State = _mm_xor_si128(Plaintext[0], IV);
State = AES256Core(State, ExpandedKey);
Ciphertext[0] = State;
for(int i = 1; i < BlockCount; ++i)
{
State = _mm_xor_si128(Plaintext[i], Ciphertext[i - 1]);
State = AES256Core(State, ExpandedKey);
Ciphertext[i] = State;
}
}
#endif
#endif
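For context, the two removed entry points above were used together: ExpandAESKey256 derives the 15 AES-256 round keys and AES256CBC chains them over 16-byte blocks. A hypothetical caller for the single-stream path (names here are illustrative, prototypes copied from the removed file) would look like:
#include <immintrin.h>
#include <stdint.h>
void ExpandAESKey256( __m128i *keys, const __m128i *KeyBuf );
void AES256CBC( __m128i *Ciphertext, const __m128i *Plaintext,
                const __m128i *ExpandedKey, __m128i IV, uint32_t BlockCount );
// key[2] holds the 256-bit key, iv the 128-bit CBC IV.
static void encrypt_slice_ref( __m128i *out, const __m128i *in,
                               uint32_t nblocks, const __m128i key[2],
                               __m128i iv )
{
    __m128i expkey[16];                 // 15 round keys are filled
    ExpandAESKey256( expkey, key );
    AES256CBC( out, in, expkey, iv, nblocks );
}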

View File

@@ -1,75 +0,0 @@
#ifndef HODL_BYTESWAP_H
#define HODL_BYTESWAP_H 1
#define __bswap_constant_16(x) \
((unsigned short int) ((((x) >> 8) & 0xff) | (((x) & 0xff) << 8)))
static __inline unsigned short int
__bswap_16 (unsigned short int __bsx)
{
return __bswap_constant_16 (__bsx);
}
// LE
# define htobe16(x) __bswap_16 (x)
# define htole16(x) (x)
# define be16toh(x) __bswap_16 (x)
# define le16toh(x) (x)
// BE
//# define htole16(x) __bswap_16 (x)
//# define htobe16(x) (x)
//# define le16toh(x) __bswap_16 (x)
//# define be16toh(x) (x)
#define __bswap_constant_32(x) \
((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \
(((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24))
static __inline unsigned int
__bswap_32 (unsigned int __bsx)
{
return __builtin_bswap32 (__bsx);
}
// LE
# define htobe32(x) __bswap_32 (x)
# define htole32(x) (x)
# define be32toh(x) __bswap_32 (x)
# define le32toh(x) (x)
// BE
//# define htole32(x) __bswap_32 (x)
//# define htobe32(x) (x)
//# define le32toh(x) __bswap_32 (x)
//# define be32toh(x) (x)
# define __bswap_constant_64(x) \
((((x) & 0xff00000000000000ull) >> 56) \
| (((x) & 0x00ff000000000000ull) >> 40) \
| (((x) & 0x0000ff0000000000ull) >> 24) \
| (((x) & 0x000000ff00000000ull) >> 8) \
| (((x) & 0x00000000ff000000ull) << 8) \
| (((x) & 0x0000000000ff0000ull) << 24) \
| (((x) & 0x000000000000ff00ull) << 40) \
| (((x) & 0x00000000000000ffull) << 56))
static __inline uint64_t
__bswap_64 (uint64_t __bsx)
{
return __bswap_constant_64 (__bsx);
}
// LE
# define htobe64(x) __bswap_64 (x)
# define htole64(x) (x)
# define be64toh(x) __bswap_64 (x)
# define le64toh(x) (x)
// BE
//# define htole64(x) __bswap_64 (x)
//# define htobe64(x) (x)
//# define le64toh(x) __bswap_64 (x)
//# define be64toh(x) (x)
#endif

View File

@@ -1,185 +0,0 @@
#include <memory.h>
//#include <mm_malloc.h>
#include <stdlib.h>
#include "hodl-gate.h"
#include "hodl-wolf.h"
#define HODL_NSTARTLOC_INDEX 20
#define HODL_NFINALCALC_INDEX 21
static struct work hodl_work;
pthread_barrier_t hodl_barrier;
// All references to this buffer are local to this file, so no args
// need to be passed.
unsigned char *hodl_scratchbuf = NULL;
void hodl_le_build_stratum_request( char* req, struct work* work,
struct stratum_ctx *sctx )
{
uint32_t ntime, nonce, nstartloc, nfinalcalc;
char ntimestr[9], noncestr[9], nstartlocstr[9], nfinalcalcstr[9];
unsigned char *xnonce2str;
le32enc( &ntime, work->data[ algo_gate.ntime_index ] );
le32enc( &nonce, work->data[ algo_gate.nonce_index ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len );
le32enc( &nstartloc, work->data[ HODL_NSTARTLOC_INDEX ] );
le32enc( &nfinalcalc, work->data[ HODL_NFINALCALC_INDEX ] );
bin2hex( nstartlocstr, (char*)(&nstartloc), sizeof(uint32_t) );
bin2hex( nfinalcalcstr, (char*)(&nfinalcalc), sizeof(uint32_t) );
sprintf( req, "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr,
nstartlocstr, nfinalcalcstr );
free( xnonce2str );
}
char* hodl_malloc_txs_request( struct work *work )
{
char* req;
json_t *val;
char data_str[2 * sizeof(work->data) + 1];
int i;
for ( i = 0; i < ARRAY_SIZE(work->data); i++ )
be32enc( work->data + i, work->data[i] );
bin2hex( data_str, (unsigned char *)work->data, 88 );
if ( work->workid )
{
char *params;
val = json_object();
json_object_set_new( val, "workid", json_string( work->workid ) );
params = json_dumps( val, 0 );
json_decref( val );
req = malloc( 128 + 2*88 + strlen( work->txs ) + strlen( params ) );
sprintf( req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":1}\r\n",
data_str, work->txs, params);
free( params );
}
else
{
req = malloc( 128 + 2*88 + strlen(work->txs));
sprintf( req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":1}\r\n",
data_str, work->txs);
}
return req;
}
void hodl_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_tree,
uint32_t ntime, uint32_t nbits )
{
int i;
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = version;
if ( have_stratum )
for ( i = 0; i < 8; i++ )
g_work->data[ 1+i ] = le32dec( prevhash + i );
else
for (i = 0; i < 8; i++)
g_work->data[ 8-i ] = le32dec( prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[ 9+i ] = be32dec( merkle_tree + i );
g_work->data[ algo_gate.ntime_index ] = ntime;
g_work->data[ algo_gate.nbits_index ] = nbits;
g_work->data[22] = 0x80000000;
g_work->data[31] = 0x00000280;
}
// called only by thread 0, saves a backup of g_work
void hodl_get_new_work( struct work* work, struct work* g_work)
{
// pthread_rwlock_rdlock( &g_work_lock );
work_free( &hodl_work );
work_copy( &hodl_work, g_work );
hodl_work.data[ algo_gate.nonce_index ] = ( clock() + rand() ) % 9999;
// pthread_rwlock_unlock( &g_work_lock );
}
json_t *hodl_longpoll_rpc_call( CURL *curl, int *err, char* lp_url )
{
json_t *val;
char *req = NULL;
if ( have_gbt )
{
req = malloc( strlen( gbt_lp_req ) + strlen( lp_id ) + 1 );
sprintf( req, gbt_lp_req, lp_id );
}
val = json_rpc_call( curl, lp_url, rpc_userpass,
req ? req : getwork_req, err, JSON_RPC_LONGPOLL );
free( req );
return val;
}
// called by every thread, copies the backup to each thread's work.
void hodl_resync_threads( int thr_id, struct work* work )
{
int nonce_index = algo_gate.nonce_index;
pthread_barrier_wait( &hodl_barrier );
if ( memcmp( work->data, hodl_work.data, algo_gate.work_cmp_size ) )
{
work_free( work );
work_copy( work, &hodl_work );
}
work->data[ nonce_index ] = swab32( hodl_work.data[ nonce_index ] );
work_restart[thr_id].restart = 0;
}
bool hodl_do_this_thread( int thr_id )
{
return ( thr_id == 0 );
}
int hodl_scanhash( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
#if defined(__AES__)
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, mythr->id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( work, max_nonce, hashes_done, mythr );
#endif
return false;
}
bool register_hodl_algo( algo_gate_t* gate )
{
#if !defined(__AES__)
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif
if ( GARBAGE_SIZE % opt_n_threads )
applog( LOG_WARNING,"WARNING: Thread count must be a power of 2. Miner may crash or produce invalid hash!" );
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = SSE42_OPT | AES_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;
gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
gate->malloc_txs_request = (void*)&hodl_malloc_txs_request;
gate->build_block_header = (void*)&hodl_build_block_header;
gate->resync_threads = (void*)&hodl_resync_threads;
gate->do_this_thread = (void*)&hodl_do_this_thread;
gate->work_cmp_size = 76;
hodl_scratchbuf = (unsigned char*)mm_malloc( 1 << 30, 64 );
allow_getwork = false;
opt_target_factor = 8388608.0;
return ( hodl_scratchbuf != NULL );
}

View File

@@ -1,6 +0,0 @@
#include "algo-gate-api.h"
extern unsigned char *hodl_scratchbuf;
bool register_hodl_algo ( algo_gate_t* gate );

View File

@@ -1,225 +0,0 @@
#include <string.h>
#include <openssl/evp.h>
#include <openssl/sha.h>
#include "simd-utils.h"
#include "sha512-avx.h"
#include "wolf-aes.h"
#include "hodl-gate.h"
#include "hodl-wolf.h"
#include "miner.h"
#include "algo/sha/sha256d.h"
#if defined(__AES__)
void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
void *MidHash )
{
const int Chunk = TOTAL_CHUNKS / ThreadCount;
const uint32_t StartChunk = ThreadID * Chunk;
const uint32_t EndChunk = StartChunk + Chunk;
#if defined(__SSE4_2__)
//#ifdef __AVX__
uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
uint64_t* desination[ SHA512_PARALLEL_N ];
for ( int i=0; i < SHA512_PARALLEL_N; ++i )
{
TempBufs[i] = (uint64_t*)malloc( 32 );
memcpy( TempBufs[i], MidHash, 32 );
}
for ( uint32_t i = StartChunk; i < EndChunk; i += SHA512_PARALLEL_N )
{
for ( int j = 0; j < SHA512_PARALLEL_N; ++j )
{
( (uint32_t*)TempBufs[j] )[0] = i + j;
desination[j] = (uint64_t*)( (uint8_t *)Garbage + ( (i+j)
* GARBAGE_CHUNK_SIZE ) );
}
sha512Compute32b_parallel( TempBufs, desination );
}
for ( int i = 0; i < SHA512_PARALLEL_N; ++i )
free( TempBufs[i] );
#else
uint32_t TempBuf[8];
memcpy( TempBuf, MidHash, 32 );
for ( uint32_t i = StartChunk; i < EndChunk; ++i )
{
TempBuf[0] = i;
SHA512( ( uint8_t *)TempBuf, 32,
( (uint8_t *)Garbage ) + ( i * GARBAGE_CHUNK_SIZE ) );
}
#endif
}
/*
void Rev256(uint32_t *Dest, const uint32_t *Src)
{
for(int i = 0; i < 8; ++i) Dest[i] = swab32(Src[i]);
}
*/
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
#if defined(__SSE4_2__)
//#ifdef __AVX__
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int threadNumber = mythr->id;
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
CacheEntry Cache[AES_PARALLEL_N] __attribute__ ((aligned (64)));
__m128i* data[AES_PARALLEL_N];
const __m128i* next[AES_PARALLEL_N];
uint32_t CollisionCount = 0;
for ( int n=0; n<AES_PARALLEL_N; ++n )
{
data[n] = Cache[n].dqwords;
}
// Search for pattern in pseudorandom data
int searchNumber = COMPARE_SIZE / opt_n_threads;
int startLoc = threadNumber * searchNumber;
for ( int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k += AES_PARALLEL_N )
{
// copy data to first l2 cache
for ( int n=0; n<AES_PARALLEL_N; ++n )
{
memcpy(Cache[n].dwords, Garbage + k + n, GARBAGE_SLICE_SIZE);
}
for(int j = 0; j < AES_ITERATIONS; ++j)
{
__m128i ExpKey[AES_PARALLEL_N][16];
__m128i ivs[AES_PARALLEL_N];
// use last 4 bytes of first cache as next location
for(int n=0; n<AES_PARALLEL_N; ++n) {
uint32_t nextLocation = Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 1] & (COMPARE_SIZE - 1); //% COMPARE_SIZE;
next[n] = Garbage[nextLocation].dqwords;
__m128i last[2];
last[0] = _mm_xor_si128(Cache[n].dqwords[254], next[n][254]);
last[1] = _mm_xor_si128(Cache[n].dqwords[255], next[n][255]);
// Key is last 32b of Cache
// IV is last 16b of Cache
ExpandAESKey256(ExpKey[n], last);
ivs[n] = last[1];
}
AES256CBC(data, next, ExpKey, ivs);
}
for(int n=0; n<AES_PARALLEL_N; ++n)
if((Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 1] & (COMPARE_SIZE - 1)) < 1000)
{
uint32_t BlockHdr[22], FinalPoW[8];
swab32_array( BlockHdr, pdata, 20 );
BlockHdr[20] = k + n;
BlockHdr[21] = Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 2];
sha256d( (uint8_t *)FinalPoW, (uint8_t *)BlockHdr, 88 );
CollisionCount++;
if( FinalPoW[7] <= ptarget[7] )
{
pdata[20] = swab32( BlockHdr[20] );
pdata[21] = swab32( BlockHdr[21] );
*hashes_done = CollisionCount;
submit_solution( work, FinalPoW, mythr );
return(0);
}
}
}
*hashes_done = CollisionCount;
return(0);
#else // no AVX
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t BlockHdr[22], FinalPoW[8];
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
CacheEntry Cache;
uint32_t CollisionCount = 0;
int threadNumber = mythr->id;
swab32_array( BlockHdr, pdata, 20 );
// Search for pattern in pseudorandom data
int searchNumber = COMPARE_SIZE / opt_n_threads;
int startLoc = threadNumber * searchNumber;
if ( opt_debug )
applog( LOG_DEBUG,"Hash target= %08lx", ptarget[7] );
for(int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k++)
{
// copy data to first l2 cache
memcpy(Cache.dwords, Garbage + k, GARBAGE_SLICE_SIZE);
for(int j = 0; j < AES_ITERATIONS; j++)
{
CacheEntry TmpXOR;
__m128i ExpKey[16];
// use last 4 bytes of first cache as next location
uint32_t nextLocation = Cache.dwords[(GARBAGE_SLICE_SIZE >> 2)
- 1] & (COMPARE_SIZE - 1); //% COMPARE_SIZE;
// Copy data from indicated location to second l2 cache
memcpy(&TmpXOR, Garbage + nextLocation, GARBAGE_SLICE_SIZE);
// XOR location data into second cache
for( int i = 0; i < (GARBAGE_SLICE_SIZE >> 4); ++i )
TmpXOR.dqwords[i] = _mm_xor_si128( Cache.dqwords[i],
TmpXOR.dqwords[i] );
// Key is last 32b of TmpXOR
// IV is last 16b of TmpXOR
ExpandAESKey256( ExpKey, TmpXOR.dqwords +
(GARBAGE_SLICE_SIZE / sizeof(__m128i)) - 2 );
AES256CBC( Cache.dqwords, TmpXOR.dqwords, ExpKey,
TmpXOR.dqwords[ (GARBAGE_SLICE_SIZE / sizeof(__m128i))
- 1 ], 256 ); }
// use last X bits as solution
if( ( Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 1 ]
& (COMPARE_SIZE - 1) ) < 1000 )
{
BlockHdr[20] = k;
BlockHdr[21] = Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 2 ];
sha256d( (uint8_t *)FinalPoW, (uint8_t *)BlockHdr, 88 );
CollisionCount++;
if( FinalPoW[7] <= ptarget[7] )
{
pdata[20] = swab32( BlockHdr[20] );
pdata[21] = swab32( BlockHdr[21] );
*hashes_done = CollisionCount;
submit_solution( work, FinalPoW, mythr );
return(0);
}
}
}
*hashes_done = CollisionCount;
return(0);
#endif // AVX else
}
void GenRandomGarbage(CacheEntry *Garbage, uint32_t *pdata, int thr_id)
{
uint32_t BlockHdr[20], MidHash[8];
swab32_array( BlockHdr, pdata, 20 );
sha256d((uint8_t *)MidHash, (uint8_t *)BlockHdr, 80);
GenerateGarbageCore(Garbage, thr_id, opt_n_threads, MidHash);
}
#endif // AES

View File

@@ -1,27 +0,0 @@
#ifndef __HODL_H
#define __HODL_H
#include <stdint.h>
#include "simd-utils.h"
#include "miner.h"
#define AES_ITERATIONS 15
#define GARBAGE_SIZE (1 << 30)
#define GARBAGE_CHUNK_SIZE (1 << 6)
#define GARBAGE_SLICE_SIZE (1 << 12)
#define TOTAL_CHUNKS (1 << 24) // GARBAGE_SIZE / GARBAGE_CHUNK_SIZE
#define COMPARE_SIZE (1 << 18) // GARBAGE_SIZE / GARBAGE_SLICE_SIZE
typedef union _CacheEntry
{
uint32_t dwords[GARBAGE_SLICE_SIZE >> 2] __attribute__((aligned(16)));
v128_t dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16)));
} CacheEntry;
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void GenRandomGarbage( CacheEntry *Garbage, uint32_t *pdata, int thr_id);
#endif // __HODL_H
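The geometry constants above are mutually consistent, as the inline comments note; a trivial check, illustrative only:
#include <assert.h>
int main(void)
{
    assert( (1u << 30) / (1u << 6)  == (1u << 24) );   // chunks per buffer
    assert( (1u << 30) / (1u << 12) == (1u << 18) );   // slices per buffer
    return 0;
}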

View File

@@ -1,208 +0,0 @@
.TH MINERD 1 "March 2016" "cpuminer 2.4.3"
.SH NAME
hodlminer \- CPU miner for Hodlcoin
.SH SYNOPSIS
.B hodlminer
[\fIOPTION\fR]...
.SH DESCRIPTION
.B hodlminer
is a multi-threaded CPU miner for Hodlcoin.
It supports the getwork and getblocktemplate (BIP 22) methods,
as well as the Stratum mining protocol.
.PP
In its normal mode of operation, \fBhodlminer\fR connects to a mining server
(specified with the \fB\-o\fR option), receives work from it and starts hashing.
As soon as a solution is found, it is submitted to the same mining server,
which can accept or reject it.
When using getwork or getblocktemplate,
\fBhodlminer\fR can take advantage of long polling, if the server supports it;
in any case, fresh work is fetched as needed.
When using the Stratum protocol this is not possible,
and the server is responsible for sending fresh work at least every minute;
if it fails to do so,
\fBhodlminer\fR may drop the connection and try to reconnect.
.PP
By default, \fBhodlminer\fR writes all its messages to standard error.
On systems that have a syslog, the \fB\-\-syslog\fR option can be used
to write to it instead.
.PP
On start, the nice value of all miner threads is set to 19.
On Linux, the scheduling policy is also changed to SCHED_IDLE,
or to SCHED_BATCH if that fails.
On multiprocessor systems, \fBhodlminer\fR
automatically sets the CPU affinity of miner threads
if the number of threads is a multiple of the number of processors.
.SH EXAMPLES
To connect to the Hodlcoin mining pool that provides a Stratum server
at hodl.blockquarry.com on port 8332, authenticating as worker "user.worker" with password "x":
.PP
.nf
.RS
hodlminer \-o stratum+tcp://hodl.blockquarry.com:8332 \-u user.worker \-p x \-q
.RE
.fi
.PP
To mine to a local Hodlcoin instance running on port 18332,
authenticating with username "rpcuser" and password "rpcpass":
.PP
.nf
.RS
hodlminer \-a hodl \-o http://localhost:18332 \-O rpcuser:rpcpass \\
\-\-coinbase\-addr=mpXwg4jMtRhuSpVq4xS3HFHmCmWp9NyGKt
.RE
.fi
.PP
.SH OPTIONS
.TP
\fB\-a\fR, \fB\-\-algo\fR=\fIALGORITHM\fR
Set the hashing algorithm to use.
Default is hodl.
Possible values are:
.RS 11
.TP 10
.B hodl
.TP
\fB\-\-benchmark\fR
Run in offline benchmark mode.
.TP
\fB\-B\fR, \fB\-\-background\fR
Run in the background as a daemon.
.TP
\fB\-\-cert\fR=\fIFILE\fR
Set an SSL certificate to use with the mining server.
Only supported when using the HTTPS protocol.
.TP
\fB\-\-coinbase\-addr\fR=\fIADDRESS\fR
Set a payout address for solo mining.
This is only used in getblocktemplate mode,
and only if the server does not provide a coinbase transaction.
.TP
\fB\-\-coinbase\-sig\fR=\fITEXT\fR
Set a string to be included in the coinbase (if allowed by the server).
This is only used in getblocktemplate mode.
.TP
\fB\-c\fR, \fB\-\-config\fR=\fIFILE\fR
Load options from a configuration file.
\fIFILE\fR must contain a JSON object
mapping long options to their arguments (as strings),
or to \fBtrue\fR if no argument is required.
Sample configuration file:
.nf
{
"url": "stratum+tcp://hodl.blockquarry.com:8332",
"userpass": "foo:bar",
"retry-pause": "10",
"quiet": true
}
.fi
.TP
\fB\-D\fR, \fB\-\-debug\fR
Enable debug output.
.TP
\fB\-h\fR, \fB\-\-help\fR
Print a help message and exit.
.TP
\fB\-\-no\-gbt\fR
Do not use the getblocktemplate RPC method.
.TP
\fB\-\-no\-getwork\fR
Do not use the getwork RPC method.
.TP
\fB\-\-no\-longpoll\fR
Do not use long polling.
.TP
\fB\-\-no\-redirect\fR
Ignore requests from the server to switch to a different URL.
.TP
\fB\-\-no\-stratum\fR
Do not switch to Stratum, even if the server advertises support for it.
.TP
\fB\-o\fR, \fB\-\-url\fR=[\fISCHEME\fR://][\fIUSERNAME\fR[:\fIPASSWORD\fR]@]\fIHOST\fR:\fIPORT\fR[/\fIPATH\fR]
Set the URL of the mining server to connect to.
Supported schemes are \fBhttp\fR, \fBhttps\fR, \fBstratum+tcp\fR
and \fBstratum+tcps\fR.
If no scheme is specified, http is assumed.
Specifying a \fIPATH\fR is only supported for HTTP and HTTPS.
Specifying credentials has the same effect as using the \fB\-O\fR option.
By default, on HTTP and HTTPS,
the miner tries to use the getblocktemplate RPC method,
and falls back to using getwork if getblocktemplate is unavailable.
This behavior can be modified by using the \fB\-\-no\-gbt\fR
and \fB\-\-no\-getwork\fR options.
.TP
\fB\-O\fR, \fB\-\-userpass\fR=\fIUSERNAME\fR:\fIPASSWORD\fR
Set the credentials to use for connecting to the mining server.
Any value previously set with \fB\-u\fR or \fB\-p\fR is discarded.
.TP
\fB\-p\fR, \fB\-\-pass\fR=\fIPASSWORD\fR
Set the password to use for connecting to the mining server.
Any password previously set with \fB\-O\fR is discarded.
.TP
\fB\-P\fR, \fB\-\-protocol\-dump\fR
Enable output of all protocol-level activities.
.TP
\fB\-q\fR, \fB\-\-quiet\fR
Disable per-thread hashmeter output.
.TP
\fB\-r\fR, \fB\-\-retries\fR=\fIN\fR
Set the maximum number of times to retry if a network call fails.
If not specified, the miner will retry indefinitely.
.TP
\fB\-R\fR, \fB\-\-retry\-pause\fR=\fISECONDS\fR
Set how long to wait between retries. Default is 30 seconds.
.TP
\fB\-s\fR, \fB\-\-scantime\fR=\fISECONDS\fR
Set an upper bound on the time the miner can go without fetching fresh work.
This setting has no effect in Stratum mode or when long polling is activated.
Default is 5 seconds.
.TP
\fB\-S\fR, \fB\-\-syslog\fR
Log to the syslog facility instead of standard error.
.TP
\fB\-t\fR, \fB\-\-threads\fR=\fIN\fR
Set the number of miner threads.
If not specified, the miner will try to detect the number of available processors
and use that.
.TP
\fB\-T\fR, \fB\-\-timeout\fR=\fISECONDS\fR
Set a timeout for long polling.
.TP
\fB\-u\fR, \fB\-\-user\fR=\fIUSERNAME\fR
Set the username to use for connecting to the mining server.
Any username previously set with \fB\-O\fR is discarded.
.TP
\fB\-V\fR, \fB\-\-version\fR
Display version information and quit.
.TP
\fB\-x\fR, \fB\-\-proxy\fR=[\fISCHEME\fR://][\fIUSERNAME\fR:\fIPASSWORD\fR@]\fIHOST\fR:\fIPORT\fR
Connect to the mining server through a proxy.
Supported schemes are: \fBhttp\fR, \fBsocks4\fR, \fBsocks5\fR.
Since libcurl 7.18.0, the following are also supported:
\fBsocks4a\fR, \fBsocks5h\fR (SOCKS5 with remote name resolving).
If no scheme is specified, the proxy is treated as an HTTP proxy.
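.PP
For example, to route pool traffic through a local SOCKS5 proxy (the proxy
address 127.0.0.1:9050 below is only an illustrative assumption; substitute
your own proxy and pool details):
.nf
.RS
.\" illustrative proxy address, not from the original documentation
hodlminer \-x socks5://127.0.0.1:9050 \-o stratum+tcp://hodl.blockquarry.com:8332 \\
\-u user.worker \-p x
.RE
.fi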
.SH ENVIRONMENT
The following environment variables can be specified in lower case or upper case;
the lower-case version has precedence. \fBhttp_proxy\fR is an exception
as it is only available in lower case.
.PP
.RS
.TP
\fBhttp_proxy\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use for HTTP.
.TP
\fBHTTPS_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use for HTTPS.
.TP
\fBALL_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use if no protocol-specific proxy is set.
.RE
.PP
Using an environment variable to set the proxy has the same effect as
using the \fB\-x\fR option.
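.PP
For example (the proxy host and port shown are illustrative assumptions only):
.nf
.RS
.\" illustrative proxy address, not from the original documentation
http_proxy=http://192.168.0.1:3128 hodlminer \-o stratum+tcp://hodl.blockquarry.com:8332 \\
\-u user.worker \-p x
.RE
.fi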
.SH AUTHOR
Most of the code in the current version of minerd was written by
Pooler <pooler@litecoinpool.org> with contributions from others.
The original minerd was written by Jeff Garzik <jeff@garzik.org>.

View File

@@ -1,50 +0,0 @@
#ifndef _SHA512_H
#define _SHA512_H
#include <stdint.h>
#include "simd-utils.h"
//SHA-512 block size
#define SHA512_BLOCK_SIZE 128
//SHA-512 digest size
#define SHA512_DIGEST_SIZE 64
/*
#ifndef __AVX2__
#ifndef __AVX__
#error "Either AVX or AVX2 supported needed"
#endif // __AVX__
#endif // __AVX2__
*/
typedef struct
{
#ifdef __AVX2__
__m256i h[8];
__m256i w[80];
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
v128_t h[8];
v128_t w[80];
#else
int dummy;
#endif
} Sha512Context;
#ifdef __AVX2__
#define SHA512_PARALLEL_N 8
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
#define SHA512_PARALLEL_N 4
#else
#define SHA512_PARALLEL_N 1 // dummy value
#endif
//SHA-512 related functions
void sha512Compute32b_parallel(
uint64_t *data[SHA512_PARALLEL_N],
uint64_t *digest[SHA512_PARALLEL_N]);
void sha512ProcessBlock(Sha512Context contexti[2] );
#endif

View File

@@ -1,235 +0,0 @@
#ifndef __AVX2__
#if defined(__SSE4_2__)
//#ifdef __AVX__
//Dependencies
#include <string.h>
#include <stdlib.h>
#ifdef __FreeBSD__
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"
#include "sha512-avx.h"
#if ((defined(_WIN64) || defined(__WINDOWS__)))
#include "hodl-endian.h"
#endif
//SHA-512 auxiliary functions
#define Ch(x, y, z) (((x) & (y)) | (~(x) & (z)))
#define Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define SIGMA1(x) (ROR64(x, 28) ^ ROR64(x, 34) ^ ROR64(x, 39))
#define SIGMA2(x) (ROR64(x, 14) ^ ROR64(x, 18) ^ ROR64(x, 41))
#define SIGMA3(x) (ROR64(x, 1) ^ ROR64(x, 8) ^ SHR64(x, 7))
#define SIGMA4(x) (ROR64(x, 19) ^ ROR64(x, 61) ^ SHR64(x, 6))
//Rotate right operation
#define ROR64(a, n) _mm_or_si128(_mm_srli_epi64(a, n), _mm_slli_epi64(a, 64 - n))
//Shift right operation
#define SHR64(a, n) _mm_srli_epi64(a, n)
__m128i mm_htobe_epi64(__m128i a) {
__m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
return _mm_shuffle_epi8(a, mask);
}
__m128i mm_betoh_epi64(__m128i a) {
return mm_htobe_epi64(a);
}
//SHA-512 padding
static const uint8_t padding[128] =
{
0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
//SHA-512 constants
static const uint64_t k[80] =
{
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
void sha512Compute32b_parallel(uint64_t *data[SHA512_PARALLEL_N], uint64_t *digest[SHA512_PARALLEL_N]) {
Sha512Context context[2];
context[0].h[0] = _mm_set1_epi64x(0x6A09E667F3BCC908);
context[0].h[1] = _mm_set1_epi64x(0xBB67AE8584CAA73B);
context[0].h[2] = _mm_set1_epi64x(0x3C6EF372FE94F82B);
context[0].h[3] = _mm_set1_epi64x(0xA54FF53A5F1D36F1);
context[0].h[4] = _mm_set1_epi64x(0x510E527FADE682D1);
context[0].h[5] = _mm_set1_epi64x(0x9B05688C2B3E6C1F);
context[0].h[6] = _mm_set1_epi64x(0x1F83D9ABFB41BD6B);
context[0].h[7] = _mm_set1_epi64x(0x5BE0CD19137E2179);
context[1].h[0] = _mm_set1_epi64x(0x6A09E667F3BCC908);
context[1].h[1] = _mm_set1_epi64x(0xBB67AE8584CAA73B);
context[1].h[2] = _mm_set1_epi64x(0x3C6EF372FE94F82B);
context[1].h[3] = _mm_set1_epi64x(0xA54FF53A5F1D36F1);
context[1].h[4] = _mm_set1_epi64x(0x510E527FADE682D1);
context[1].h[5] = _mm_set1_epi64x(0x9B05688C2B3E6C1F);
context[1].h[6] = _mm_set1_epi64x(0x1F83D9ABFB41BD6B);
context[1].h[7] = _mm_set1_epi64x(0x5BE0CD19137E2179);
for(int i=0; i<4; ++i) {
context[0].w[i] = _mm_set_epi64x ( data[1][i], data[0][i] );
context[1].w[i] = _mm_set_epi64x ( data[3][i], data[2][i] );
}
for(int i=0; i<10; ++i) {
context[0].w[i+4] = _mm_set1_epi64x( ((uint64_t*)padding)[i] );
context[1].w[i+4] = _mm_set1_epi64x( ((uint64_t*)padding)[i] );
}
//Length of the original message (before padding)
uint64_t totalSize = 32 * 8;
//Append the length of the original message
context[0].w[14] = _mm_set1_epi64x(0);
context[0].w[15] = _mm_set1_epi64x(htobe64(totalSize));
context[1].w[14] = _mm_set1_epi64x(0);
context[1].w[15] = _mm_set1_epi64x(htobe64(totalSize));
//Calculate the message digest
sha512ProcessBlock(context);
//Convert from host byte order to big-endian byte order
for (int i = 0; i < 8; i++) {
context[0].h[i] = mm_htobe_epi64(context[0].h[i]);
context[1].h[i] = mm_htobe_epi64(context[1].h[i]);
}
//Copy the resulting digest
for(int i=0; i<8; ++i) {
digest[0][i] = _mm_extract_epi64(context[0].h[i], 0);
digest[1][i] = _mm_extract_epi64(context[0].h[i], 1);
digest[2][i] = _mm_extract_epi64(context[1].h[i], 0);
digest[3][i] = _mm_extract_epi64(context[1].h[i], 1);
}
}
#define blk0(n, i) (block[n][i] = mm_betoh_epi64(block[n][i]))
#define blk(n, i) (block[n][i] = block[n][i - 16] + SIGMA3(block[n][i - 15]) + \
SIGMA4(block[n][i - 2]) + block[n][i - 7])
#define ROUND512(a,b,c,d,e,f,g,h) \
T0 += (h[0]) + SIGMA2(e[0]) + Ch((e[0]), (f[0]), (g[0])) + k[i]; \
T1 += (h[1]) + SIGMA2(e[1]) + Ch((e[1]), (f[1]), (g[1])) + k[i]; \
(d[0]) += T0; \
(d[1]) += T1; \
(h[0]) = T0 + SIGMA1(a[0]) + Maj((a[0]), (b[0]), (c[0])); \
(h[1]) = T1 + SIGMA1(a[1]) + Maj((a[1]), (b[1]), (c[1])); \
i++
#define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \
T0 = blk0(0, i); \
T1 = blk0(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define ROUND512_16_TO_80(a,b,c,d,e,f,g,h) \
T0 = blk(0, i); \
T1 = blk(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define R512_0 \
ROUND512_0_TO_15(a, b, c, d, e, f, g, h); \
ROUND512_0_TO_15(h, a, b, c, d, e, f, g); \
ROUND512_0_TO_15(g, h, a, b, c, d, e, f); \
ROUND512_0_TO_15(f, g, h, a, b, c, d, e); \
ROUND512_0_TO_15(e, f, g, h, a, b, c, d); \
ROUND512_0_TO_15(d, e, f, g, h, a, b, c); \
ROUND512_0_TO_15(c, d, e, f, g, h, a, b); \
ROUND512_0_TO_15(b, c, d, e, f, g, h, a)
#define R512_16 \
ROUND512_16_TO_80(a, b, c, d, e, f, g, h); \
ROUND512_16_TO_80(h, a, b, c, d, e, f, g); \
ROUND512_16_TO_80(g, h, a, b, c, d, e, f); \
ROUND512_16_TO_80(f, g, h, a, b, c, d, e); \
ROUND512_16_TO_80(e, f, g, h, a, b, c, d); \
ROUND512_16_TO_80(d, e, f, g, h, a, b, c); \
ROUND512_16_TO_80(c, d, e, f, g, h, a, b); \
ROUND512_16_TO_80(b, c, d, e, f, g, h, a)
#define INIT(x,n) \
x[0] = context[0].h[n]; \
x[1] = context[1].h[n]; \
void sha512ProcessBlock(Sha512Context context[2])
{
__m128i* block[2];
block[0] = context[0].w;
block[1] = context[1].w;
__m128i T0, T1;
__m128i a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2];
INIT(a, 0)
INIT(b, 1)
INIT(c, 2)
INIT(d, 3)
INIT(e, 4)
INIT(f, 5)
INIT(g, 6)
INIT(h, 7)
int i = 0;
R512_0; R512_0;
for(int j=0; j<8; ++j) {
R512_16;
}
context[0].h[0] += a[0];
context[0].h[1] += b[0];
context[0].h[2] += c[0];
context[0].h[3] += d[0];
context[0].h[4] += e[0];
context[0].h[5] += f[0];
context[0].h[6] += g[0];
context[0].h[7] += h[0];
context[1].h[0] += a[1];
context[1].h[1] += b[1];
context[1].h[2] += c[1];
context[1].h[3] += d[1];
context[1].h[4] += e[1];
context[1].h[5] += f[1];
context[1].h[6] += g[1];
context[1].h[7] += h[1];
}
#endif // __SSE4_2__
#endif // __AVX2__

View File

@@ -1,241 +0,0 @@
#ifdef __AVX2__
//Dependencies
#include <string.h>
#include <stdlib.h>
#ifdef __FreeBSD__
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"
#include "immintrin.h"
#include "sha512-avx.h"
#if ((defined(_WIN64) || defined(__WINDOWS__)))
#include "hodl-endian.h"
#endif
//SHA-512 auxiliary functions
#define Ch(x, y, z) (((x) & (y)) | (~(x) & (z)))
#define Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define SIGMA1(x) (ROR64(x, 28) ^ ROR64(x, 34) ^ ROR64(x, 39))
#define SIGMA2(x) (ROR64(x, 14) ^ ROR64(x, 18) ^ ROR64(x, 41))
#define SIGMA3(x) (ROR64(x, 1) ^ ROR64(x, 8) ^ SHR64(x, 7))
#define SIGMA4(x) (ROR64(x, 19) ^ ROR64(x, 61) ^ SHR64(x, 6))
//Rotate right operation
#define ROR64(a, n) _mm256_or_si256(_mm256_srli_epi64(a, n), _mm256_slli_epi64(a, 64 - n))
//Shift right operation
#define SHR64(a, n) _mm256_srli_epi64(a, n)
__m256i mm256_htobe_epi64(__m256i a) {
__m256i mask = _mm256_set_epi8(
24,25,26,27,28,29,30,31,
16,17,18,19,20,21,22,23,
8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7);
return _mm256_shuffle_epi8(a, mask);
}
__m256i mm256_betoh_epi64(__m256i a) {
return mm256_htobe_epi64(a);
}
//SHA-512 padding
static const uint8_t padding[128] =
{
0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
//SHA-512 constants
static const uint64_t k[80] =
{
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
void sha512Compute32b_parallel(uint64_t *data[SHA512_PARALLEL_N], uint64_t *digest[SHA512_PARALLEL_N]) {
Sha512Context context[2];
context[0].h[0] = _mm256_set1_epi64x(0x6A09E667F3BCC908);
context[0].h[1] = _mm256_set1_epi64x(0xBB67AE8584CAA73B);
context[0].h[2] = _mm256_set1_epi64x(0x3C6EF372FE94F82B);
context[0].h[3] = _mm256_set1_epi64x(0xA54FF53A5F1D36F1);
context[0].h[4] = _mm256_set1_epi64x(0x510E527FADE682D1);
context[0].h[5] = _mm256_set1_epi64x(0x9B05688C2B3E6C1F);
context[0].h[6] = _mm256_set1_epi64x(0x1F83D9ABFB41BD6B);
context[0].h[7] = _mm256_set1_epi64x(0x5BE0CD19137E2179);
context[1].h[0] = _mm256_set1_epi64x(0x6A09E667F3BCC908);
context[1].h[1] = _mm256_set1_epi64x(0xBB67AE8584CAA73B);
context[1].h[2] = _mm256_set1_epi64x(0x3C6EF372FE94F82B);
context[1].h[3] = _mm256_set1_epi64x(0xA54FF53A5F1D36F1);
context[1].h[4] = _mm256_set1_epi64x(0x510E527FADE682D1);
context[1].h[5] = _mm256_set1_epi64x(0x9B05688C2B3E6C1F);
context[1].h[6] = _mm256_set1_epi64x(0x1F83D9ABFB41BD6B);
context[1].h[7] = _mm256_set1_epi64x(0x5BE0CD19137E2179);
for(int i=0; i<4; ++i) {
context[0].w[i] = _mm256_set_epi64x ( data[3][i], data[2][i], data[1][i], data[0][i] );
context[1].w[i] = _mm256_set_epi64x ( data[7][i], data[6][i], data[5][i], data[4][i] );
}
for(int i=0; i<10; ++i) {
context[0].w[i+4] = _mm256_set1_epi64x( ((uint64_t*)padding)[i] );
context[1].w[i+4] = _mm256_set1_epi64x( ((uint64_t*)padding)[i] );
}
//Length of the original message (before padding)
uint64_t totalSize = 32 * 8;
//Append the length of the original message
context[0].w[14] = _mm256_set1_epi64x(0);
context[0].w[15] = _mm256_set1_epi64x(htobe64(totalSize));
context[1].w[14] = _mm256_set1_epi64x(0);
context[1].w[15] = _mm256_set1_epi64x(htobe64(totalSize));
//Calculate the message digest
sha512ProcessBlock(context);
//Convert from host byte order to big-endian byte order
for (int i = 0; i < 8; i++) {
context[0].h[i] = mm256_htobe_epi64(context[0].h[i]);
context[1].h[i] = mm256_htobe_epi64(context[1].h[i]);
}
//Copy the resulting digest
for(int i=0; i<8; ++i) {
digest[0][i] = _mm256_extract_epi64(context[0].h[i], 0);
digest[1][i] = _mm256_extract_epi64(context[0].h[i], 1);
digest[2][i] = _mm256_extract_epi64(context[0].h[i], 2);
digest[3][i] = _mm256_extract_epi64(context[0].h[i], 3);
digest[4][i] = _mm256_extract_epi64(context[1].h[i], 0);
digest[5][i] = _mm256_extract_epi64(context[1].h[i], 1);
digest[6][i] = _mm256_extract_epi64(context[1].h[i], 2);
digest[7][i] = _mm256_extract_epi64(context[1].h[i], 3);
}
}
#define blk0(n, i) (block[n][i] = mm256_betoh_epi64(block[n][i]))
#define blk(n, i) (block[n][i] = block[n][i - 16] + SIGMA3(block[n][i - 15]) + \
SIGMA4(block[n][i - 2]) + block[n][i - 7])
#define ROUND512(a,b,c,d,e,f,g,h) \
T0 += (h[0]) + SIGMA2(e[0]) + Ch((e[0]), (f[0]), (g[0])) + k[i]; \
T1 += (h[1]) + SIGMA2(e[1]) + Ch((e[1]), (f[1]), (g[1])) + k[i]; \
(d[0]) += T0; \
(d[1]) += T1; \
(h[0]) = T0 + SIGMA1(a[0]) + Maj((a[0]), (b[0]), (c[0])); \
(h[1]) = T1 + SIGMA1(a[1]) + Maj((a[1]), (b[1]), (c[1])); \
i++
#define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \
T0 = blk0(0, i); \
T1 = blk0(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define ROUND512_16_TO_80(a,b,c,d,e,f,g,h) \
T0 = blk(0, i); \
T1 = blk(1, i); \
ROUND512(a,b,c,d,e,f,g,h)
#define R512_0 \
ROUND512_0_TO_15(a, b, c, d, e, f, g, h); \
ROUND512_0_TO_15(h, a, b, c, d, e, f, g); \
ROUND512_0_TO_15(g, h, a, b, c, d, e, f); \
ROUND512_0_TO_15(f, g, h, a, b, c, d, e); \
ROUND512_0_TO_15(e, f, g, h, a, b, c, d); \
ROUND512_0_TO_15(d, e, f, g, h, a, b, c); \
ROUND512_0_TO_15(c, d, e, f, g, h, a, b); \
ROUND512_0_TO_15(b, c, d, e, f, g, h, a)
#define R512_16 \
ROUND512_16_TO_80(a, b, c, d, e, f, g, h); \
ROUND512_16_TO_80(h, a, b, c, d, e, f, g); \
ROUND512_16_TO_80(g, h, a, b, c, d, e, f); \
ROUND512_16_TO_80(f, g, h, a, b, c, d, e); \
ROUND512_16_TO_80(e, f, g, h, a, b, c, d); \
ROUND512_16_TO_80(d, e, f, g, h, a, b, c); \
ROUND512_16_TO_80(c, d, e, f, g, h, a, b); \
ROUND512_16_TO_80(b, c, d, e, f, g, h, a)
#define INIT(x,n) \
x[0] = context[0].h[n]; \
x[1] = context[1].h[n]; \
void sha512ProcessBlock(Sha512Context context[2])
{
__m256i* block[2];
block[0] = context[0].w;
block[1] = context[1].w;
__m256i T0, T1;
__m256i a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2];
INIT(a, 0)
INIT(b, 1)
INIT(c, 2)
INIT(d, 3)
INIT(e, 4)
INIT(f, 5)
INIT(g, 6)
INIT(h, 7)
int i = 0;
R512_0; R512_0;
for(int j=0; j<8; ++j) {
R512_16;
}
context[0].h[0] += a[0];
context[0].h[1] += b[0];
context[0].h[2] += c[0];
context[0].h[3] += d[0];
context[0].h[4] += e[0];
context[0].h[5] += f[0];
context[0].h[6] += g[0];
context[0].h[7] += h[0];
context[1].h[0] += a[1];
context[1].h[1] += b[1];
context[1].h[2] += c[1];
context[1].h[3] += d[1];
context[1].h[4] += e[1];
context[1].h[5] += f[1];
context[1].h[6] += g[1];
context[1].h[7] += h[1];
}
#endif // __AVX2__

View File

@@ -1,25 +0,0 @@
#ifndef __WOLF_AES_H
#define __WOLF_AES_H
#include <stdint.h>
#include "simd-utils.h"
void ExpandAESKey256(v128_t *keys, const v128_t *KeyBuf);
#if defined(__SSE4_2__)
//#ifdef __AVX__
#define AES_PARALLEL_N 8
#define BLOCK_COUNT 256
void AES256CBC( v128_t** data, const v128_t** next, v128_t ExpandedKey[][16],
v128_t* IV );
#else
void AES256CBC( v128_t *Ciphertext, const v128_t *Plaintext,
const v128_t *ExpandedKey, v128_t IV, uint32_t BlockCount );
#endif
#endif // __WOLF_AES_H

View File

@@ -852,48 +852,10 @@ void jh512_4x64_ctx( jh_4x64_context *cc, void *dst, const void *data, size_t le
// SSE2 & NEON
#if defined(__AVX512VL__)
//TODO enable for AVX10_256, not used with AVX512VL
#define v128_notxorandnot( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x2d )
#else
#define v128_notxorandnot( a, b, c ) \
v128_xor( v128_not( a ), v128_andnot( b, c ) )
#endif
#define Sb(x0, x1, x2, x3, c) \
{ \
v128u64_t cc = v128_64( c ); \
x3 = v128_not( x3 ); \
x0 = v128_xor( x0, v128_andnot( x2, cc ) ); \
tmp = v128_xor( cc, v128_and( x0, x1 ) ); \
x0 = v128_xor( x0, v128_and( x2, x3 ) ); \
x3 = v128_xor( x3, v128_andnot( x1, x2 ) ); \
x1 = v128_xor( x1, v128_and( x0, x2 ) ); \
x2 = v128_xor( x2, v128_andnot( x3, x0 ) ); \
x0 = v128_xor( x0, v128_or( x1, x3 ) ); \
x3 = v128_xor( x3, v128_and( x1, x2 ) ); \
x1 = v128_xor( x1, v128_and( tmp, x0 ) ); \
x2 = v128_xor( x2, tmp ); \
}
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
{ \
x4 = v128_xor( x4, x1 ); \
x5 = v128_xor( x5, x2 ); \
x6 = v128_xor( x6, v128_xor( x3, x0 ) ); \
x7 = v128_xor( x7, x0 ); \
x0 = v128_xor( x0, x5 ); \
x1 = v128_xor( x1, x6 ); \
x2 = v128_xor( x2, v128_xor( x7, x4 ) ); \
x3 = v128_xor( x3, x4 ); \
}
/*
#define Sb(x0, x1, x2, x3, c) \
{ \
const v128u64_t cc = v128_64( c ); \
@@ -920,7 +882,6 @@ void jh512_4x64_ctx( jh_4x64_context *cc, void *dst, const void *data, size_t le
x2 = v128_xor3( x2, x7, x4 ); \
x3 = v128_xor( x3, x4 ); \
}
*/
#undef Wz
#define Wz(x, c, n) \

View File

@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_2WAY)
void keccakhash_2x64(void *state, const void *input)
{
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
keccakhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ))
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
bool register_sha3d_algo( algo_gate_t* gate )
{
hard_coded_eb = 6;
// opt_extranonce = false;
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
#if defined (SHA3D_8WAY)
gate->scanhash = (void*)&scanhash_sha3d_8way;
gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
#elif defined (SHA3D_4WAY)
gate->scanhash = (void*)&scanhash_sha3d_4way;
gate->hash = (void*)&sha3d_hash_4way;
#elif defined (SHA3D_2WAY)
gate->scanhash = (void*)&scanhash_sha3d_2x64;
gate->hash = (void*)&sha3d_hash_2x64;
#else
gate->scanhash = (void*)&scanhash_sha3d;
gate->hash = (void*)&sha3d_hash;

View File

@@ -8,6 +8,16 @@
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define KECCAK_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA3D_2WAY 1
#endif
extern int hard_coded_eb;
@@ -16,27 +26,47 @@ extern int hard_coded_eb;
void keccakhash_8way( void *state, const void *input );
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_4WAY)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_2WAY)
void keccakhash_2x64( void *state, const void *input );
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void keccakhash( void *state, const void *input );
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA3D_8WAY)
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64( void *state, const void *input );
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha3d_hash( void *state, const void *input );
int scanhash_sha3d( struct work *work, uint32_t max_nonce,

View File

@@ -563,7 +563,7 @@ static void keccak64x2_close( keccak64_ctx_v128 *kc, void *dst,
{
unsigned eb;
union {
v128_t tmp[lim + 1];
v128_t tmp[140];
uint64_t dummy; /* for alignment */
} u;
size_t j;

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)
#if defined(SHA3D_8WAY)
void sha3d_hash_8way(void *state, const void *input)
{
@@ -64,7 +64,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_4WAY)
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way(void *state, const void *input)
{
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, buffer );
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, buffer, 32 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
sha3d_hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -33,43 +33,39 @@
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
v128_t b = v128_xor( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
#elif defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, \
_mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
v128_t b = _mm_shuffle_epi32( a1, 0 ); \
b = v128_xor( a0, v128_mask32( b, 0x4 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0)
}
#elif defined(__ARM_NEON)
const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff };
// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, \
v128_and( v128_32( vgetq_lane_u32( a1, 0 ) ), mask ) ); \
v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}
#else // assume SSE2
#define MULT2( a0, a1 ) do \
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, \
_mm_shuffle_epi32( v128_and( a1, MASK ), 0x10 ) ); \
v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
}
#endif
@@ -79,16 +75,16 @@ const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff };
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
v128_t t = a0; \
a0 = mm128_xoror( a3, a0, a1 ); \
a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm128_xorand( a2, a3, t ); \
a2 = mm128_xorand( a1, a2, a0 ); \
a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = mm128_xnor( a1, a0 ); \
a1 = v128_xnor( a1, a0 ); \
a0 = t; \
}
@@ -137,8 +133,8 @@ const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff };
t0 = v128_shufll32( a1 ); \
a1 = v128_unpacklo32( t0, a0 ); \
t0 = v128_unpackhi32( t0, a0 ); \
t1 = v128_swap64( t0 ); \
a0 = v128_swap64( a1 ); \
t1 = v128_rev64( t0 ); \
a0 = v128_rev64( a1 ); \
SUBCRUMB( t1, t0, a0, a1 ); \
t0 = v128_unpacklo32( t0, t1 ); \
a1 = v128_unpacklo32( a1, a0 ); \
@@ -224,9 +220,10 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(16))) = {
};
v128_t CNS128[32];
static v128_t CNS128[32];
#if !defined(__SSE4_1__)
v128_t MASK;
static v128_t MASK;
#endif
int init_luffa(hashState_luffa *state, int hashbitlen)
@@ -235,13 +232,13 @@ int init_luffa(hashState_luffa *state, int hashbitlen)
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK = v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
MASK = v128_set32( 0xffffffff, 0, 0xffffffff, 0xffffffff );
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] );
for ( i=0; i<10; i++ )
state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
memset(state->buffer, 0, sizeof state->buffer );
return 0;
}
@@ -268,7 +265,7 @@ int update_luffa( hashState_luffa *state, const void *data,
// remaining data bytes
casti_v128( state->buffer, 0 ) = v128_bswap32( cast_v128( data ) );
// padding of partial block
casti_v128( state->buffer, 1 ) = v128_set32( 0, 0, 0, 0x80000000 );
casti_v128( state->buffer, 1 ) = v128_set32( 0, 0, 0, 0x80000000 );
}
return 0;
@@ -327,7 +324,6 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
return 0;
}
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen )
{
@@ -336,13 +332,13 @@ int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= v128_set64( 0, 0x00000000ffffffff );
MASK= v128_set32( 0xffffffff, 0, 0xffffffff, 0xffffffff );
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] );
CNS128[i] = casti_v128( CNS_INIT, i );
for ( i=0; i<10; i++ )
state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
state->chainv[i] = casti_v128( IV, i );
memset(state->buffer, 0, sizeof state->buffer );
// update
@@ -376,16 +372,15 @@ int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
return 0;
}
/***************************************************/
/* Round function */
/* state: hash context */
static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 )
{
v128_t t0, t1;
v128_t *chainv = state->chainv;
v128_t x0, x1, x2, x3, x4, x5, x6, x7;
v128u32_t t0, t1;
v128u32_t *chainv = state->chainv;
v128u32_t x0, x1, x2, x3, x4, x5, x6, x7;
t0 = v128_xor3( chainv[0], chainv[2], chainv[4] );
t1 = v128_xor3( chainv[1], chainv[3], chainv[5] );
@@ -472,7 +467,7 @@ static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 )
chainv[5] = v128_rol32( chainv[5], 2 );
chainv[7] = v128_rol32( chainv[7], 3 );
chainv[9] = v128_rol32( chainv[9], 4 );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );

View File

@@ -11,7 +11,7 @@
#endif
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#if !defined(__AES__) // && !defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#endif
@@ -19,7 +19,7 @@
#define ALLIUM_16WAY 1
#elif defined(__AVX2__)
#define ALLIUM_8WAY 1
#elif #defined(__SSE2__) || defined(__ARM_NEON)
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define ALLIUM_4WAY 1
#endif
@@ -30,7 +30,7 @@ typedef union {
cube_4way_2buf_context cube;
skein256_8way_context skein;
#if defined(__VAES__)
groestl256_4way_context groestl;
groestl256_4way_context groestl;
#else
hashState_groestl256 groestl;
#endif
@@ -465,11 +465,7 @@ typedef union
{
keccak256_2x64_context keccak;
cubehashParam cube;
#if defined(__x86_64__)
skein256_2x64_context skein;
#else
sph_skein512_context skein;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl256 groestl;
#else
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
#if defined(__x86_64__)
intrlv_2x64( vhashA, hash0, hash1, 256 );
skein256_2x64_init( &ctx.skein );
skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,20 +522,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
skein256_2x64_update( &ctx.skein, vhashA, 32 );
skein256_2x64_close( &ctx.skein, vhashA );
dintrlv_2x64( hash2, hash3, vhashA, 256 );
#else
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash0, 32 );
sph_skein256_close( &ctx.skein, hash0 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash1, 32 );
sph_skein256_close( &ctx.skein, hash1 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash2, 32 );
sph_skein256_close( &ctx.skein, hash2 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash3, 32 );
sph_skein256_close( &ctx.skein, hash3 );
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl256_full( &ctx.groestl, hash0, hash0, 256 );

View File

@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
lyra2h_4way_midstate( vdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )

View File

@@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );

View File

@@ -1,6 +1,8 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#if !defined(__APPLE__)
#include <gmp.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -296,8 +298,14 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
return 0;
}
#endif // not apple
bool register_m7m_algo( algo_gate_t *gate )
{
#if defined(__APPLE__)
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
@@ -307,6 +315,6 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;
#endif
}

View File

@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
init_hmq1725_ctx();
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
void hmq1725hash( void *state, const void *input );
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_hmq1725_ctx();
#endif

View File

@@ -4,367 +4,273 @@
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
typedef struct {
sph_blake512_context blake1, blake2;
sph_bmw512_context bmw1, bmw2, bmw3;
sph_skein512_context skein1, skein2;
sph_jh512_context jh1, jh2;
sph_keccak512_context keccak1, keccak2;
hashState_luffa luffa1, luffa2;
cubehashParam cube;
sph_shavite512_context shavite1, shavite2;
#if defined(__aarch64__)
sph_simd512_context simd1, simd2;
#else
hashState_sd simd1, simd2;
#endif
sph_hamsi512_context hamsi1;
sph_shabal512_context shabal1;
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
sph_sha512_context sha1, sha2;
sph_haval256_5_context haval1, haval2;
#if defined(__AES__)
hashState_echo echo1, echo2;
hashState_groestl groestl1, groestl2;
hashState_fugue fugue1, fugue2;
#else
sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2;
sph_fugue512_context fugue1, fugue2;
#endif
} hmq1725_ctx_holder;
static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
void init_hmq1725_ctx()
union _hmq1725_ctx_holder
{
sph_blake512_init(&hmq1725_ctx.blake1);
sph_blake512_init(&hmq1725_ctx.blake2);
sph_bmw512_init(&hmq1725_ctx.bmw1);
sph_bmw512_init(&hmq1725_ctx.bmw2);
sph_bmw512_init(&hmq1725_ctx.bmw3);
sph_skein512_init(&hmq1725_ctx.skein1);
sph_skein512_init(&hmq1725_ctx.skein2);
sph_jh512_init(&hmq1725_ctx.jh1);
sph_jh512_init(&hmq1725_ctx.jh2);
sph_keccak512_init(&hmq1725_ctx.keccak1);
sph_keccak512_init(&hmq1725_ctx.keccak2);
init_luffa( &hmq1725_ctx.luffa1, 512 );
init_luffa( &hmq1725_ctx.luffa2, 512 );
cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
sph_shavite512_init(&hmq1725_ctx.shavite1);
sph_shavite512_init(&hmq1725_ctx.shavite2);
#if defined(__aarch64__)
sph_simd512_init(&hmq1725_ctx.simd1);
sph_simd512_init(&hmq1725_ctx.simd2);
#else
init_sd( &hmq1725_ctx.simd1, 512 );
init_sd( &hmq1725_ctx.simd2, 512 );
#endif
sph_hamsi512_init(&hmq1725_ctx.hamsi1);
#if defined(__AES__)
fugue512_Init( &hmq1725_ctx.fugue1, 512 );
fugue512_Init( &hmq1725_ctx.fugue2, 512 );
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_init(&hmq1725_ctx.fugue1);
sph_fugue512_init(&hmq1725_ctx.fugue2);
sph_fugue512_context fugue;
#endif
sph_shabal512_init(&hmq1725_ctx.shabal1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
sph_sha512_init( &hmq1725_ctx.sha1 );
sph_sha512_init( &hmq1725_ctx.sha2 );
sph_haval256_5_init(&hmq1725_ctx.haval1);
sph_haval256_5_init(&hmq1725_ctx.haval2);
#if defined(__AES__)
init_echo( &hmq1725_ctx.echo1, 512 );
init_echo( &hmq1725_ctx.echo2, 512 );
init_groestl( &hmq1725_ctx.groestl1, 64 );
init_groestl( &hmq1725_ctx.groestl2, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_init( &hmq1725_ctx.groestl1 );
sph_groestl512_init( &hmq1725_ctx.groestl2 );
sph_echo512_init( &hmq1725_ctx.echo1 );
sph_echo512_init( &hmq1725_ctx.echo2 );
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
}
void hmq_bmw512_midstate( const void* input )
{
memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
sph_bmw512( &hmq_bmw_mid, input, 64 );
}
__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha;
sph_haval256_5_context haval;
};
typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
uint32_t hashA[32] __attribute__((aligned(64)));
uint32_t hashB[32] __attribute__((aligned(64)));
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
uint32_t hashA[32] __attribute__((aligned(32)));
uint32_t hashB[32] __attribute__((aligned(32)));
hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, input, 80 );
sph_bmw512_close( &ctx.bmw, hashA ); //1
memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
sph_bmw512_close(&h_ctx.bmw1, hashA); //1
sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0
sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //0
sph_whirlpool_close( &ctx.whirlpool, hashB ); //1
if ( hashB[0] & mask ) //1
{
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
(const char*)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashA, hashB, 512 );
#else
sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
sph_groestl512_close(&h_ctx.groestl1, hashA); //2
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashB, 64 ); //1
sph_groestl512_close( &ctx.groestl, hashA ); //2
#endif
}
else
{
sph_skein512 (&h_ctx.skein1, hashB, 64); //1
sph_skein512_close(&h_ctx.skein1, hashA); //2
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, hashB, 64 ); //1
sph_skein512_close( &ctx.skein, hashA ); //2
}
sph_jh512 (&h_ctx.jh1, hashA, 64); //3
sph_jh512_close(&h_ctx.jh1, hashB); //4
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashA, 64 ); //3
sph_jh512_close( &ctx.jh, hashB ); //4
sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2
sph_keccak512_close(&h_ctx.keccak1, hashA); //3
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //2
sph_keccak512_close( &ctx.keccak, hashA ); //3
if ( hashA[0] & mask ) //4
{
sph_blake512 (&h_ctx.blake1, hashA, 64); //
sph_blake512_close(&h_ctx.blake1, hashB); //5
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
}
else
{
sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4
sph_bmw512_close(&h_ctx.bmw2, hashB); //5
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashA, 64 ); //4
sph_bmw512_close( &ctx.bmw, hashB ); //5
}
update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );
if ( hashB[0] & mask ) //7
{
sph_keccak512 (&h_ctx.keccak2, hashB, 64); //
sph_keccak512_close(&h_ctx.keccak2, hashA); //8
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //
sph_keccak512_close( &ctx.keccak, hashA ); //8
}
else
{
sph_jh512 (&h_ctx.jh2, hashB, 64); //7
sph_jh512_close(&h_ctx.jh2, hashA); //8
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashB, 64 ); //7
sph_jh512_close( &ctx.jh, hashA ); //8
}
sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3
sph_shavite512_close(&h_ctx.shavite1, hashB); //4
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashA, 64 ); //3
sph_shavite512_close( &ctx.shavite, hashB ); //4
#if defined(__aarch64__)
sph_simd512 (&h_ctx.simd1, hashB, 64); //3
sph_simd512_close(&h_ctx.simd1, hashA); //4
#else
update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#endif
simd512_ctx( &ctx.simd, hashA, hashB, 64 );
if ( hashA[0] & mask ) //4
{
sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); //
sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
else
{
sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4
sph_haval256_5_close(&h_ctx.haval1, hashB); //5
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //4
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset(&hashB[8], 0, 32);
}
#if defined(__AES__)
update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashA, 512, hashB, 64 );
#else
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
sph_echo512_close(&h_ctx.echo1, hashA); //6
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashB, 64 ); //5
sph_echo512_close( &ctx.echo, hashA ); //6
#endif
sph_blake512 (&h_ctx.blake2, hashA, 64); //6
sph_blake512_close(&h_ctx.blake2, hashB); //7
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
if ( hashB[0] & mask ) //7
{
sph_shavite512 (&h_ctx.shavite2, hashB, 64); //
sph_shavite512_close(&h_ctx.shavite2, hashA); //8
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashB, 64 ); //
sph_shavite512_close( &ctx.shavite, hashA ); //8
}
else
{
update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
}
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3
sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
sph_hamsi512_close( &ctx.hamsi, hashB ); //4
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue1, hashB, 512 ); //2 ////
fugue512_Final( &h_ctx.fugue1, hashA ); //3
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 ////
sph_fugue512_close(&h_ctx.fugue1, hashA); //3
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //2 ////
sph_fugue512_close( &ctx.fugue, hashA ); //3
#endif
if ( hashA[0] & mask ) //4
{
#if defined(__AES__)
update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashB, 512, hashA, 64 );
#else
sph_echo512 (&h_ctx.echo2, hashA, 64); //
sph_echo512_close(&h_ctx.echo2, hashB); //5
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashA, 64 ); //
sph_echo512_close( &ctx.echo, hashB ); //5
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&h_ctx.simd2, hashA, 64); //6
sph_simd512_close(&h_ctx.simd2, hashB); //7
#else
update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, 64 );
sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5
sph_shabal512_close(&h_ctx.shabal1, hashA); //6
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hashB, 64 ); //5
sph_shabal512_close( &ctx.shabal, hashA ); //6
sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6
sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //6
sph_whirlpool_close( &ctx.whirlpool, hashB ); //7
if ( hashB[0] & mask ) //7
{
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue2, hashB, 512 ); //
fugue512_Final( &h_ctx.fugue2, hashA ); //8
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue2, hashB, 64); //
sph_fugue512_close(&h_ctx.fugue2, hashA); //8
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //
sph_fugue512_close( &ctx.fugue, hashA ); //8
#endif
}
else
{
sph_sha512( &h_ctx.sha1, hashB, 64 );
sph_sha512_close( &h_ctx.sha1, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
}
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
(const char*)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashB, hashA, 512 );
#else
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashA, 64 ); //3
sph_groestl512_close( &ctx.groestl, hashB ); //4
#endif
sph_sha512( &h_ctx.sha2, hashB, 64 );
sph_sha512_close( &h_ctx.sha2, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
if ( hashA[0] & mask ) //4
{
sph_haval256_5 (&h_ctx.haval2, hashA, 64); //
sph_haval256_5_close(&h_ctx.haval2, hashB); //5
memset(&hashB[8], 0, 32);
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset( &hashB[8], 0, 32 );
}
else
{
sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4
sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //4
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5
sph_bmw512_close(&h_ctx.bmw3, hashA); //6
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashB, 64 ); //5
sph_bmw512_close( &ctx.bmw, hashA ); //6
memcpy(state, hashA, 32);
memcpy( state, hashA, 32 );
}
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
//const uint32_t Htarg = ptarget[7];
//we need big-endian data...
// for (int k = 0; k < 32; k++)
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
hmq_bmw512_midstate( endiandata );
// if (opt_debug)
// {
// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
// }
/* I'm too lazy to put the loop in an inline function... so dirty copy'n'paste.... */
/* I know I could set a variable instead, but I don't know how the compiler would optimize it, and the CPU might then need to load the value into a register every time */
if (ptarget[7]==0) {
do {
pdata[19] = ++n;

View File

@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -7,12 +7,12 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
void quark_hash(void *state, const void *input)
{
uint32_t hash[16] __attribute__((aligned(64)));
sph_blake512_context ctx_blake;
blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl ctx_groestl;
#else
sph_groestl512_context ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
sph_keccak512_context ctx_keccak;
uint32_t mask = 8;
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, input, 80 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, input, 80 );
sph_bmw512_init( &ctx_bmw );
sph_bmw512( &ctx_bmw, hash, 64 );
sph_bmw512_close( &ctx_bmw, hash );
if ( hash[0] & mask )
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
sph_skein512_close( &ctx_skein, hash );
}
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)
if ( hash[0] & mask )
{
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, hash, 64 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, hash, 64 );
}
else
{

View File

@@ -83,7 +83,7 @@ int scanhash_deep_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );

View File

@@ -236,7 +236,7 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );

View File

@@ -16,7 +16,8 @@ bool register_qubit_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubit_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -8,13 +8,9 @@
#include <stdio.h>
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
@@ -25,12 +21,8 @@ typedef struct
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
#ifdef __AES__
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
@@ -45,12 +37,7 @@ void init_qubit_ctx()
init_luffa(&qubit_ctx.luffa,512);
cubehashInit(&qubit_ctx.cubehash,512,16,32);
sph_shavite512_init(&qubit_ctx.shavite);
#if defined(__aarch64__)
sph_simd512_init( &qubit_ctx.simd );
#else
init_sd( &qubit_ctx.simd, 512 );
#endif
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_echo(&qubit_ctx.echo, 512);
#else
sph_echo512_init(&qubit_ctx.echo);
@@ -81,15 +68,9 @@ void qubit_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
#ifdef __AES__
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, 512 );
#else

View File

@@ -35,20 +35,20 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, mm128_not( y ) ), z )
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, mm128_not( z ) ) )
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
a = _mm_add_epi32( v128_rol32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
_mm_set1_epi64x( k ) ), s ), e ); \
c = mm128_rol_32( c, 10 );\
c = v128_rol32( c, 10 );\
} while (0)
#define ROUND1(a, b, c, d, e, f, s, r, k) \

View File

@@ -35,41 +35,47 @@
//#include <mm_malloc.h>
#include "malloc-huge.h"
static const uint32_t keypad[12] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
};
static const uint32_t innerpad[11] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
};
static const uint32_t outerpad[8] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
};
static const uint32_t finalblk[16] = {
0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
};
static const uint32_t sha256_initial_state[8] =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SCRYPT_THROUGHPUT 16
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
#define SCRYPT_THROUGHPUT 2
#elif defined(__AVX2__)
#define SCRYPT_THROUGHPUT 8
#else
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SCRYPT_THROUGHPUT 4
#else
#define SCRYPT_THROUGHPUT 1
#endif
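The chain above simply picks the widest lane count the compiler can target. As a rough orientation only (a sketch, not the sizing logic used later in this file, and assuming the usual scrypt parameters r = 1, p = 1), per-thread scratch memory grows linearly with that lane count:
#include <stddef.h>
// Illustrative only: scrypt with r = 1 walks about 128 * N bytes of
// scratch per lane, so SCRYPT_THROUGHPUT parallel lanes need roughly:
static size_t example_scratch_bytes( int N )
{
   return (size_t)128 * (size_t)N * SCRYPT_THROUGHPUT;
}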
// static int scrypt_throughput = 0;
static const uint32_t sha256_initial_state[8] __attribute((aligned(32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
static int scratchbuf_size = 0;
static __thread uint32_t *scratchbuf = NULL;
#if (SCRYPT_THROUGHPUT == 1) || defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
static const uint32_t keypad[12] __attribute((aligned(16))) =
{
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
};
static const uint32_t innerpad[11] __attribute((aligned(16))) =
{
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
};
static const uint32_t outerpad[8] __attribute((aligned(16))) =
{
0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
};
static const uint32_t finalblk[16] __attribute((aligned(16))) =
{
0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
};
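Assuming these are standard SHA-256 padding blocks, the last 32-bit word of each array encodes the total message length in bits; a throwaway sanity check (the byte counts are inferred, not stated in the file) would be:
#include <assert.h>
static void example_check_pad_lengths( void )
{
   assert( 0x00000280 ==  80 * 8 );   /* keypad:   80 byte key        */
   assert( 0x000004a0 == 148 * 8 );   /* innerpad: 64 + 80 + 4 bytes  */
   assert( 0x00000300 ==  96 * 8 );   /* outerpad: 64 + 32 bytes      */
   assert( 0x00000620 == 196 * 8 );   /* finalblk: 64 + 128 + 4 bytes */
}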
// TODO: change this to a constant that can be used directly as the input state arg.
// Vector versions still need an init function.
static inline void sha256_init_state( uint32_t *state )
@@ -155,6 +161,8 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
output[i] = bswap_32( ostate[i] );
}
#endif // throughput 1
//
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
@@ -269,7 +277,8 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
static const uint32_t keypad_4way[4 * 12] = {
static const uint32_t keypad_4way[ 4*12 ] __attribute((aligned(32))) =
{
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -283,7 +292,8 @@ static const uint32_t keypad_4way[4 * 12] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000280, 0x00000280, 0x00000280, 0x00000280
};
static const uint32_t innerpad_4way[4 * 11] = {
static const uint32_t innerpad_4way[ 4*11 ] __attribute((aligned(32))) =
{
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -296,7 +306,8 @@ static const uint32_t innerpad_4way[4 * 11] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0
};
static const uint32_t outerpad_4way[4 * 8] = {
static const uint32_t outerpad_4way[ 4*8 ] __attribute((aligned(32))) =
{
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,

390
algo/sha/sha1-hash.c Normal file
View File

@@ -0,0 +1,390 @@
#include "simd-utils.h"
#include <stdint.h>
#include "sha1-hash.h"
#if defined(__x86_64__) && defined(__SHA__)
#define sha1_opt_rounds( state_out, data, state_in ) \
{ \
__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; \
__m128i MSG0, MSG1, MSG2, MSG3; \
\
ABCD = _mm_load_si128( (const __m128i*) state_in ); \
E0 = _mm_set_epi32( state_in[4], 0, 0, 0 ); \
ABCD = _mm_shuffle_epi32( ABCD, 0x1B ); \
\
ABCD_SAVE = ABCD; \
E0_SAVE = E0; \
\
/* Rounds 0-3 */ \
MSG0 = load_msg( data, 0 ); \
E0 = _mm_add_epi32( E0, MSG0 ); \
E1 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 0 ); \
\
/* Rounds 4-7 */ \
MSG1 = load_msg( data, 1 ); \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 0 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
\
/* Rounds 8-11 */ \
MSG2 = load_msg( data, 2 ); \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 0 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 12-15 */ \
MSG3 = load_msg( data, 3 ); \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 0 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 16-19 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 0 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 20-23 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 1 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 24-27 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 1 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 28-31 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 1 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 32-35 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 1 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 36-39 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 1 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 40-43 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 2 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 44-47 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 2 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 48-51 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 2 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 52-55 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 2 ); \
MSG0 = _mm_sha1msg1_epu32( MSG0, MSG1 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 56-59 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 2 ); \
MSG1 = _mm_sha1msg1_epu32( MSG1, MSG2 ); \
MSG0 = _mm_xor_si128( MSG0, MSG2 ); \
\
/* Rounds 60-63 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
MSG0 = _mm_sha1msg2_epu32( MSG0, MSG3 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 3 ); \
MSG2 = _mm_sha1msg1_epu32( MSG2, MSG3 ); \
MSG1 = _mm_xor_si128( MSG1, MSG3 ); \
\
/* Rounds 64-67 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG0 ); \
E1 = ABCD; \
MSG1 = _mm_sha1msg2_epu32( MSG1, MSG0 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 3 ); \
MSG3 = _mm_sha1msg1_epu32( MSG3, MSG0 ); \
MSG2 = _mm_xor_si128( MSG2, MSG0 ); \
\
/* Rounds 68-71 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG1 ); \
E0 = ABCD; \
MSG2 = _mm_sha1msg2_epu32( MSG2, MSG1 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 3 ); \
MSG3 = _mm_xor_si128( MSG3, MSG1 ); \
\
/* Rounds 72-75 */ \
E0 = _mm_sha1nexte_epu32( E0, MSG2 ); \
E1 = ABCD; \
MSG3 = _mm_sha1msg2_epu32( MSG3, MSG2 ); \
ABCD = _mm_sha1rnds4_epu32( ABCD, E0, 3 ); \
\
/* Rounds 76-79 */ \
E1 = _mm_sha1nexte_epu32( E1, MSG3 ); \
E0 = ABCD; \
ABCD = _mm_sha1rnds4_epu32( ABCD, E1, 3 ); \
\
/* Combine state */ \
E0 = _mm_sha1nexte_epu32( E0, E0_SAVE ); \
ABCD = _mm_add_epi32( ABCD, ABCD_SAVE ); \
\
/* Save state */ \
ABCD = _mm_shuffle_epi32( ABCD, 0x1B ); \
_mm_store_si128( (__m128i*) state_out, ABCD ); \
state_out[4] = _mm_extract_epi32( E0, 3 ); \
}
void sha1_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) casti_v128( m, i )
sha1_opt_rounds( state_out, input, state_in );
#undef load_msg
}
void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
const __m128i MASK = _mm_set_epi64x( 0x0001020304050607ULL,
0x08090a0b0c0d0e0fULL );
#define load_msg( m, i ) _mm_shuffle_epi8( casti_v128( m, i ), MASK )
sha1_opt_rounds( state_out, input, state_in );
#undef load_msg
}
#endif
#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA2)
#define sha1_neon_rounds( state_out, data, state_in ) \
{ \
uint32x4_t ABCD, ABCD_SAVED; \
uint32x4_t TMP0, TMP1; \
uint32x4_t MSG0, MSG1, MSG2, MSG3; \
uint32_t E0, E0_SAVED, E1; \
\
/* Load state */ \
ABCD = vld1q_u32( &state_in[0] ); \
E0 = state_in[4]; \
\
/* Save state */ \
ABCD_SAVED = ABCD; \
E0_SAVED = E0; \
\
MSG0 = load_msg( data, 0 ); \
MSG1 = load_msg( data, 1 ); \
MSG2 = load_msg( data, 2 ); \
MSG3 = load_msg( data, 3 ); \
\
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x5A827999 ) ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x5A827999 ) ); \
\
/* Rounds 0-3 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x5A827999 ) ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 4-7 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32(ABCD, E1, TMP1); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0x5A827999 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 8-11 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x5A827999 ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 12-15 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 16-19 */\
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1cq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 20-23 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 24-27 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 28-31 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x6ED9EBA1 ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 32-35 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 36-39 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 40-43 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 44-47 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 48-51 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG2, vdupq_n_u32( 0x8F1BBCDC ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 52-55 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
MSG1 = vsha1su0q_u32( MSG1, MSG2, MSG3 ); \
\
/* Rounds 56-59 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1mq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32( MSG0, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG1 = vsha1su1q_u32( MSG1, MSG0 ); \
MSG2 = vsha1su0q_u32( MSG2, MSG3, MSG0 ); \
\
/* Rounds 60-63 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG1, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG2 = vsha1su1q_u32( MSG2, MSG1 ); \
MSG3 = vsha1su0q_u32( MSG3, MSG0, MSG1 ); \
\
/* Rounds 64-67 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
TMP0 = vaddq_u32(MSG2, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG3 = vsha1su1q_u32( MSG3, MSG2 ); \
MSG0 = vsha1su0q_u32( MSG0, MSG1, MSG2 ); \
\
/* Rounds 68-71 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
TMP1 = vaddq_u32( MSG3, vdupq_n_u32( 0xCA62C1D6 ) ); \
MSG0 = vsha1su1q_u32( MSG0, MSG3 ); \
\
/* Rounds 72-75 */ \
E1 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E0, TMP0 ); \
\
/* Rounds 76-79 */ \
E0 = vsha1h_u32( vgetq_lane_u32( ABCD, 0 ) ); \
ABCD = vsha1pq_u32( ABCD, E1, TMP1 ); \
\
/* Combine state */ \
E0 += E0_SAVED; \
ABCD = vaddq_u32( ABCD_SAVED, ABCD ); \
\
/* Save state */ \
vst1q_u32( &state_out[0], ABCD ); \
state_out[4] = E0; \
}
void sha1_neon_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) );
sha1_neon_rounds( state_out, input, state_in );
#undef load_msg
}
void sha1_neon_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) casti_v128( m, i );
sha1_neon_rounds( state_out, input, state_in );
#undef load_msg
}
#endif

40
algo/sha/sha1-hash.h Normal file
View File

@@ -0,0 +1,40 @@
#ifndef SHA1_HASH_H__
#define SHA1_HASH_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#include "cpuminer-config.h"
#include "sph_sha1.h"
// SHA hooks for sha1, automatically substituted in SPH
#if defined(__x86_64__) && defined(__SHA__)
void sha1_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
#define sha1_transform_le sha1_x86_sha_transform_le
#define sha1_transform_be sha1_x86_sha_transform_be
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
void sha1_neon_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha1_neon_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
#define sha1_transform_le sha1_neon_sha_transform_le
#define sha1_transform_be sha1_neon_sha_transform_be
#else
#define sha1_transform_le sph_sha1_transform_le
#define sha1_transform_be sph_sha1_transform_be
#endif
#define sha1_full sph_sha1_full
#endif
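A minimal usage sketch for these hooks (hypothetical caller; sha1_full resolves to sph_sha1_full in sha1.c below and writes a 20-byte digest):
#include <stdint.h>
#include "sha1-hash.h"
static void example_sha1( uint8_t digest[20], const void *msg, size_t len )
{
   // One-shot SHA-1; the transform hook selected above is picked up
   // automatically by the SPH code path.
   sha1_full( digest, msg, len );
}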

400
algo/sha/sha1.c Normal file
View File

@@ -0,0 +1,400 @@
/* $Id: sha1.c 216 2010-06-08 09:46:57Z tp $ */
/*
* SHA-1 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "simd-utils.h"
#include "sha1-hash.h"
#define F(B, C, D) ((((C) ^ (D)) & (B)) ^ (D))
#define G(B, C, D) ((B) ^ (C) ^ (D))
#define H(B, C, D) (((D) & (C)) | (((D) | (C)) & (B)))
#define I(B, C, D) G(B, C, D)
#define ROTL rol32
//#define ROTL SPH_ROTL32
#define K1 SPH_C32(0x5A827999)
#define K2 SPH_C32(0x6ED9EBA1)
#define K3 SPH_C32(0x8F1BBCDC)
#define K4 SPH_C32(0xCA62C1D6)
static const sph_u32 IV[5] = {
SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
SPH_C32(0x98BADCFE), SPH_C32(0x10325476),
SPH_C32(0xC3D2E1F0)
};
/*
* This macro defines the body for a SHA-1 compression function
* implementation. The "in" parameter should evaluate, when applied to a
* numerical input parameter from 0 to 15, to an expression which yields
* the corresponding input block. The "r" parameter should evaluate to
* an array or pointer expression designating the array of 5 words which
* contains the input and output of the compression function.
*/
#define SHA1_ROUND_BODY(in, r) do { \
sph_u32 A, B, C, D, E; \
sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
\
A = (r)[0]; \
B = (r)[1]; \
C = (r)[2]; \
D = (r)[3]; \
E = (r)[4]; \
\
W00 = in(0); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W00 + K1); \
B = ROTL(B, 30); \
W01 = in(1); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W01 + K1); \
A = ROTL(A, 30); \
W02 = in(2); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W02 + K1); \
E = ROTL(E, 30); \
W03 = in(3); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W03 + K1); \
D = ROTL(D, 30); \
W04 = in(4); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W04 + K1); \
C = ROTL(C, 30); \
W05 = in(5); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W05 + K1); \
B = ROTL(B, 30); \
W06 = in(6); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W06 + K1); \
A = ROTL(A, 30); \
W07 = in(7); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W07 + K1); \
E = ROTL(E, 30); \
W08 = in(8); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W08 + K1); \
D = ROTL(D, 30); \
W09 = in(9); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W09 + K1); \
C = ROTL(C, 30); \
W10 = in(10); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W10 + K1); \
B = ROTL(B, 30); \
W11 = in(11); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W11 + K1); \
A = ROTL(A, 30); \
W12 = in(12); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W12 + K1); \
E = ROTL(E, 30); \
W13 = in(13); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W13 + K1); \
D = ROTL(D, 30); \
W14 = in(14); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W14 + K1); \
C = ROTL(C, 30); \
W15 = in(15); \
E = SPH_T32(ROTL(A, 5) + F(B, C, D) + E + W15 + K1); \
B = ROTL(B, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
D = SPH_T32(ROTL(E, 5) + F(A, B, C) + D + W00 + K1); \
A = ROTL(A, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
C = SPH_T32(ROTL(D, 5) + F(E, A, B) + C + W01 + K1); \
E = ROTL(E, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
B = SPH_T32(ROTL(C, 5) + F(D, E, A) + B + W02 + K1); \
D = ROTL(D, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
A = SPH_T32(ROTL(B, 5) + F(C, D, E) + A + W03 + K1); \
C = ROTL(C, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W04 + K2); \
B = ROTL(B, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W05 + K2); \
A = ROTL(A, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W06 + K2); \
E = ROTL(E, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W07 + K2); \
D = ROTL(D, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W08 + K2); \
C = ROTL(C, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W09 + K2); \
B = ROTL(B, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W10 + K2); \
A = ROTL(A, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W11 + K2); \
E = ROTL(E, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W12 + K2); \
D = ROTL(D, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W13 + K2); \
C = ROTL(C, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W14 + K2); \
B = ROTL(B, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W15 + K2); \
A = ROTL(A, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W00 + K2); \
E = ROTL(E, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W01 + K2); \
D = ROTL(D, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W02 + K2); \
C = ROTL(C, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
E = SPH_T32(ROTL(A, 5) + G(B, C, D) + E + W03 + K2); \
B = ROTL(B, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
D = SPH_T32(ROTL(E, 5) + G(A, B, C) + D + W04 + K2); \
A = ROTL(A, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
C = SPH_T32(ROTL(D, 5) + G(E, A, B) + C + W05 + K2); \
E = ROTL(E, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
B = SPH_T32(ROTL(C, 5) + G(D, E, A) + B + W06 + K2); \
D = ROTL(D, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
A = SPH_T32(ROTL(B, 5) + G(C, D, E) + A + W07 + K2); \
C = ROTL(C, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W08 + K3); \
B = ROTL(B, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W09 + K3); \
A = ROTL(A, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W10 + K3); \
E = ROTL(E, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W11 + K3); \
D = ROTL(D, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W12 + K3); \
C = ROTL(C, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W13 + K3); \
B = ROTL(B, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W14 + K3); \
A = ROTL(A, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W15 + K3); \
E = ROTL(E, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W00 + K3); \
D = ROTL(D, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W01 + K3); \
C = ROTL(C, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W02 + K3); \
B = ROTL(B, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W03 + K3); \
A = ROTL(A, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W04 + K3); \
E = ROTL(E, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W05 + K3); \
D = ROTL(D, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W06 + K3); \
C = ROTL(C, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
E = SPH_T32(ROTL(A, 5) + H(B, C, D) + E + W07 + K3); \
B = ROTL(B, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
D = SPH_T32(ROTL(E, 5) + H(A, B, C) + D + W08 + K3); \
A = ROTL(A, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
C = SPH_T32(ROTL(D, 5) + H(E, A, B) + C + W09 + K3); \
E = ROTL(E, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
B = SPH_T32(ROTL(C, 5) + H(D, E, A) + B + W10 + K3); \
D = ROTL(D, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
A = SPH_T32(ROTL(B, 5) + H(C, D, E) + A + W11 + K3); \
C = ROTL(C, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W12 + K4); \
B = ROTL(B, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W13 + K4); \
A = ROTL(A, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W14 + K4); \
E = ROTL(E, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W15 + K4); \
D = ROTL(D, 30); \
W00 = ROTL(W13 ^ W08 ^ W02 ^ W00, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W00 + K4); \
C = ROTL(C, 30); \
W01 = ROTL(W14 ^ W09 ^ W03 ^ W01, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W01 + K4); \
B = ROTL(B, 30); \
W02 = ROTL(W15 ^ W10 ^ W04 ^ W02, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W02 + K4); \
A = ROTL(A, 30); \
W03 = ROTL(W00 ^ W11 ^ W05 ^ W03, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W03 + K4); \
E = ROTL(E, 30); \
W04 = ROTL(W01 ^ W12 ^ W06 ^ W04, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W04 + K4); \
D = ROTL(D, 30); \
W05 = ROTL(W02 ^ W13 ^ W07 ^ W05, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W05 + K4); \
C = ROTL(C, 30); \
W06 = ROTL(W03 ^ W14 ^ W08 ^ W06, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W06 + K4); \
B = ROTL(B, 30); \
W07 = ROTL(W04 ^ W15 ^ W09 ^ W07, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W07 + K4); \
A = ROTL(A, 30); \
W08 = ROTL(W05 ^ W00 ^ W10 ^ W08, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W08 + K4); \
E = ROTL(E, 30); \
W09 = ROTL(W06 ^ W01 ^ W11 ^ W09, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W09 + K4); \
D = ROTL(D, 30); \
W10 = ROTL(W07 ^ W02 ^ W12 ^ W10, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W10 + K4); \
C = ROTL(C, 30); \
W11 = ROTL(W08 ^ W03 ^ W13 ^ W11, 1); \
E = SPH_T32(ROTL(A, 5) + I(B, C, D) + E + W11 + K4); \
B = ROTL(B, 30); \
W12 = ROTL(W09 ^ W04 ^ W14 ^ W12, 1); \
D = SPH_T32(ROTL(E, 5) + I(A, B, C) + D + W12 + K4); \
A = ROTL(A, 30); \
W13 = ROTL(W10 ^ W05 ^ W15 ^ W13, 1); \
C = SPH_T32(ROTL(D, 5) + I(E, A, B) + C + W13 + K4); \
E = ROTL(E, 30); \
W14 = ROTL(W11 ^ W06 ^ W00 ^ W14, 1); \
B = SPH_T32(ROTL(C, 5) + I(D, E, A) + B + W14 + K4); \
D = ROTL(D, 30); \
W15 = ROTL(W12 ^ W07 ^ W01 ^ W15, 1); \
A = SPH_T32(ROTL(B, 5) + I(C, D, E) + A + W15 + K4); \
C = ROTL(C, 30); \
\
(r)[0] = SPH_T32(r[0] + A); \
(r)[1] = SPH_T32(r[1] + B); \
(r)[2] = SPH_T32(r[2] + C); \
(r)[3] = SPH_T32(r[3] + D); \
(r)[4] = SPH_T32(r[4] + E); \
} while (0)
/*
* One round of SHA-1. The data must be aligned for 32-bit access.
*/
#if ( defined(__x86_64__) && defined(__SHA__) ) || ( defined(__aarch64__) && defined(__ARM_FEATURE_SHA2) )
static void
sha1_round( const unsigned char *data, sph_u32 r[5] )
{
sha1_transform_be( (uint32_t*)r, (uint32_t*)data, (const uint32_t*)r );
}
#else
static void
sha1_round( const unsigned char *data, sph_u32 r[5] )
{
#define SHA1_IN(x) sph_dec32be_aligned(data + (4 * (x)))
SHA1_ROUND_BODY(SHA1_IN, r);
#undef SHA1_IN
}
#endif
/* see sph_sha1.h */
void
sph_sha1_init(void *cc)
{
sph_sha1_context *sc;
sc = cc;
memcpy(sc->val, IV, sizeof IV);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
#define RFUN sha1_round
#define HASH sha1
#define BE32 1
#include "md_helper.c"
/* see sph_sha1.h */
void
sph_sha1_close(void *cc, void *dst)
{
sha1_close(cc, dst, 5);
sph_sha1_init(cc);
}
/* see sph_sha1.h */
void
sph_sha1_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
sha1_addbits_and_close(cc, ub, n, dst, 5);
sph_sha1_init(cc);
}
/* see sph_sha1.h */
void
sph_sha1_comp(const sph_u32 msg[16], sph_u32 val[5])
{
#define SHA1_IN(x) msg[x]
SHA1_ROUND_BODY(SHA1_IN, val);
#undef SHA1_IN
}
void sph_sha1_full( void *hash, const void *msg, size_t len )
{
sph_sha1_context cc;
sph_sha1_init( &cc );
sph_sha1( &cc, msg, len );
sph_sha1_close( &cc, hash );
}

View File

@@ -1,681 +0,0 @@
/*
* Copyright 2011 ArtForz
* Copyright 2011-2013 pooler
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version. See COPYING for more details.
*/
#include "sha256d-4way.h"
#include <string.h>
#include <inttypes.h>
#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
#define EXTERN_SHA256
#endif
static const uint32_t sha256_h[8] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
static const uint32_t sha256_k[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
void sha256_init(uint32_t *state)
{
memcpy(state, sha256_h, 32);
}
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
do { \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1; \
} while (0)
/* Adjusted round function for rotating state */
#define RNDr(S, W, i) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + sha256_k[i])
#ifndef EXTERN_SHA256
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
void sha256_transform(uint32_t *state, const uint32_t *block, int swap)
{
uint32_t W[64];
uint32_t S[8];
uint32_t t0, t1;
int i;
/* 1. Prepare message schedule W. */
if (swap) {
for (i = 0; i < 16; i++)
W[i] = swab32(block[i]);
} else
memcpy(W, block, 64);
for (i = 16; i < 64; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
}
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
RNDr(S, W, 0);
RNDr(S, W, 1);
RNDr(S, W, 2);
RNDr(S, W, 3);
RNDr(S, W, 4);
RNDr(S, W, 5);
RNDr(S, W, 6);
RNDr(S, W, 7);
RNDr(S, W, 8);
RNDr(S, W, 9);
RNDr(S, W, 10);
RNDr(S, W, 11);
RNDr(S, W, 12);
RNDr(S, W, 13);
RNDr(S, W, 14);
RNDr(S, W, 15);
RNDr(S, W, 16);
RNDr(S, W, 17);
RNDr(S, W, 18);
RNDr(S, W, 19);
RNDr(S, W, 20);
RNDr(S, W, 21);
RNDr(S, W, 22);
RNDr(S, W, 23);
RNDr(S, W, 24);
RNDr(S, W, 25);
RNDr(S, W, 26);
RNDr(S, W, 27);
RNDr(S, W, 28);
RNDr(S, W, 29);
RNDr(S, W, 30);
RNDr(S, W, 31);
RNDr(S, W, 32);
RNDr(S, W, 33);
RNDr(S, W, 34);
RNDr(S, W, 35);
RNDr(S, W, 36);
RNDr(S, W, 37);
RNDr(S, W, 38);
RNDr(S, W, 39);
RNDr(S, W, 40);
RNDr(S, W, 41);
RNDr(S, W, 42);
RNDr(S, W, 43);
RNDr(S, W, 44);
RNDr(S, W, 45);
RNDr(S, W, 46);
RNDr(S, W, 47);
RNDr(S, W, 48);
RNDr(S, W, 49);
RNDr(S, W, 50);
RNDr(S, W, 51);
RNDr(S, W, 52);
RNDr(S, W, 53);
RNDr(S, W, 54);
RNDr(S, W, 55);
RNDr(S, W, 56);
RNDr(S, W, 57);
RNDr(S, W, 58);
RNDr(S, W, 59);
RNDr(S, W, 60);
RNDr(S, W, 61);
RNDr(S, W, 62);
RNDr(S, W, 63);
/* 4. Mix local working variables into global state */
for (i = 0; i < 8; i++)
state[i] += S[i];
}
#endif /* EXTERN_SHA256 */
static const uint32_t sha256d_hash1[16] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000100
};
// This recomputes the entire hash from scratch because the main function
// only performs 56 rounds.
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{
uint32_t S[16];
int i;
sha256_init(S);
sha256_transform(S, data, 0);
sha256_transform(S, data + 16, 0);
memcpy(S + 8, sha256d_hash1 + 8, 32);
sha256_init(hash);
sha256_transform(hash, S, 0);
for (i = 0; i < 8; i++)
hash[i] = swab32(hash[i]);
}
/*
#if defined (__SHA__)
#include "algo/sha/sph_sha2.h"
void sha256d(unsigned char *hash, const unsigned char *data, int len)
{
sph_sha256_context ctx __attribute__ ((aligned (64)));
sph_sha256_init( &ctx );
sph_sha256( &ctx, data, len );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, hash );
}
#else
void sha256d(unsigned char *hash, const unsigned char *data, int len)
{
uint32_t S[16], T[16];
int i, r;
sha256_init(S);
for (r = len; r > -9; r -= 64) {
if (r < 64)
memset(T, 0, 64);
memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
if (r >= 0 && r < 64)
((unsigned char *)T)[r] = 0x80;
for (i = 0; i < 16; i++)
T[i] = be32dec(T + i);
if (r < 56)
T[15] = 8 * len;
sha256_transform(S, T, 0);
}
memcpy(S + 8, sha256d_hash1 + 8, 32);
sha256_init(T);
sha256_transform(T, S, 0);
for (i = 0; i < 8; i++)
be32enc((uint32_t *)hash + i, T[i]);
}
#endif
*/
static inline void sha256d_preextend(uint32_t *W)
{
W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
W[18] = s1(W[16]) + W[11] + W[ 2];
W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
W[20] = W[13] + s0(W[ 5]) + W[ 4];
W[21] = W[14] + s0(W[ 6]) + W[ 5];
W[22] = W[15] + s0(W[ 7]) + W[ 6];
W[23] = W[16] + s0(W[ 8]) + W[ 7];
W[24] = W[17] + s0(W[ 9]) + W[ 8];
W[25] = s0(W[10]) + W[ 9];
W[26] = s0(W[11]) + W[10];
W[27] = s0(W[12]) + W[11];
W[28] = s0(W[13]) + W[12];
W[29] = s0(W[14]) + W[13];
W[30] = s0(W[15]) + W[14];
W[31] = s0(W[16]) + W[15];
}
static inline void sha256d_prehash(uint32_t *S, const uint32_t *W)
{
uint32_t t0, t1;
RNDr(S, W, 0);
RNDr(S, W, 1);
RNDr(S, W, 2);
}
#ifdef EXTERN_SHA256
void sha256d_ms(uint32_t *hash, uint32_t *W,
const uint32_t *midstate, const uint32_t *prehash);
#else
static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
const uint32_t *midstate, const uint32_t *prehash)
{
uint32_t S[64];
uint32_t t0, t1;
int i;
S[18] = W[18];
S[19] = W[19];
S[20] = W[20];
S[22] = W[22];
S[23] = W[23];
S[24] = W[24];
S[30] = W[30];
S[31] = W[31];
W[18] += s0(W[3]);
W[19] += W[3];
W[20] += s1(W[18]);
W[21] = s1(W[19]);
W[22] += s1(W[20]);
W[23] += s1(W[21]);
W[24] += s1(W[22]);
W[25] = s1(W[23]) + W[18];
W[26] = s1(W[24]) + W[19];
W[27] = s1(W[25]) + W[20];
W[28] = s1(W[26]) + W[21];
W[29] = s1(W[27]) + W[22];
W[30] += s1(W[28]) + W[23];
W[31] += s1(W[29]) + W[24];
for (i = 32; i < 64; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
}
memcpy(S, prehash, 32);
RNDr(S, W, 3);
RNDr(S, W, 4);
RNDr(S, W, 5);
RNDr(S, W, 6);
RNDr(S, W, 7);
RNDr(S, W, 8);
RNDr(S, W, 9);
RNDr(S, W, 10);
RNDr(S, W, 11);
RNDr(S, W, 12);
RNDr(S, W, 13);
RNDr(S, W, 14);
RNDr(S, W, 15);
RNDr(S, W, 16);
RNDr(S, W, 17);
RNDr(S, W, 18);
RNDr(S, W, 19);
RNDr(S, W, 20);
RNDr(S, W, 21);
RNDr(S, W, 22);
RNDr(S, W, 23);
RNDr(S, W, 24);
RNDr(S, W, 25);
RNDr(S, W, 26);
RNDr(S, W, 27);
RNDr(S, W, 28);
RNDr(S, W, 29);
RNDr(S, W, 30);
RNDr(S, W, 31);
RNDr(S, W, 32);
RNDr(S, W, 33);
RNDr(S, W, 34);
RNDr(S, W, 35);
RNDr(S, W, 36);
RNDr(S, W, 37);
RNDr(S, W, 38);
RNDr(S, W, 39);
RNDr(S, W, 40);
RNDr(S, W, 41);
RNDr(S, W, 42);
RNDr(S, W, 43);
RNDr(S, W, 44);
RNDr(S, W, 45);
RNDr(S, W, 46);
RNDr(S, W, 47);
RNDr(S, W, 48);
RNDr(S, W, 49);
RNDr(S, W, 50);
RNDr(S, W, 51);
RNDr(S, W, 52);
RNDr(S, W, 53);
RNDr(S, W, 54);
RNDr(S, W, 55);
RNDr(S, W, 56);
RNDr(S, W, 57);
RNDr(S, W, 58);
RNDr(S, W, 59);
RNDr(S, W, 60);
RNDr(S, W, 61);
RNDr(S, W, 62);
RNDr(S, W, 63);
for (i = 0; i < 8; i++)
S[i] += midstate[i];
W[18] = S[18];
W[19] = S[19];
W[20] = S[20];
W[22] = S[22];
W[23] = S[23];
W[24] = S[24];
W[30] = S[30];
W[31] = S[31];
memcpy(S + 8, sha256d_hash1 + 8, 32);
S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0];
S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1];
S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2];
S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3];
S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4];
S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5];
S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6];
S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7];
S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8];
S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9];
S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10];
S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11];
S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12];
S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13];
S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14];
S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15];
for (i = 32; i < 60; i += 2) {
S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15];
}
S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44];
sha256_init(hash);
RNDr(hash, S, 0);
RNDr(hash, S, 1);
RNDr(hash, S, 2);
RNDr(hash, S, 3);
RNDr(hash, S, 4);
RNDr(hash, S, 5);
RNDr(hash, S, 6);
RNDr(hash, S, 7);
RNDr(hash, S, 8);
RNDr(hash, S, 9);
RNDr(hash, S, 10);
RNDr(hash, S, 11);
RNDr(hash, S, 12);
RNDr(hash, S, 13);
RNDr(hash, S, 14);
RNDr(hash, S, 15);
RNDr(hash, S, 16);
RNDr(hash, S, 17);
RNDr(hash, S, 18);
RNDr(hash, S, 19);
RNDr(hash, S, 20);
RNDr(hash, S, 21);
RNDr(hash, S, 22);
RNDr(hash, S, 23);
RNDr(hash, S, 24);
RNDr(hash, S, 25);
RNDr(hash, S, 26);
RNDr(hash, S, 27);
RNDr(hash, S, 28);
RNDr(hash, S, 29);
RNDr(hash, S, 30);
RNDr(hash, S, 31);
RNDr(hash, S, 32);
RNDr(hash, S, 33);
RNDr(hash, S, 34);
RNDr(hash, S, 35);
RNDr(hash, S, 36);
RNDr(hash, S, 37);
RNDr(hash, S, 38);
RNDr(hash, S, 39);
RNDr(hash, S, 40);
RNDr(hash, S, 41);
RNDr(hash, S, 42);
RNDr(hash, S, 43);
RNDr(hash, S, 44);
RNDr(hash, S, 45);
RNDr(hash, S, 46);
RNDr(hash, S, 47);
RNDr(hash, S, 48);
RNDr(hash, S, 49);
RNDr(hash, S, 50);
RNDr(hash, S, 51);
RNDr(hash, S, 52);
RNDr(hash, S, 53);
RNDr(hash, S, 54);
RNDr(hash, S, 55);
RNDr(hash, S, 56);
hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5])
+ S[57] + sha256_k[57];
hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4])
+ S[58] + sha256_k[58];
hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3])
+ S[59] + sha256_k[59];
hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2])
+ S[60] + sha256_k[60]
+ sha256_h[7];
}
#endif /* EXTERN_SHA256 */
#ifdef HAVE_SHA256_4WAY
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_4way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(128) data[4 * 64];
uint32_t _ALIGN(32) hash[4 * 8];
uint32_t _ALIGN(32) midstate[4 * 8];
uint32_t _ALIGN(32) prehash[4 * 8];
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
int i, j;
memcpy(data, pdata + 16, 64);
sha256d_preextend(data);
for (i = 31; i >= 0; i--)
for (j = 0; j < 4; j++)
data[i * 4 + j] = data[i];
sha256_init(midstate);
sha256_transform(midstate, pdata, 0);
memcpy(prehash, midstate, 32);
sha256d_prehash(prehash, pdata + 16);
for (i = 7; i >= 0; i--) {
for (j = 0; j < 4; j++) {
midstate[i * 4 + j] = midstate[i];
prehash[i * 4 + j] = prehash[i];
}
}
do {
for (i = 0; i < 4; i++)
data[4 * 3 + i] = ++n;
sha256d_ms_4way(hash, data, midstate, prehash);
for (i = 0; i < 4; i++) {
if (swab32(hash[4 * 7 + i]) <= Htarg) {
pdata[19] = data[4 * 3 + i];
sha256d_80_swap(hash, pdata);
if ( fulltest( hash, ptarget ) && !opt_benchmark )
submit_solution( work, hash, mythr );
}
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif /* HAVE_SHA256_4WAY */
#ifdef HAVE_SHA256_8WAY
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_8way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(128) data[8 * 64];
uint32_t _ALIGN(32) hash[8 * 8];
uint32_t _ALIGN(32) midstate[8 * 8];
uint32_t _ALIGN(32) prehash[8 * 8];
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
int i, j;
memcpy(data, pdata + 16, 64);
sha256d_preextend(data);
for (i = 31; i >= 0; i--)
for (j = 0; j < 8; j++)
data[i * 8 + j] = data[i];
sha256_init(midstate);
sha256_transform(midstate, pdata, 0);
memcpy(prehash, midstate, 32);
sha256d_prehash(prehash, pdata + 16);
for (i = 7; i >= 0; i--) {
for (j = 0; j < 8; j++) {
midstate[i * 8 + j] = midstate[i];
prehash[i * 8 + j] = prehash[i];
}
}
do {
for (i = 0; i < 8; i++)
data[8 * 3 + i] = ++n;
sha256d_ms_8way(hash, data, midstate, prehash);
for (i = 0; i < 8; i++) {
if (swab32(hash[8 * 7 + i]) <= Htarg) {
pdata[19] = data[8 * 3 + i];
sha256d_80_swap(hash, pdata);
if ( fulltest( hash, ptarget ) && !opt_benchmark )
submit_solution( work, hash, mythr );
}
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif /* HAVE_SHA256_8WAY */
int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(128) data[64];
uint32_t _ALIGN(32) hash[8];
uint32_t _ALIGN(32) midstate[8];
uint32_t _ALIGN(32) prehash[8];
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
#ifdef HAVE_SHA256_8WAY
if ( sha256_use_8way() )
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
#endif
#ifdef HAVE_SHA256_4WAY
if ( sha256_use_4way() )
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
#endif
memcpy(data, pdata + 16, 64);
sha256d_preextend(data);
sha256_init(midstate);
sha256_transform(midstate, pdata, 0);
memcpy(prehash, midstate, 32);
sha256d_prehash(prehash, pdata + 16);
do {
data[3] = ++n;
sha256d_ms(hash, data, midstate, prehash);
if (unlikely(swab32(hash[7]) <= Htarg))
{
pdata[19] = data[3];
sha256d_80_swap(hash, pdata);
if ( fulltest(hash, ptarget) && !opt_benchmark )
submit_solution( work, hash, mythr );
}
} while (likely(n < max_nonce && !work_restart[thr_id].restart));
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
#elif defined(SHA256D_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
//#elif defined(SHA256D_8WAY)
// gate->scanhash = (void*)&scanhash_sha256d_8way;
#else
gate->scanhash = (void*)&scanhash_sha256d_pooler;
// gate->scanhash = (void*)&scanhash_sha256d_4way;
#endif
// gate->hash = (void*)&sha256d;
return true;
};

View File

@@ -319,7 +319,7 @@ int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
v128_t W[16]; memcpy_128( W, data, 16 );
v128_t W[16]; v128_memcpy( W, data, 16 );
A = v128_load( state_in );
B = v128_load( state_in+1 );

View File

@@ -587,8 +587,8 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );
@@ -1200,7 +1200,7 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
MSG2_Y = vsha256su1q_u32( MSG2_Y, MSG0_Y, MSG1_Y ); \
/* Rounds 44-47 */ \
MSG3_X = vsha256su0q_u32( MSG3_X, MSG0_X ); \
MSG3_Y = vsha256su0q_u32( MSG3_X, MSG0_Y ); \
MSG3_Y = vsha256su0q_u32( MSG3_Y, MSG0_Y ); \
TMP2_X = STATE0_X; \
TMP2_Y = STATE0_Y; \
TMP0_X = vaddq_u32( MSG0_X, casti_v128( K256, 12 ) ); \

View File

@@ -1,9 +1,9 @@
#include "sha256d-4way.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha256d.h"
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
@@ -383,8 +383,6 @@ int scanhash_sha256d_4x32( struct work *work, const uint32_t max_nonce,
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
memset( block, 0, 16*4*4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
vdata[16+3] = v128_set32( n+3, n+2, n+1, n );
@@ -412,7 +410,6 @@ int scanhash_sha256d_4x32( struct work *work, const uint32_t max_nonce,
do
{
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
// sha256_4x32_transform_le( block, vdata+16, mhash1 );
sha256_4x32_transform_le( hash32, block, iv );
for ( int lane = 0; lane < 4; lane++ )

View File

@@ -1,3 +1,4 @@
#include "sha256-hash.h"
#include "sha256d.h"
void sha256d( void *hash, const void *data, int len )
@@ -5,4 +6,24 @@ void sha256d( void *hash, const void *data, int len )
sha256_full( hash, data, len );
sha256_full( hash, hash, 32 );
}
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
#elif defined(SHA256D_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;
#elif defined(SHA256D_4WAY)
gate->scanhash = (void*)&scanhash_sha256d_4x32;
#else
gate->hash = (void*)&sha256d;
#endif
return true;
};

View File

@@ -1,7 +1,58 @@
#ifndef __SHA256D_4WAY_H__
#define __SHA256D_4WAY_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#include <string.h>
#include <inttypes.h>
#include "sha256-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define SHA256D_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#define SHA256D_4WAY 1
#endif
bool register_sha256d_algo( algo_gate_t* gate );
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_8WAY)
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4x32( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_SHA)
int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_NEON_SHA2)
int scanhash_sha256d_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void sha256d( void *hash, const void *data, int len );
bool register_sha256d_algo( algo_gate_t* gate );
#endif
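The feature ladder above resolves at compile time to the widest implementation the build targets. A self-contained probe using only standard predefined macros shows how the same selection plays out for a given set of compiler flags (names mirror the defines above; the program itself is just an illustration):
#include <stdio.h>
int main(void)
{
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
    puts( "would select SHA256D_16WAY (AVX-512)" );
#elif defined(__SHA__)
    puts( "would select SHA256D_SHA (x86 SHA extensions)" );
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
    puts( "would select SHA256D_NEON_SHA2 (AArch64 SHA2)" );
#elif defined(__AVX2__)
    puts( "would select SHA256D_8WAY (AVX2)" );
#else
    puts( "would select SHA256D_4WAY (SSE2/NEON baseline)" );
#endif
    return 0;
}
Compiling with, say, -march=native reports the path this header would pick on the build machine.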

View File

@@ -380,8 +380,6 @@ int scanhash_sha256dt_4x32( struct work *work, const uint32_t max_nonce,
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
memset( block, 0, 16*4*4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
vdata[16+3] = v128_set32( n+3, n+2, n+1, n );

View File

@@ -392,8 +392,6 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
memset( block, 0, 16*4*4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
vdata[16+3] = v128_set32( n+3, n+2, n+1, n );

View File

@@ -692,7 +692,7 @@ do { \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define MAJ(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
_mm256_xor_si256( Y, _mm256_and_si256( (X_xor_Y = _mm256_xor_si256( X, Y )), \
Y_xor_Z ) )
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
@@ -892,7 +892,7 @@ void sha512_4x64_ctx( sha512_4x64_context *sc, void *dst, const void *data,
v128_xor( v128_and( v128_xor( Y, Z ), X ), Z )
#define MAJ_2x64(X, Y, Z) \
v128_xor( Y, v128_and( X_xor_Y = v128_xor( X, Y ), Y_xor_Z ) )
v128_xor( Y, v128_and( (X_xor_Y = v128_xor( X, Y ) ), Y_xor_Z ) )
#define SHA3_2x64_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
@@ -917,34 +917,20 @@ sha512_2x64_round( sha512_2x64_context *ctx, v128u64_t *in, v128u64_t r[8] )
v128u64_t W[80];
v128_block_bswap64( W , in );
v128_block_bswap64( (&W[8]), (&in[8]) );
v128_block_bswap64( W+8, in+8 );
for ( i = 16; i < 80; i++ )
W[i] = v128_add4_64( SSG5_0_2x64( W[i-15] ), SSG5_1_2x64( W[i-2] ),
W[ i- 7 ], W[ i-16 ] );
if ( ctx->initialized )
{
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
}
else
{
A = v128_64( 0x6A09E667F3BCC908 );
B = v128_64( 0xBB67AE8584CAA73B );
C = v128_64( 0x3C6EF372FE94F82B );
D = v128_64( 0xA54FF53A5F1D36F1 );
E = v128_64( 0x510E527FADE682D1 );
F = v128_64( 0x9B05688C2B3E6C1F );
G = v128_64( 0x1F83D9ABFB41BD6B );
H = v128_64( 0x5BE0CD19137E2179 );
}
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
Y_xor_Z = v128_xor( B, C );
@@ -960,35 +946,28 @@ sha512_2x64_round( sha512_2x64_context *ctx, v128u64_t *in, v128u64_t r[8] )
SHA3_2x64_STEP( B, C, D, E, F, G, H, A, i + 7 );
}
if ( ctx->initialized )
{
r[0] = v128_add64( r[0], A );
r[1] = v128_add64( r[1], B );
r[2] = v128_add64( r[2], C );
r[3] = v128_add64( r[3], D );
r[4] = v128_add64( r[4], E );
r[5] = v128_add64( r[5], F );
r[6] = v128_add64( r[6], G );
r[7] = v128_add64( r[7], H );
}
else
{
ctx->initialized = true;
r[0] = v128_add64( A, v128_64( 0x6A09E667F3BCC908 ) );
r[1] = v128_add64( B, v128_64( 0xBB67AE8584CAA73B ) );
r[2] = v128_add64( C, v128_64( 0x3C6EF372FE94F82B ) );
r[3] = v128_add64( D, v128_64( 0xA54FF53A5F1D36F1 ) );
r[4] = v128_add64( E, v128_64( 0x510E527FADE682D1 ) );
r[5] = v128_add64( F, v128_64( 0x9B05688C2B3E6C1F ) );
r[6] = v128_add64( G, v128_64( 0x1F83D9ABFB41BD6B ) );
r[7] = v128_add64( H, v128_64( 0x5BE0CD19137E2179 ) );
}
r[0] = v128_add64( r[0], A );
r[1] = v128_add64( r[1], B );
r[2] = v128_add64( r[2], C );
r[3] = v128_add64( r[3], D );
r[4] = v128_add64( r[4], E );
r[5] = v128_add64( r[5], F );
r[6] = v128_add64( r[6], G );
r[7] = v128_add64( r[7], H );
}
void sha512_2x64_init( sha512_2x64_context *sc )
{
sc->initialized = false;
sc->val[0] = v128_64( 0x6A09E667F3BCC908 );
sc->val[1] = v128_64( 0xBB67AE8584CAA73B );
sc->val[2] = v128_64( 0x3C6EF372FE94F82B );
sc->val[3] = v128_64( 0xA54FF53A5F1D36F1 );
sc->val[4] = v128_64( 0x510E527FADE682D1 );
sc->val[5] = v128_64( 0x9B05688C2B3E6C1F );
sc->val[6] = v128_64( 0x1F83D9ABFB41BD6B );
sc->val[7] = v128_64( 0x5BE0CD19137E2179 );
sc->count = 0;
sc->initialized = true;
}
void sha512_2x64_update( sha512_2x64_context *sc, const void *data, size_t len )
@@ -1036,7 +1015,7 @@ void sha512_2x64_close( sha512_2x64_context *sc, void *dst )
v128_memset_zero( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = v128_bswap64( v128_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_bswap64( v128_64( sc->count << 3 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_bswap64( v128_64( sc->count << 3 ) );
sha512_2x64_round( sc, sc->buf, sc->val );
v128_block_bswap64( castp_v128u64( dst ), sc->val );

View File

@@ -8,6 +8,8 @@
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA512256D_2WAY 1
#endif
#if defined(SHA512256D_8WAY)
@@ -108,14 +110,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = v256_64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
casti_m256i( vdata,9 ) = mm256_intrlv_blend_32( _mm256_set_epi32(
n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
do
{
sha512256d_4way_init( &ctx );
@@ -136,7 +137,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, four );
casti_m256i( vdata,9 ) = _mm256_add_epi32( casti_m256i( vdata,9 ), four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
@@ -145,6 +146,73 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SHA512256D_2WAY)
static void sha512256d_2x64_init( sha512_2x64_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = v128_64( 0x22312194FC2BF72C );
ctx->val[1] = v128_64( 0x9F555FA3C84C64C2 );
ctx->val[2] = v128_64( 0x2393B86B6F53B151 );
ctx->val[3] = v128_64( 0x963877195940EABD );
ctx->val[4] = v128_64( 0x96283EE2A88EFFE3 );
ctx->val[5] = v128_64( 0xBE5E1E2553863992 );
ctx->val[6] = v128_64( 0x2B0199FC2C85B8AA );
ctx->val[7] = v128_64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
sha512_2x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t two = v128_64( 0x0000000200000000 );
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
sha512256d_2x64_init( &ctx );
sha512_2x64_update( &ctx, vdata, 80 );
sha512_2x64_close( &ctx, hash );
sha512256d_2x64_init( &ctx );
sha512_2x64_update( &ctx, hash, 32 );
sha512_2x64_close( &ctx, hash );
for ( int lane = 0; lane < 2; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, two );
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#else
#include "sph_sha2.h"
@@ -209,11 +277,13 @@ int scanhash_sha512256d( struct work *work, uint32_t max_nonce,
bool register_sha512256d_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA512256D_8WAY)
gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)
gate->scanhash = (void*)&scanhash_sha512256d_4way;
#elif defined(SHA512256D_2WAY)
gate->scanhash = (void*)&scanhash_sha512256d_2x64;
#else
gate->scanhash = (void*)&scanhash_sha512256d;
#endif
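sha512256d is double SHA-512/256: the 2-way init above seeds the lanes with the SHA-512/256 initial hash values (the IV beginning 0x22312194FC2BF72C) rather than the generic SHA-512 IV. An independent scalar cross-check can be built on OpenSSL's EVP interface; this sketch assumes OpenSSL 1.1.1 or later for EVP_sha512_256() and is not part of this codebase.
#include <openssl/evp.h>
#include <stdio.h>
/* Reference double SHA-512/256 (sketch; link with -lcrypto). */
static void sha512256d_ref( unsigned char out[32], const void *data, size_t len )
{
    unsigned char t[32];
    unsigned int  outlen;
    EVP_Digest( data, len, t,   &outlen, EVP_sha512_256(), NULL );  /* first pass  */
    EVP_Digest( t,    32,  out, &outlen, EVP_sha512_256(), NULL );  /* second pass */
}
int main(void)
{
    unsigned char h[32];
    sha512256d_ref( h, "abc", 3 );
    for ( int i = 0; i < 32; i++ ) printf( "%02x", h[i] );
    printf( "\n" );
    return 0;
}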

algo/sha/sph_sha1.h (new file, 133 lines)
View File

@@ -0,0 +1,133 @@
/* $Id: sph_sha1.h 216 2010-06-08 09:46:57Z tp $ */
/**
* SHA-1 interface.
*
* SHA-1 is described in FIPS 180-1 (now superseded by FIPS 180-2, but the
* description of SHA-1 is still included and has not changed). FIPS
* standards can be found at: http://csrc.nist.gov/publications/fips/
*
* @warning A theoretical collision attack against SHA-1, with work
* factor 2^63, has been published. SHA-1 should not be used in new
* protocol designs.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_sha1.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_SHA1_H__
#define SPH_SHA1_H__
#include <stddef.h>
#include "compat/sph_types.h"
/**
* Output size (in bits) for SHA-1.
*/
#define SPH_SIZE_sha1 160
/**
* This structure is a context for SHA-1 computations: it contains the
* intermediate values and some data from the last entered block. Once
* a SHA-1 computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running SHA-1 computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
sph_u32 val[5];
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_sha1_context;
/**
* Initialize a SHA-1 context. This process performs no memory allocation.
*
* @param cc the SHA-1 context (pointer to a <code>sph_sha1_context</code>)
*/
void sph_sha1_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHA-1 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_sha1(void *cc, const void *data, size_t len);
/**
* Terminate the current SHA-1 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accommodate the result (20 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHA-1 context
* @param dst the destination buffer
*/
void sph_sha1_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (20 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHA-1 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_sha1_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
/**
* Apply the SHA-1 compression function on the provided data. The
* <code>msg</code> parameter contains the 16 32-bit input blocks,
* as numerical values (hence after the big-endian decoding). The
* <code>val</code> parameter contains the 5 32-bit input blocks for
* the compression function; the output is written in place in this
* array.
*
* @param msg the message block (16 values)
* @param val the function 160-bit input and output
*/
void sph_sha1_comp(const sph_u32 msg[16], sph_u32 val[5]);
void sph_sha1_full( void *hash, const void *msg, size_t len );
#endif
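The header above documents a conventional init/update/close interface. A minimal usage sketch against those declarations; the expected digest is the standard SHA-1 test vector for "abc".
#include <stdio.h>
#include "sph_sha1.h"
int main(void)
{
    unsigned char dgst[20];
    sph_sha1_context cc;
    sph_sha1_init( &cc );
    sph_sha1( &cc, "abc", 3 );     /* may be called any number of times */
    sph_sha1_close( &cc, dgst );   /* writes 20 bytes and reinitializes cc */
    for ( int i = 0; i < 20; i++ ) printf( "%02x", dgst[i] );
    printf( "\n" );   /* expect a9993e364706816aba3e25717850c26c9cd0d89d */
    return 0;
}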

View File

@@ -34,8 +34,6 @@
#include <string.h>
#include "shabal-hash-4way.h"
//#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define DECL_STATE16 \
@@ -47,8 +45,6 @@
C8, C9, CA, CB, CC, CD, CE, CF; \
__m512i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m512i FIVE = v512_32( 5 ); \
const __m512i THREE = v512_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE16(state) do \
@@ -292,11 +288,21 @@ do { \
mm512_swap1024_512( BF, CF ); \
} while (0)
static inline __m512i v512_mult_x3( const __m512i x )
{
return _mm512_add_epi32( x, _mm512_slli_epi32( x, 1 ) );
}
static inline __m512i v512_mult_x5( const __m512i x )
{
return _mm512_add_epi32( x, _mm512_slli_epi32( x, 2 ) );
}
#define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
_mm512_mullo_epi32( mm512_xor3( xa0, xc, \
_mm512_mullo_epi32( mm512_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
v512_mult_x3( mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
} while (0)
@@ -644,8 +650,6 @@ shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
C8, C9, CA, CB, CC, CD, CE, CF; \
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m256i FIVE = v256_32( 5 ); \
const __m256i THREE = v256_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE8(state) do \
@@ -889,11 +893,21 @@ do { \
mm256_swap512_256( BF, CF ); \
} while (0)
static inline __m256i v256_mult_x3( const __m256i x )
{
return _mm256_add_epi32( x, _mm256_slli_epi32( x, 1 ) );
}
static inline __m256i v256_mult_x5( const __m256i x )
{
return _mm256_add_epi32( x, _mm256_slli_epi32( x, 2 ) );
}
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
v256_mult_x3( mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0)
@@ -1226,15 +1240,13 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // AVX2
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__SSE2__) || defined(__ARM_NEON)
#define DECL_STATE \
v128u32_t A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB; \
v128u32_t B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, BA, BB, BC, BD, BE, BF; \
v128u32_t C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \
v128u32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; \
const v128u32_t FIVE = v128_32( 5 ); \
const v128u32_t THREE = v128_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE( state ) \
@@ -1479,12 +1491,22 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
v128_swap256_128( BF, CF ); \
}
static inline v128_t v128_mult_x3( const v128_t x )
{
return v128_add32( x, v128_sl32( x, 1 ) );
}
static inline v128_t v128_mult_x5( const v128_t x )
{
return v128_add32( x, v128_sl32( x, 2 ) );
}
#define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
{ \
xa0 = v128_xor3( xm, xb1, v128_xorandnot( \
v128_mul32( v128_xor3( xa0, xc, \
v128_mul32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
v128_mult_x3( v128_xor3( xa0, xc, \
v128_mult_x5( v128_rol32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \
}
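The rewritten PERM_ELT above replaces the vector multiplications by the constants 5 and 3 with the shift-and-add helpers v*_mult_x5 / v*_mult_x3. The substitution relies only on identities that hold in 32-bit modular arithmetic; a scalar sanity check:
#include <assert.h>
#include <stdint.h>
/* x*3 == x + (x<<1) and x*5 == x + (x<<2) modulo 2^32, which is all the
 * Shabal permutation needs from these constants. */
int main(void)
{
    const uint32_t xs[] = { 0, 1, 0x12345678u, 0xDEADBEEFu, 0xFFFFFFFFu };
    for ( unsigned i = 0; i < sizeof(xs)/sizeof(xs[0]); i++ )
    {
        const uint32_t x = xs[i];
        assert( (uint32_t)(x * 3u) == (uint32_t)(x + (x << 1)) );
        assert( (uint32_t)(x * 5u) == (uint32_t)(x + (x << 2)) );
    }
    return 0;
}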

View File

@@ -62,7 +62,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
#endif
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct {
v128_t buf[16] __attribute__ ((aligned (64)));

View File

@@ -71,7 +71,7 @@ static const uint32_t IV512[] =
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
{
const __m128i zero = _mm_setzero_si128();
const v128_t zero = v128_zero;
__m256i p0, p1, p2, p3, x;
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
__m256i *m = (__m256i*)msg;
@@ -278,7 +278,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
void shavite512_2way_init( shavite512_2way_context *ctx )
{
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
v128_t *iv = (v128_t*)IV512;
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
@@ -358,7 +358,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
count.u32[3] = ctx->count3;
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -434,7 +434,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
}
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -451,7 +451,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len )
{
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
v128_t *iv = (v128_t*)IV512;
h[0] = mm256_bcast_m128( iv[0] );
h[1] = mm256_bcast_m128( iv[1] );
@@ -524,7 +524,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
}
casti_m256i( buf, 6 ) = mm256_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

View File

@@ -303,7 +303,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
count.u32[3] = ctx->count3;
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -379,7 +379,7 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
}
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
@@ -470,7 +470,7 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
}
casti_m512i( buf, 6 ) = mm512_bcast_m128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

View File

@@ -60,7 +60,6 @@ static const sph_u32 IV512[] = {
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
const v128_t zero = v128_zero;
v128_t p0, p1, p2, p3, x;
v128_t k00, k01, k02, k03, k10, k11, k12, k13;
v128_t *m = (v128_t*)msg;
@@ -76,39 +75,39 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = m[0];
x = v128_xor( p1, k00 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k01 = m[1];
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k02 = m[2];
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k03 = m[3];
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x );
k10 = m[4];
x = v128_xor( p3, k10 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k11 = m[5];
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k12 = m[6];
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k13 = m[7];
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
if ( r == 0 )
@@ -116,8 +115,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = v128_xor( p0, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
if ( r == 1 )
@@ -125,32 +124,32 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
x = v128_aesenc_nokey( x );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p2, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
x = v128_aesenc_nokey( x );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
x = v128_aesenc_nokey( x );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
if ( r == 2 )
@@ -158,78 +157,78 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
// round 2, 6, 10
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p3, k00 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p1, k10 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x );
// round 3, 7, 11
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p2, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
x = v128_aesenc_nokey( x );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p0, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
x = v128_aesenc_nokey( x );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
x = v128_aesenc_nokey( x );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
@@ -237,73 +236,73 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p1, k00 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p3, k10 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x );
}
// round 13
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p0, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
x = v128_aesenc_nokey( x );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p2, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
x = v128_aesenc_nokey( x );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, v128_xor( k11, v128_set32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
x = v128_aesenc_nokey( x );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
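Throughout c512() the pattern v128_aesenc( x, zero ) has been replaced by v128_aesenc_nokey( x ), dropping the explicit zero round key at every call site. A sketch of what such a helper presumably looks like on x86 with AES-NI (hypothetical name; compile with -maes); on AArch64 the helper can skip the key XOR entirely, which is presumably the point of the change.
#include <immintrin.h>
/* Presumed x86 shape of a "no key" AES round: one AESENC with an all-zero
 * round key, so the trailing XOR is a no-op. Sketch only, not repo code. */
static inline __m128i aesenc_nokey_sketch( __m128i x )
{
    return _mm_aesenc_si128( x, _mm_setzero_si128() );
}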

File diff suppressed because it is too large.

View File

@@ -2,23 +2,68 @@
#define SIMD_HASH_2WAY_H__ 1
#include "simd-compat.h"
#include "simd-utils.h"
#if defined(__SSE2__) || defined (__ARM_NEON)
typedef struct
{
uint32_t A[32];
uint8_t buffer[128];
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd512_context __attribute__((aligned(64)));
// datalen is bytes
int simd512_ctx( simd512_context *ctx, void *hashval, const void *data,
int datalen );
int simd512( void *hashval, const void *data, int datalen );
#endif
#if defined(__AVX2__)
#include "simd-utils.h"
typedef struct
{
uint32_t A[ 32*2 ];
uint8_t buffer[ 128*2 ];
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd512_2way_context __attribute__((aligned(128)));
#define simd_2way_context simd512_2way_context
// databitlen is bits
int simd_2way_init( simd_2way_context *state, int hashbitlen );
int simd_2way_update( simd_2way_context *state, const void *data,
int databitlen );
int simd_2way_close( simd_2way_context *state, void *hashval );
int simd_2way_update_close( simd_2way_context *state, void *hashval,
const void *data, int databitlen );
int simd512_2way_ctx( simd512_2way_context *state, void *hashval,
const void *data, int datalen );
#define simd512_2way_full simd512_2way_ctx
int simd512_2way( void *hashval, const void *data, int datalen );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
typedef struct
{
uint32_t A[ 32*4 ];
uint8_t buffer[ 128*4 ];
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd_4way_context __attribute__((aligned(128)));
} simd512_4way_context __attribute__((aligned(128)));
#define simd_4way_context simd512_4way_context
int simd_4way_init( simd_4way_context *state, int hashbitlen );
int simd_4way_update( simd_4way_context *state, const void *data,
@@ -26,29 +71,12 @@ int simd_4way_update( simd_4way_context *state, const void *data,
int simd_4way_close( simd_4way_context *state, void *hashval );
int simd_4way_update_close( simd_4way_context *state, void *hashval,
const void *data, int databitlen );
int simd512_4way_full( simd_4way_context *state, void *hashval,
int simd512_4way_ctx( simd_4way_context *state, void *hashval,
const void *data, int datalen );
#define simd512_4way_full simd512_4way_ctx
int simd512_4way( void *hashval, const void *data, int datalen );
#endif
typedef struct {
uint32_t A[ 32*2 ];
uint8_t buffer[ 128*2 ];
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd_2way_context __attribute__((aligned(128)));
int simd_2way_init( simd_2way_context *state, int hashbitlen );
int simd_2way_update( simd_2way_context *state, const void *data,
int databitlen );
int simd_2way_close( simd_2way_context *state, void *hashval );
int simd_2way_update_close( simd_2way_context *state, void *hashval,
const void *data, int databitlen );
int simd512_2way_full( simd_2way_context *state, void *hashval,
const void *data, int datalen );
#endif
#endif

View File

@@ -159,4 +159,69 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SKEIN_2WAY)
static __thread skein512_2x64_context skein512_2x64_ctx
__attribute__ ((aligned (64)));
void skeinhash_2x64( void *state, const void *input )
{
uint64_t vhash64[8*2] __attribute__ ((aligned (32)));
uint32_t hash0[16] __attribute__ ((aligned (32)));
uint32_t hash1[16] __attribute__ ((aligned (32)));
skein512_2x64_context ctx_skein;
memcpy( &ctx_skein, &skein512_2x64_ctx, sizeof( ctx_skein ) );
skein512_2x64_final16( &ctx_skein, vhash64, input + (64*2) );
dintrlv_2x64( hash0, hash1, vhash64, 512 );
sha256_full( hash0, hash0, 64 );
sha256_full( hash1, hash1, 64 );
intrlv_2x32( state, hash0, hash1, 256 );
}
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*2] __attribute__ ((aligned (32)));
uint32_t hash[8*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash_d7 = &(hash[7<<1]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
v128u32_t *noncev = (v128u32_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &skein512_2x64_ctx, vdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
skeinhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( ( hash_d7[ lane ] <= targ_d7 ) && !bench ) )
{
extr_lane_2x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
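skeinhash_2x64() above runs two interleaved lanes of Skein-512 and then finishes each lane with scalar SHA-256. The de-interleave step, dintrlv_2x64(), splits the shared 2-lane buffer back into per-lane hashes before the SHA-256 calls; a stand-alone sketch of the presumed word-interleaved layout (hypothetical helper name, illustration only):
#include <stdint.h>
/* Presumed 2x64 layout: lanes alternate per 64-bit word in the interleaved
 * buffer (lane0 w0, lane1 w0, lane0 w1, lane1 w1, ...). Sketch only. */
static void dintrlv_2x64_sketch( uint64_t *lane0, uint64_t *lane1,
                                 const uint64_t *src, int bits )
{
    for ( int i = 0; i < bits / 64; i++ )
    {
        lane0[i] = src[ 2*i ];
        lane1[i] = src[ 2*i + 1 ];
    }
}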

View File

@@ -3,16 +3,20 @@
bool register_skein_algo( algo_gate_t* gate )
{
#if defined (SKEIN_8WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_skein_8way;
gate->hash = (void*)&skeinhash_8way;
#elif defined (SKEIN_4WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#elif defined(SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#elif defined(SKEIN_2WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein_2x64;
gate->hash = (void*)&skeinhash_2x64;
#else
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif
@@ -21,16 +25,15 @@ bool register_skein_algo( algo_gate_t* gate )
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined (SKEIN_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SKEIN_8WAY)
gate->scanhash = (void*)&scanhash_skein2_8way;
gate->hash = (void*)&skein2hash_8way;
#elif defined (SKEIN_4WAY)
#elif defined(SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
#elif defined(SKEIN_2WAY)
gate->scanhash = (void*)&scanhash_skein2_2x64;
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif
return true;
};

View File

@@ -7,6 +7,8 @@
#define SKEIN_8WAY 1
#elif defined(__AVX2__)
#define SKEIN_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SKEIN_2WAY 1
#endif
#if defined(SKEIN_8WAY)
@@ -29,6 +31,16 @@ void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#elif defined(SKEIN_2WAY)
void skeinhash_2x64( void *output, const void *input );
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void skein2hash_2x64( void *output, const void *input );
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
#else
void skeinhash( void *output, const void *input );

View File

@@ -675,11 +675,13 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
// Close
unsigned et;
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_8WAY( et, ptr );
}
memset_zero_512( buf, buf_size >> 3 );
bcount = 0;
@@ -970,11 +972,13 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
// Close
unsigned et;
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
if ( ptr )
{
unsigned et;
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
}
memset_zero_256( buf, buf_size >> 3 );
bcount = 0;
@@ -1364,11 +1368,13 @@ skein512_2x64_full( skein512_2x64_context *sc, void *out, const void *data,
// Close
unsigned et;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
if ( ptr )
{
unsigned et;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_2WAY( et, ptr );
}
v128_memset_zero( buf, buf_size >> 3 );
bcount = 0;

View File

@@ -5,19 +5,6 @@
#if defined(SKEIN_8WAY)
static __thread skein512_8way_context skein512_8way_ctx
__attribute__ ((aligned (64)));
void skein2hash_8way( void *output, const void *input )
{
uint64_t hash[16*8] __attribute__ ((aligned (128)));
skein512_8way_context ctx;
memcpy( &ctx, &skein512_8way_ctx, sizeof( ctx ) );
skein512_8way_final16( &ctx, hash, input + (64*8) );
skein512_8way_full( &ctx, output, hash, 64 );
}
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -68,19 +55,6 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
#elif defined(SKEIN_4WAY)
static __thread skein512_4way_context skein512_4way_ctx
__attribute__ ((aligned (64)));
void skein2hash_4way( void *output, const void *input )
{
skein512_4way_context ctx;
memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) );
uint64_t hash[16*4] __attribute__ ((aligned (64)));
skein512_4way_final16( &ctx, hash, input + (64*4) );
skein512_4way_full( &ctx, output, hash, 64 );
}
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -128,4 +102,53 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SKEIN_2WAY)
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
skein512_2x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u64_t two = v128_64( 0x0000000200000000 );
v128_bswap32_intrlv80_2x64( vdata, pdata );
skein512_2x64_prehash64( &ctx, vdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
skein512_2x64_final16( &ctx, hash, vdata + (16*2) );
skein512_2x64_full( &ctx, hash, hash, 64 );
for ( int lane = 0; lane < 2; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, two );
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -1,369 +0,0 @@
#include "Swifftx_sha3.h"
extern "C" {
#include "SWIFFTX.h"
}
#include <math.h>
#include <stdlib.h>
#include <string.h>
// The default salt value.
// This is the expansion of e (Euler's number) - the 19 digits after 2.71:
// 8281828459045235360.
// The above in base 256, from MSB to LSB:
BitSequence SWIF_saltValueChar[SWIF_HAIFA_SALT_SIZE] = {114, 238, 247, 26, 192, 28, 170, 160};
// All the IVs here below were produced from the decimal digits of e's expansion.
// The code can be found in 'ProduceRandomIV.c'.
// The initial value for 224 digest size.
const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{37, 242, 132, 2, 167, 81, 158, 237, 113, 77, 162, 60, 65, 236, 108, 246,
101, 72, 190, 109, 58, 205, 99, 6, 114, 169, 104, 114, 38, 146, 121, 142,
59, 98, 233, 84, 72, 227, 22, 199, 17, 102, 198, 145, 24, 178, 37, 1,
215, 245, 66, 120, 230, 193, 113, 253, 165, 218, 66, 134, 49, 231, 124, 204,
0};
// The initial value for 256 digest size.
const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{250, 50, 42, 40, 14, 233, 53, 48, 227, 42, 237, 187, 211, 120, 209, 234,
27, 144, 4, 61, 243, 244, 29, 247, 37, 162, 70, 11, 231, 196, 53, 6,
193, 240, 94, 126, 204, 132, 104, 46, 114, 29, 3, 104, 118, 184, 201, 3,
57, 77, 91, 101, 31, 155, 84, 199, 228, 39, 198, 42, 248, 198, 201, 178,
8};
// The initial value for 384 digest size.
const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{40, 145, 193, 100, 205, 171, 47, 76, 254, 10, 196, 41, 165, 207, 200, 79,
109, 13, 75, 201, 17, 172, 64, 162, 217, 22, 88, 39, 51, 30, 220, 151,
133, 73, 216, 233, 184, 203, 77, 0, 248, 13, 28, 199, 30, 147, 232, 242,
227, 124, 169, 174, 14, 45, 27, 87, 254, 73, 68, 136, 135, 159, 83, 152,
0};
// The initial value for 512 digest size.
const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE] =
{195, 126, 197, 167, 157, 114, 99, 126, 208, 105, 200, 90, 71, 195, 144, 138,
142, 122, 123, 116, 24, 214, 168, 173, 203, 183, 194, 210, 102, 117, 138, 42,
114, 118, 132, 33, 35, 149, 143, 163, 163, 183, 243, 175, 72, 22, 201, 255,
102, 243, 22, 187, 211, 167, 239, 76, 164, 70, 80, 182, 181, 212, 9, 185,
0};
///////////////////////////////////////////////////////////////////////////////////////////////
// NIST API implementation portion.
///////////////////////////////////////////////////////////////////////////////////////////////
int Swifftx::Init(int hashbitlen)
{
switch(hashbitlen)
{
case 224:
swifftxState.hashbitlen = hashbitlen;
// Initializes h_0 in HAIFA:
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_224, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 256:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_256, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 384:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_384, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
case 512:
swifftxState.hashbitlen = hashbitlen;
memcpy(swifftxState.currOutputBlock, SWIF_HAIFA_IV_512, SWIFFTX_OUTPUT_BLOCK_SIZE);
break;
default:
return BAD_HASHBITLEN;
}
swifftxState.wasUpdated = false;
swifftxState.remainingSize = 0;
memset(swifftxState.remaining, 0, SWIF_HAIFA_INPUT_BLOCK_SIZE);
memset(swifftxState.numOfBitsChar, 0, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// Initialize the salt with the default value.
memcpy(swifftxState.salt, SWIF_saltValueChar, SWIF_HAIFA_SALT_SIZE);
InitializeSWIFFTX();
return SUCCESS;
}
int Swifftx::Update(const BitSequence *data, DataLength databitlen)
{
// The size of input in bytes after putting the remaining data from previous invocation.
int sizeOfInputAfterRemaining = 0;
// The input block to compression function of SWIFFTX:
BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
// Whether we handled a single block.
bool wasSingleBlockHandled = false;
swifftxState.wasUpdated = true;
// Handle an empty message as required by NIST. Since 'Final()' is oblivious to the input
// (but of course uses the output of the compression function from the previous round,
// which is called h_{i-1} in the HAIFA article), we have to do nothing here.
if (databitlen == 0)
return SUCCESS;
// If we had before an input with unaligned length, return an error
if (swifftxState.remainingSize % 8)
{
return INPUT_DATA_NOT_ALIGNED;
}
// Convert remaining size to bytes.
swifftxState.remainingSize /= 8;
// As long as we have enough data combined from (remaining + data) to fill input block
// SETTING OF ROUNDS
while (((databitlen / 8) + swifftxState.remainingSize) >= SWIF_HAIFA_INPUT_BLOCK_SIZE)
{
// Fill the input block with data:
// 1. The output of the previous block:
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2. The input part of the block:
// 2a. The remaining data from the previous 'Update()' call:
if (swifftxState.remainingSize)
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining,
swifftxState.remainingSize);
// 2b. The input data that we have place for after the 'remaining':
sizeOfInputAfterRemaining = SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE
- ((int) swifftxState.remainingSize) - SWIF_HAIFA_NUM_OF_BITS_SIZE
- SWIF_HAIFA_SALT_SIZE;
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize,
data, sizeOfInputAfterRemaining);
// 3. The #bits part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize
+ sizeOfInputAfterRemaining,
swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + swifftxState.remainingSize
+ sizeOfInputAfterRemaining + SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt, SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, false);
// Update the #bits field with SWIF_HAIFA_INPUT_BLOCK_SIZE.
AddToCurrInBase256(swifftxState.numOfBitsChar, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8);
wasSingleBlockHandled = true;
data += sizeOfInputAfterRemaining;
databitlen -= (sizeOfInputAfterRemaining * 8);
swifftxState.remainingSize = 0;
}
// Update the swifftxState.remaining and swifftxState.remainingSize.
// remainingSize will be in bits after exiting 'Update()'.
if (wasSingleBlockHandled)
{
swifftxState.remainingSize = (unsigned int) databitlen; // now remaining size is in bits.
if (swifftxState.remainingSize)
memcpy(swifftxState.remaining, data, (swifftxState.remainingSize + 7) / 8);
}
else
{
memcpy(swifftxState.remaining + swifftxState.remainingSize, data,
(size_t) (databitlen + 7) / 8);
swifftxState.remainingSize = (swifftxState.remainingSize * 8) + (unsigned short) databitlen;
}
return SUCCESS;
}
int Swifftx::Final(BitSequence *hashval)
{
int i;
// Whether to add one last block. True if the padding appended to the last block overflows
// the block size.
bool toAddFinalBlock = false;
bool toPutOneInFinalBlock = false;
unsigned short oneShift = 0;
// The size of the last input block before the zeroes padding. We add 1 here because we
// include the final '1' bit in the calculation and 7 as we round the length to bytes.
unsigned short sizeOfLastInputBlock = (swifftxState.remainingSize + 1 + 7) / 8;
// The number of bytes of zero in the padding part.
// The padding contains:
// 1. A single 1 bit.
// 2. As many zeroes as needed.
// 3. The message length in bits. Occupies SWIF_HAIFA_NUM_OF_BITS_SIZE bytes.
// 4. The digest size. Maximum is 512, so we need 2 bytes.
// If the total number achieved is negative, add an additional block, as HAIFA specifies.
short numOfZeroBytesInPadding = (short) SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE
- sizeOfLastInputBlock - (2 * SWIF_HAIFA_NUM_OF_BITS_SIZE) - 2
- SWIF_HAIFA_SALT_SIZE;
// The input block to compression function of SWIFFTX:
BitSequence currInputBlock[SWIFFTX_INPUT_BLOCK_SIZE] = {0};
// The message length in base 256.
BitSequence messageLengthChar[SWIF_HAIFA_NUM_OF_BITS_SIZE] = {0};
// The digest size used for padding:
unsigned char digestSizeLSB = swifftxState.hashbitlen % 256;
unsigned char digestSizeMSB = (swifftxState.hashbitlen - digestSizeLSB) / 256;
if (numOfZeroBytesInPadding < 1)
toAddFinalBlock = true;
// Fill the input block with data:
// 1. The output of the previous block:
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2a. The input part of the block, which is the remaining data from the previous 'Update()'
// call (if any), plus an extra '1' bit (maybe all we have is this extra 1):
// Add the last 1 in big-endian convention ...
if (swifftxState.remainingSize % 8 == 0)
{
swifftxState.remaining[sizeOfLastInputBlock - 1] = 0x80;
}
else
{
swifftxState.remaining[sizeOfLastInputBlock - 1] |= (1 << (7 - (swifftxState.remainingSize % 8)));
}
if (sizeOfLastInputBlock)
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE, swifftxState.remaining,
sizeOfLastInputBlock);
// Compute the message length in base 256:
for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
messageLengthChar[i] = swifftxState.numOfBitsChar[i];
if (sizeOfLastInputBlock)
AddToCurrInBase256(messageLengthChar, sizeOfLastInputBlock * 8);
if (!toAddFinalBlock)
{
// 2b. Put the zeroes:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
0, numOfZeroBytesInPadding);
// 2c. Pad the message length:
for (i = 0; i < SWIF_HAIFA_NUM_OF_BITS_SIZE; ++i)
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + i] = messageLengthChar[i];
// 2d. Pad the digest size:
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE] = digestSizeMSB;
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock
+ numOfZeroBytesInPadding + SWIF_HAIFA_NUM_OF_BITS_SIZE + 1] = digestSizeLSB;
}
else
{
// 2b. Put the zeroes, if at all:
if ((SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock) > 0)
{
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + sizeOfLastInputBlock,
0, SWIF_HAIFA_INPUT_BLOCK_SIZE - sizeOfLastInputBlock);
}
}
// 3. The #bits part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE,
swifftxState.numOfBitsChar, SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
+ SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt,
SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, !toAddFinalBlock);
// If we have to add one more block, it is now:
if (toAddFinalBlock)
{
// 1. The previous output block, as usual.
memcpy(currInputBlock, swifftxState.currOutputBlock, SWIFFTX_OUTPUT_BLOCK_SIZE);
// 2a. Instead of the input, zeroes:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE , 0,
SWIF_HAIFA_INPUT_BLOCK_SIZE - SWIF_HAIFA_NUM_OF_BITS_SIZE - 2);
// 2b. Instead of the input, the message length:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
- SWIF_HAIFA_NUM_OF_BITS_SIZE - 2,
messageLengthChar,
SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 2c. Instead of the input, the digest size:
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 2] = digestSizeMSB;
currInputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE - 1] = digestSizeLSB;
// 3. The #bits part of the block, which is zero in case of additional block:
memset(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE,
0,
SWIF_HAIFA_NUM_OF_BITS_SIZE);
// 4. The salt part of the block:
memcpy(currInputBlock + SWIFFTX_OUTPUT_BLOCK_SIZE + SWIF_HAIFA_INPUT_BLOCK_SIZE
+ SWIF_HAIFA_NUM_OF_BITS_SIZE,
swifftxState.salt,
SWIF_HAIFA_SALT_SIZE);
ComputeSingleSWIFFTX(currInputBlock, swifftxState.currOutputBlock, true);
}
// Finally, copy the result into 'hashval'. In case the digest size is not 512bit, copy the
// first hashbitlen of them:
for (i = 0; i < (swifftxState.hashbitlen / 8); ++i)
hashval[i] = swifftxState.currOutputBlock[i];
return SUCCESS;
}
int Swifftx::Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
BitSequence *hashval)
{
int result;
//hashState state;
// The pointer to the current place in the input we take into the compression function.
DataLength currInputIndex = 0;
result = Swifftx::Init(hashbitlen);
if (result != SUCCESS)
return result;
for ( ; (databitlen / 8) > SWIF_HAIFA_INPUT_BLOCK_SIZE;
currInputIndex += SWIF_HAIFA_INPUT_BLOCK_SIZE, databitlen -= (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8))
{
result = Swifftx::Update(data + currInputIndex, SWIF_HAIFA_INPUT_BLOCK_SIZE * 8);
if (result != SUCCESS)
return result;
}
// The length of the last block may be shorter than (SWIF_HAIFA_INPUT_BLOCK_SIZE * 8)
result = Swifftx::Update(data + currInputIndex, databitlen);
if (result != SUCCESS)
{
return result;
}
return Swifftx::Final(hashval);
}
///////////////////////////////////////////////////////////////////////////////////////////////
// Helper function implementation portion.
///////////////////////////////////////////////////////////////////////////////////////////////
void Swifftx::AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE],
unsigned short toAdd)
{
unsigned char remainder = 0;
short i;
BitSequence currValueInBase256[8] = {0};
unsigned short currIndex = 7;
unsigned short temp = 0;
do
{
remainder = toAdd % 256;
currValueInBase256[currIndex--] = remainder;
toAdd -= remainder;
toAdd /= 256;
}
while(toAdd != 0);
for (i = 7; i >= 0; --i)
{
temp = value[i] + currValueInBase256[i];
if (temp > 255)
{
value[i] = temp % 256;
currValueInBase256[i - 1]++;
}
else
value[i] = (unsigned char) temp;
}
}
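AddToCurrInBase256() above maintains HAIFA's "#bits hashed so far" field as an 8-byte big-endian (base-256) counter. A stand-alone equivalent with a quick self-check, written only as an illustration of the same operation:
#include <assert.h>
#include <stdint.h>
/* Add a small integer into an 8-byte big-endian counter (sketch). */
static void be64_add( unsigned char ctr[8], unsigned int add )
{
    uint64_t v = 0;
    for ( int i = 0; i < 8; i++ )  v = ( v << 8 ) | ctr[i];          /* decode */
    v += add;
    for ( int i = 7; i >= 0; i-- ) { ctr[i] = v & 0xff; v >>= 8; }   /* encode */
}
int main(void)
{
    unsigned char c[8] = {0};
    be64_add( c, 300 );                    /* 300 == 0x012C */
    assert( c[6] == 0x01 && c[7] == 0x2C );
    return 0;
}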

View File

@@ -1,79 +0,0 @@
#ifndef SWIFFTX_SHA3_H
#define SWIFFTX_SHA3_H
#include "sha3_interface.h"
#include "stdbool.h"
#include "stdint.h"
class Swifftx : public SHA3 {
#define SWIFFTX_INPUT_BLOCK_SIZE 256
#define SWIFFTX_OUTPUT_BLOCK_SIZE 65
#define SWIF_HAIFA_SALT_SIZE 8
#define SWIF_HAIFA_NUM_OF_BITS_SIZE 8
#define SWIF_HAIFA_INPUT_BLOCK_SIZE (SWIFFTX_INPUT_BLOCK_SIZE - SWIFFTX_OUTPUT_BLOCK_SIZE \
- SWIF_HAIFA_NUM_OF_BITS_SIZE - SWIF_HAIFA_SALT_SIZE)
typedef unsigned char BitSequence;
//const DataLength SWIF_SALT_VALUE;
#define SWIF_HAIFA_IV 0
/*const BitSequence SWIF_HAIFA_IV_224[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_256[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_384[SWIFFTX_OUTPUT_BLOCK_SIZE];
const BitSequence SWIF_HAIFA_IV_512[SWIFFTX_OUTPUT_BLOCK_SIZE];*/
typedef enum
{
SUCCESS = 0,
FAIL = 1,
BAD_HASHBITLEN = 2,
BAD_SALT_SIZE = 3,
SET_SALT_VALUE_FAILED = 4,
INPUT_DATA_NOT_ALIGNED = 5
} HashReturn;
typedef struct hashState {
unsigned short hashbitlen;
// The data remaining after the most recent call to 'Update()'.
BitSequence remaining[SWIF_HAIFA_INPUT_BLOCK_SIZE + 1];
// The size of the remaining data in bits.
// It is 0 when there is no remaining data at all.
unsigned int remainingSize;
// The current output of the compression function. At the end it contains the final digest
// (which may need to be truncated, depending on hashbitlen).
BitSequence currOutputBlock[SWIFFTX_OUTPUT_BLOCK_SIZE];
// The value of '#bits hashed so far' field in HAIFA, in base 256.
BitSequence numOfBitsChar[SWIF_HAIFA_NUM_OF_BITS_SIZE];
// The salt value currently in use:
BitSequence salt[SWIF_HAIFA_SALT_SIZE];
// Indicates whether a single 'Update()' occurred.
// After a call to 'Update()' the key and the salt values cannot be changed.
bool wasUpdated;
} hashState;
private:
int swifftxNumRounds;
hashState swifftxState;
public:
int Init(int hashbitlen);
int Update(const BitSequence *data, DataLength databitlen);
int Final(BitSequence *hashval);
int Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
BitSequence *hashval);
private:
static void AddToCurrInBase256(BitSequence value[SWIF_HAIFA_NUM_OF_BITS_SIZE], unsigned short toAdd);
};
#endif

View File

@@ -1,21 +0,0 @@
#pragma once
#include <cstdint>
namespace hash {
using BitSequence = unsigned char;
using DataLength = unsigned long long;
struct hash_interface {
virtual ~hash_interface() = default;
virtual int Init(int hash_bitsize) = 0;
virtual int Update(const BitSequence *data, DataLength data_bitsize) = 0;
virtual int Final(BitSequence *hash) = 0;
virtual int
Hash(int hash_bitsize, const BitSequence *data, DataLength data_bitsize, BitSequence *hash) = 0;
};
} // namespace hash

View File

@@ -1,14 +0,0 @@
#pragma once
#include <cstdint>
//#include <streams/hash/hash_interface.h>
#include "hash_interface.h"
namespace sha3 {
using BitSequence = hash::BitSequence;
using DataLength = hash::DataLength;
struct sha3_interface : hash::hash_interface {};
} // namespace sha3

View File

@@ -191,7 +191,7 @@ static void rotate_indexes( uint32_t *p )
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
*(__m256i*)hash, *(__m256i*)blob_off ), k );
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
#define MULXOR \
casti_v128( hash, 0 ) = v128_mul32( v128_xor( \
@@ -251,7 +251,7 @@ void verthash_hash( const void *blob_bytes, const size_t blob_size,
/ VH_BYTE_ALIGNMENT ) + 1;
#if defined (__AVX2__)
const __m256i k = _mm256_set1_epi32( 0x1000193 );
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
const v128u32_t k = v128_32( 0x1000193 );
#endif
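
Editor's note: the AVX2, SSE4.1 and (now enabled) NEON paths above all perform the same per-lane step: xor the hash state with a slice of the verthash data file and multiply by the constant 0x1000193 (the 32-bit FNV prime). A scalar sketch of that step, assuming an 8-word (32-byte) hash state as implied by the single __m256i store; illustrative only, not the file's code:

#include <stdint.h>
#include <stdio.h>

/* One MULXOR step over an 8 x 32-bit hash state, scalar form. */
static void mulxor_scalar(uint32_t hash[8], const uint32_t blob_off[8])
{
    const uint32_t k = 0x1000193;            /* same constant as the SIMD paths */
    for (int i = 0; i < 8; ++i)
        hash[i] = (hash[i] ^ blob_off[i]) * k;
}

int main(void)
{
    uint32_t h[8] = {0}, d[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    mulxor_scalar(h, d);
    printf("%08x\n", h[0]);                  /* 01000193: (0 ^ 1) * 0x1000193 */
    return 0;
}
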

View File

@@ -129,7 +129,7 @@ bool register_verthash_algo( algo_gate_t* gate )
{
opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT | NEON_OPT;
const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file;

View File

@@ -506,4 +506,156 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined (X11GOST_2WAY)
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
union _x11gost_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
jh512_2x64_context jh;
keccak512_2x64_context keccak;
skein512_2x64_context skein;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_gost512_context gost;
};
typedef union _x11gost_context_overlay x11gost_context_overlay;
int x11gost_2x64_hash( void *state, const void *input, int thr_id )
{
uint8_t vhash[80*2] __attribute__((aligned(64)));
uint8_t hash0[64] __attribute__((aligned(64)));
uint8_t hash1[64] __attribute__((aligned(64)));
x11gost_context_overlay ctx;
intrlv_2x64( vhash, input, input+80, 640 );
blake512_2x64_full( &ctx.blake, vhash, vhash, 80 );
bmw512_2x64_init( &ctx.bmw );
bmw512_2x64_update( &ctx.bmw, vhash, 64 );
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, hash0, 512 );
groestl512_full( &ctx.groestl, hash1, hash1, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash0, 64 );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash1, 64 );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
intrlv_2x64( vhash, hash0, hash1, 512 );
skein512_2x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_2x64_ctx( &ctx.jh, vhash, vhash, 64 );
keccak512_2x64_ctx( &ctx.keccak, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
cubehash_full( &ctx.cube, hash0, 512, hash0, 64 );
cubehash_full( &ctx.cube, hash1, 512, hash1, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
echo_full( &ctx.echo, hash1, 512, hash1, 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash0, 64 );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash1, 64 );
sph_echo512_close( &ctx.echo, hash1 );
#endif
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
return 1;
}
int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*2] __attribute__((aligned(64)));
uint32_t edata[20*2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_80( edata, pdata );
memcpy( edata+20, edata, 80 );
do
{
edata[19] = n;
edata[39] = n+1;
if ( likely( x11gost_2x64_hash( hash, edata, thr_id ) ) )
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash, mythr );
}
if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+1 );
submit_solution( work, hash+8, mythr );
}
}
n += 2;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}
#endif
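
Editor's note: the 2x64 code above relies on intrlv_2x64()/dintrlv_2x64() (defined elsewhere in the tree) to pack two independent inputs into one buffer so both lanes move through blake/bmw/skein/jh/keccak together. A minimal sketch of that packing and its inverse, under the assumption that 64-bit words simply alternate lane 0, lane 1; these are hypothetical helpers, not the project's implementations:

#include <stdint.h>
#include <string.h>
#include <assert.h>

/* Interleave two equal-length buffers as alternating 64-bit words. */
static void interleave_2x64(void *dst, const void *a, const void *b, size_t bits)
{
    const uint64_t *sa = a, *sb = b;
    uint64_t *d = dst;
    for (size_t i = 0; i < bits / 64; ++i)
    {
        d[2*i]     = sa[i];   /* lane 0 */
        d[2*i + 1] = sb[i];   /* lane 1 */
    }
}

static void deinterleave_2x64(void *a, void *b, const void *src, size_t bits)
{
    const uint64_t *s = src;
    uint64_t *da = a, *db = b;
    for (size_t i = 0; i < bits / 64; ++i)
    {
        da[i] = s[2*i];
        db[i] = s[2*i + 1];
    }
}

int main(void)
{
    uint64_t lane0[8], lane1[8], packed[16], out0[8], out1[8];
    for (int i = 0; i < 8; ++i) { lane0[i] = i; lane1[i] = 100 + i; }
    interleave_2x64(packed, lane0, lane1, 512);
    deinterleave_2x64(out0, out1, packed, 512);
    assert(memcmp(out0, lane0, sizeof lane0) == 0);   /* round-trips cleanly */
    assert(memcmp(out1, lane1, sizeof lane1) == 0);
    return 0;
}
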

View File

@@ -2,20 +2,24 @@
bool register_x11gost_algo( algo_gate_t* gate )
{
#if defined (X11GOST_8WAY)
#if defined(X11GOST_8WAY)
init_x11gost_8way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_8way;
gate->hash = (void*)&x11gost_8way_hash;
#elif defined (X11GOST_4WAY)
#elif defined(X11GOST_4WAY)
init_x11gost_4way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_4way;
gate->hash = (void*)&x11gost_4way_hash;
#elif defined(X11GOST_2WAY)
gate->scanhash = (void*)&scanhash_x11gost_2x64;
gate->hash = (void*)&x11gost_2x64_hash;
#else
init_x11gost_ctx();
gate->scanhash = (void*)&scanhash_x11gost;
gate->hash = (void*)&x11gost_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -8,6 +8,8 @@
#define X11GOST_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X11GOST_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X11GOST_2WAY 1
#endif
bool register_x11gost_algo( algo_gate_t* gate );
@@ -26,6 +28,12 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x11gost_4way_ctx();
#elif defined(X11GOST_2WAY)
int x11gost_2x64_hash( void *state, const void *input, int thr_id );
int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void x11gost_hash( void *state, const void *input );

Some files were not shown because too many files have changed in this diff.