From c65b0ff7a6d982b5faca465e01f27ac4af4901e2 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Sat, 21 Dec 2019 13:19:29 -0500 Subject: [PATCH] v3.10.5 --- INSTALL_LINUX | 71 +- Makefile.am | 2 + RELEASE_NOTES | 18 +- algo/argon2/argon2d/blake2/blamka-round-opt.h | 8 +- algo/blake/blake-hash-4way.h | 17 +- algo/blake/blake256-hash-4way.c | 14 +- algo/bmw/bmw256-hash-4way.c | 52 + algo/cubehash/cube-hash-2way.c | 32 +- algo/cubehash/cubehash_sse2.c | 8 +- algo/hamsi/hamsi-hash-4way.c | 389 ++++- algo/hamsi/hamsi-hash-4way.h | 25 +- algo/haval/haval-8way-helper.c | 115 ++ algo/haval/haval-hash-4way.c | 297 +++- algo/haval/haval-hash-4way.h | 24 +- algo/lyra2/lyra2-gate.c | 28 +- algo/lyra2/lyra2-gate.h | 36 +- algo/lyra2/lyra2-hash-2way.c | 482 +++--- algo/lyra2/lyra2.c | 1 - algo/lyra2/lyra2.h | 11 + algo/lyra2/lyra2rev2-4way.c | 143 +- algo/lyra2/lyra2rev3-4way.c | 174 ++- algo/lyra2/sponge-2way.c | 178 ++- algo/lyra2/sponge.c | 56 +- algo/lyra2/sponge.h | 54 +- algo/quark/hmq1725-4way.c | 1081 +++++++++++-- algo/quark/hmq1725-gate.c | 7 +- algo/quark/hmq1725-gate.h | 14 +- algo/quark/hmq1725.c | 6 + algo/quark/quark-4way.c | 14 - algo/qubit/qubit-2way.c | 1 - algo/sha/sha-hash-4way.h | 40 +- algo/sha/sha512-hash-4way.c | 234 ++- algo/shabal/shabal-hash-4way.c | 646 +++++++- algo/shabal/shabal-hash-4way.h | 35 +- algo/shavite/shavite-hash-2way.c | 58 +- algo/x11/c11-4way.c | 23 +- algo/x11/x11-4way.c | 22 +- algo/x12/x12-4way.c | 282 +++- algo/x12/x12-gate.c | 8 +- algo/x12/x12-gate.h | 23 +- algo/x12/x12.c | 146 +- algo/x13/x13-4way.c | 261 +++- algo/x13/x13-gate.c | 8 +- algo/x13/x13-gate.h | 22 +- algo/x14/x14-4way.c | 329 +++- algo/x14/x14-gate.c | 8 +- algo/x14/x14-gate.h | 18 +- algo/x15/x15-4way.c | 366 ++++- algo/x15/x15-gate.c | 8 +- algo/x15/x15-gate.h | 20 +- algo/x16/x16r-4way.c | 423 +++++- algo/x16/x16r-gate.c | 37 +- algo/x16/x16r-gate.h | 47 +- algo/x16/x16rt-4way.c | 389 ++++- algo/x16/x16rv2-4way.c | 475 +++++- algo/x17/sonoa-4way.c | 1335 ++++++++++++++++- algo/x17/sonoa-gate.c | 8 +- algo/x17/sonoa-gate.h | 22 +- algo/x17/x17-4way.c | 307 +++- algo/x17/x17-gate.c | 7 +- algo/x17/x17-gate.h | 13 +- algo/x17/xevan-4way.c | 513 ++++++- algo/x17/xevan-gate.c | 6 +- algo/x17/xevan-gate.h | 17 +- build-allarch.sh | 2 + configure | 20 +- configure.ac | 2 +- cpu-miner.c | 50 +- simd-utils/intrlv.h | 452 +++++- simd-utils/simd-128.h | 90 +- simd-utils/simd-256.h | 116 +- simd-utils/simd-512.h | 180 +-- 72 files changed, 9090 insertions(+), 1336 deletions(-) create mode 100644 algo/haval/haval-8way-helper.c diff --git a/INSTALL_LINUX b/INSTALL_LINUX index e2a0953..a88f888 100644 --- a/INSTALL_LINUX +++ b/INSTALL_LINUX @@ -1,12 +1,14 @@ -Requirements: +1. Requirements: +--------------- Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not supported. 64 bit Linux operating system. Apple is not supported. -Building on linux prerequisites: +2. Building on linux prerequisites: +----------------------------------- It is assumed users know how to install packages on their system and be able to compile standard source packages. This is basic Linux and @@ -20,41 +22,74 @@ http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu Install any additional dependencies needed by cpuminer-opt. The list below are some of the ones that may not be in the default install and need to -be installed manually. There may be others, read the error messages they -will give a clue as to the missing package. +be installed manually. 
There may be others; read the compiler error messages,
+they will give a clue as to the missing package.
 
 The following command should install everything you need on Debian based
 distributions such as Ubuntu. Fedora and other distributions may have similar
-but different package names.
+but different package names. 
 
-sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev
+$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
 
 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
-openssl 1.1.0e or higher. Add one of the following, depending on the
-compiler version, to CFLAGS:
-"-march=native" or "-march=znver1" or "-msha".
+openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
+support depending on your CPU and compiler version:
+
+"-march=native" is always the best choice.
+
+"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
+
+"-msha" to add SHA to other tuning options.
 
 Additional instructions for static compilalation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
 Static builds should only considered in a homogeneous HW and SW environment.
 Local builds will always have the best performance and compatibility.
 
-Extract cpuminer source.
+3. Download cpuminer-opt
+------------------------
 
-tar xvzf cpuminer-opt-x.y.z.tar.gz
-cd cpuminer-opt-x.y.z
+Download the source code for the latest release from the official repository.
 
-Run ./build.sh to build on Linux or execute the following commands.
+https://github.com/JayDDee/cpuminer-opt/releases
 
-./autogen.sh
-CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
-make
+Extract the source code.
 
-Start mining.
+$ tar xvzf cpuminer-opt-x.y.z.tar.gz
+
+
+Alternatively it can be cloned from git.
+
+$ git clone https://github.com/JayDDee/cpuminer-opt.git
+
+4. Build cpuminer-opt
+---------------------
+
+It is recommended to build with default options, as this will usually
+produce the best results.
+
+$ ./build.sh
+
+or
+
+$ ./autogen.sh
+$ CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
+$ make -j n
+
+where n is the number of threads.
+
+5. Start mining.
+----------------
+
+$ ./cpuminer -a algo -o url -u username -p password
 
-./cpuminer -a algo -o url -u username -p password
 Windows
+-------
+
+See also INSTALL_WINDOWS.
+
+The following procedure is obsolete and uses an old compiler.
 
 Precompiled Windows binaries are built on a Linux host using Mingw
 with a more recent compiler than the following Windows hosted procedure.
diff --git a/Makefile.am b/Makefile.am
index a2ba0fc..ee8990d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -124,6 +124,8 @@ cpuminer_SOURCES = \
   algo/luffa/luffa-hash-2way.c \
   algo/lyra2/lyra2.c \
   algo/lyra2/sponge.c \
+  algo/lyra2/sponge-2way.c \
+  algo/lyra2/lyra2-hash-2way.c \
   algo/lyra2/lyra2-gate.c \
   algo/lyra2/lyra2rev2.c \
   algo/lyra2/lyra2rev2-4way.c \
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index e3c857b..8caedc5 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,6 +1,8 @@
 cpuminer-opt is a console program run from the command line using the
 keyboard, not the mouse.
 
+See also README.md for a list of supported algorithms.
+
 Security warning
 ----------------
 
@@ -31,7 +33,21 @@ not supported. FreeBSD YMMV.
 Change Log
 ----------
 
-v3.10.2
+v3.10.5
+
+AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2.
+Faster hmq1725 AVX2.
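+
+The new AVX512 code paths listed above are selected at compile time; a
+minimal sketch of the feature guard used throughout the modified sources
+(the comments are illustrative placeholders, not new code in this patch):
+
+  #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+     // 16-way / 8-way 512-bit implementation
+  #elif defined(__AVX2__)
+     // 8-way / 4-way 256-bit implementation
+  #else
+     // SSE2 or scalar fallback
+  #endif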
+ +v3.10.4 + +AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil). + +v3.10.3 + +AVX512 for x12, x13, x14, x15. +Fixed x12 AVX2 invalid shares. + +v.10.2 AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib). Fixed c11 AVX2 invalid shares. diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h index 37d99d8..8156331 100644 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h @@ -184,10 +184,10 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { #include -#define rotr32 mm256_swap32_64 -#define rotr24 mm256_ror3x8_64 -#define rotr16 mm256_ror1x16_64 -#define rotr63( x ) mm256_rol_64( x, 1 ) +#define rotr32( x ) mm256_ror_64( x, 32 ) +#define rotr24( x ) mm256_ror_64( x, 24 ) +#define rotr16( x ) mm256_ror_64( x, 16 ) +#define rotr63( x ) mm256_rol_64( x, 1 ) //#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)) //#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index 2cf9a47..9f389f6 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -70,19 +70,22 @@ typedef struct { // Default 14 rounds typedef blake_4way_small_context blake256_4way_context; void blake256_4way_init(void *ctx); -void blake256_4way(void *ctx, const void *data, size_t len); +void blake256_4way_update(void *ctx, const void *data, size_t len); +#define blake256_4way blake256_4way_update void blake256_4way_close(void *ctx, void *dst); // 14 rounds, blake, decred typedef blake_4way_small_context blake256r14_4way_context; void blake256r14_4way_init(void *cc); -void blake256r14_4way(void *cc, const void *data, size_t len); +void blake256r14_4way_update(void *cc, const void *data, size_t len); +#define blake256r14_4way blake256r14_4way_update void blake256r14_4way_close(void *cc, void *dst); // 8 rounds, blakecoin, vanilla typedef blake_4way_small_context blake256r8_4way_context; void blake256r8_4way_init(void *cc); -void blake256r8_4way(void *cc, const void *data, size_t len); +void blake256r8_4way_update(void *cc, const void *data, size_t len); +#define blake256r8_4way blake256r8_4way_update void blake256r8_4way_close(void *cc, void *dst); #ifdef __AVX2__ @@ -100,19 +103,21 @@ typedef struct { // Default 14 rounds typedef blake_8way_small_context blake256_8way_context; void blake256_8way_init(void *cc); -void blake256_8way(void *cc, const void *data, size_t len); +void blake256_8way_update(void *cc, const void *data, size_t len); +#define blake256_8way blake256_8way_update void blake256_8way_close(void *cc, void *dst); // 14 rounds, blake, decred typedef blake_8way_small_context blake256r14_8way_context; void blake256r14_8way_init(void *cc); -void blake256r14_8way(void *cc, const void *data, size_t len); +void blake256r14_8way_update(void *cc, const void *data, size_t len); void blake256r14_8way_close(void *cc, void *dst); // 8 rounds, blakecoin, vanilla typedef blake_8way_small_context blake256r8_8way_context; void blake256r8_8way_init(void *cc); -void blake256r8_8way(void *cc, const void *data, size_t len); +void blake256r8_8way_update(void *cc, const void *data, size_t len); +#define blake256r8_8way blake256r8_8way_update void blake256r8_8way_close(void *cc, void *dst); // Blake-512 4 way diff --git a/algo/blake/blake256-hash-4way.c 
b/algo/blake/blake256-hash-4way.c index 87592bc..f958659 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -634,7 +634,7 @@ do { \ m256_const1_64( 0x082EFA98082EFA98 ) ); \ VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \ m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \ - shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \ + shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \ 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \ M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \ @@ -1184,7 +1184,7 @@ blake256_16way_update(void *cc, const void *data, size_t len) } void -blake256_16way_close_update(void *cc, void *dst) +blake256_16way_close(void *cc, void *dst) { blake32_16way_close(cc, 0, 0, dst, 8); } @@ -1259,7 +1259,7 @@ blake256_8way_init(void *cc) } void -blake256_8way(void *cc, const void *data, size_t len) +blake256_8way_update(void *cc, const void *data, size_t len) { blake32_8way(cc, data, len); } @@ -1279,7 +1279,7 @@ void blake256r14_4way_init(void *cc) } void -blake256r14_4way(void *cc, const void *data, size_t len) +blake256r14_4way_update(void *cc, const void *data, size_t len) { blake32_4way(cc, data, len); } @@ -1298,7 +1298,7 @@ void blake256r14_8way_init(void *cc) } void -blake256r14_8way(void *cc, const void *data, size_t len) +blake256r14_8way_update(void *cc, const void *data, size_t len) { blake32_8way(cc, data, len); } @@ -1318,7 +1318,7 @@ void blake256r8_4way_init(void *cc) } void -blake256r8_4way(void *cc, const void *data, size_t len) +blake256r8_4way_update(void *cc, const void *data, size_t len) { blake32_4way(cc, data, len); } @@ -1337,7 +1337,7 @@ void blake256r8_8way_init(void *cc) } void -blake256r8_8way(void *cc, const void *data, size_t len) +blake256r8_8way_update(void *cc, const void *data, size_t len) { blake32_8way(cc, data, len); } diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index a5a2e77..92e7183 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -874,6 +874,57 @@ void compress_small_8way( const __m256i *M, const __m256i H[16], mm256_xor4( qt[24], qt[25], qt[26], qt[27] ), mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) ); +#define DH1L( m, sl, sr, a, b, c ) \ + _mm256_add_epi32( \ + _mm256_xor_si256( M[m], \ + _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \ + _mm256_srli_epi32( qt[a], sr ) ) ), \ + _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) ) + +#define DH1R( m, sl, sr, a, b, c ) \ + _mm256_add_epi32( \ + _mm256_xor_si256( M[m], \ + _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \ + _mm256_slli_epi32( qt[a], sr ) ) ), \ + _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) ) + +#define DH2L( m, rl, sl, h, a, b, c ) \ + _mm256_add_epi32( _mm256_add_epi32( \ + mm256_rol_32( dH[h], rl ), \ + _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \ + _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \ + _mm256_xor_si256( qt[b], qt[c] ) ) ); + +#define DH2R( m, rl, sr, h, a, b, c ) \ + _mm256_add_epi32( _mm256_add_epi32( \ + mm256_rol_32( dH[h], rl ), \ + _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \ + _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \ + _mm256_xor_si256( qt[b], qt[c] ) ) ); + + dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ); + dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 ); + dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 ); + dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 ); + dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 ); + dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 ); + dH[ 6] = DH1R( 6, 4, 6, 22, 30, 
6 ); + dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 ); + dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 ); + dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 ); + dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 ); + dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 ); + dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 ); + dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 ); + dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 ); + dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 ); + +#undef DH1L +#undef DH1R +#undef DH2L +#undef DH2R + +/* dH[ 0] = _mm256_add_epi32( _mm256_xor_si256( M[0], _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ), @@ -954,6 +1005,7 @@ void compress_small_8way( const __m256i *M, const __m256i H[16], _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )), _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ), _mm256_xor_si256( qt[22], qt[15] ) ) ); +*/ } static const __m256i final_s8[16] = diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index c9a4012..5a4af53 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -64,10 +64,10 @@ static void transform_4way( cube_4way_context *sp ) x1 = _mm512_xor_si512( x1, x5 ); x2 = _mm512_xor_si512( x2, x6 ); x3 = _mm512_xor_si512( x3, x7 ); - x4 = mm512_swap64_128( x4 ); - x5 = mm512_swap64_128( x5 ); - x6 = mm512_swap64_128( x6 ); - x7 = mm512_swap64_128( x7 ); + x4 = mm512_swap128_64( x4 ); + x5 = mm512_swap128_64( x5 ); + x6 = mm512_swap128_64( x6 ); + x7 = mm512_swap128_64( x7 ); x4 = _mm512_add_epi32( x0, x4 ); x5 = _mm512_add_epi32( x1, x5 ); x6 = _mm512_add_epi32( x2, x6 ); @@ -82,10 +82,10 @@ static void transform_4way( cube_4way_context *sp ) x1 = _mm512_xor_si512( x1, x5 ); x2 = _mm512_xor_si512( x2, x6 ); x3 = _mm512_xor_si512( x3, x7 ); - x4 = mm512_swap32_64( x4 ); - x5 = mm512_swap32_64( x5 ); - x6 = mm512_swap32_64( x6 ); - x7 = mm512_swap32_64( x7 ); + x4 = mm512_swap64_32( x4 ); + x5 = mm512_swap64_32( x5 ); + x6 = mm512_swap64_32( x6 ); + x7 = mm512_swap64_32( x7 ); } _mm512_store_si512( (__m512i*)sp->h, x0 ); @@ -239,10 +239,10 @@ static void transform_2way( cube_2way_context *sp ) x1 = _mm256_xor_si256( x1, x5 ); x2 = _mm256_xor_si256( x2, x6 ); x3 = _mm256_xor_si256( x3, x7 ); - x4 = mm256_swap64_128( x4 ); - x5 = mm256_swap64_128( x5 ); - x6 = mm256_swap64_128( x6 ); - x7 = mm256_swap64_128( x7 ); + x4 = mm256_swap128_64( x4 ); + x5 = mm256_swap128_64( x5 ); + x6 = mm256_swap128_64( x6 ); + x7 = mm256_swap128_64( x7 ); x4 = _mm256_add_epi32( x0, x4 ); x5 = _mm256_add_epi32( x1, x5 ); x6 = _mm256_add_epi32( x2, x6 ); @@ -257,10 +257,10 @@ static void transform_2way( cube_2way_context *sp ) x1 = _mm256_xor_si256( x1, x5 ); x2 = _mm256_xor_si256( x2, x6 ); x3 = _mm256_xor_si256( x3, x7 ); - x4 = mm256_swap32_64( x4 ); - x5 = mm256_swap32_64( x5 ); - x6 = mm256_swap32_64( x6 ); - x7 = mm256_swap32_64( x7 ); + x4 = mm256_swap64_32( x4 ); + x5 = mm256_swap64_32( x5 ); + x6 = mm256_swap64_32( x6 ); + x7 = mm256_swap64_32( x7 ); } _mm256_store_si256( (__m256i*)sp->h, x0 ); diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index 7f6591f..8b9d010 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -39,8 +39,8 @@ static void transform( cubehashParam *sp ) x1 = mm256_rol_32( y0, 7 ); x0 = _mm256_xor_si256( x0, x2 ); x1 = _mm256_xor_si256( x1, x3 ); - x2 = mm256_swap64_128( x2 ); - x3 = mm256_swap64_128( x3 ); + x2 = mm256_swap128_64( x2 ); + x3 = mm256_swap128_64( x3 ); x2 = _mm256_add_epi32( x0, x2 ); x3 = _mm256_add_epi32( x1, x3 ); y0 = mm256_swap_128( x0 ); @@ -49,8 +49,8 @@ static void transform( 
cubehashParam *sp ) x1 = mm256_rol_32( y1, 11 ); x0 = _mm256_xor_si256( x0, x2 ); x1 = _mm256_xor_si256( x1, x3 ); - x2 = mm256_swap32_64( x2 ); - x3 = mm256_swap32_64( x3 ); + x2 = mm256_swap64_32( x2 ); + x3 = mm256_swap64_32( x3 ); } _mm256_store_si256( (__m256i*)sp->x, x0 ); diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 24d8ce8..0a1e6e2 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -528,6 +528,346 @@ static const sph_u32 T512[64][16] = { SPH_C32(0xe7e00a94) } }; +#define s0 m0 +#define s1 c0 +#define s2 m1 +#define s3 c1 +#define s4 c2 +#define s5 m2 +#define s6 c3 +#define s7 m3 +#define s8 m4 +#define s9 c4 +#define sA m5 +#define sB c5 +#define sC c6 +#define sD m6 +#define sE c7 +#define sF m7 + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// Hamsi 8 way + +#define INPUT_BIG8 \ +do { \ + __m512i db = *buf; \ + const uint64_t *tp = (uint64_t*)&T512[0][0]; \ + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \ + for ( int u = 0; u < 64; u++ ) \ + { \ + __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \ + dm = mm512_negate_32( _mm512_or_si512( dm, \ + _mm512_slli_epi64( dm, 32 ) ) ); \ + m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \ + m512_const1_64( tp[0] ) ) ); \ + m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \ + m512_const1_64( tp[1] ) ) ); \ + m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \ + m512_const1_64( tp[2] ) ) ); \ + m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \ + m512_const1_64( tp[3] ) ) ); \ + m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \ + m512_const1_64( tp[4] ) ) ); \ + m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \ + m512_const1_64( tp[5] ) ) ); \ + m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \ + m512_const1_64( tp[6] ) ) ); \ + m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \ + m512_const1_64( tp[7] ) ) ); \ + tp += 8; \ + db = _mm512_srli_epi64( db, 1 ); \ + } \ +} while (0) + +#define SBOX8( a, b, c, d ) \ +do { \ + __m512i t; \ + t = a; \ + a = _mm512_and_si512( a, c ); \ + a = _mm512_xor_si512( a, d ); \ + c = _mm512_xor_si512( c, b ); \ + c = _mm512_xor_si512( c, a ); \ + d = _mm512_or_si512( d, t ); \ + d = _mm512_xor_si512( d, b ); \ + t = _mm512_xor_si512( t, c ); \ + b = d; \ + d = _mm512_or_si512( d, t ); \ + d = _mm512_xor_si512( d, a ); \ + a = _mm512_and_si512( a, b ); \ + t = _mm512_xor_si512( t, a ); \ + b = _mm512_xor_si512( b, d ); \ + b = _mm512_xor_si512( b, t ); \ + a = c; \ + c = b; \ + b = d; \ + d = mm512_not( t ); \ +} while (0) + +#define L8( a, b, c, d ) \ +do { \ + a = mm512_rol_32( a, 13 ); \ + c = mm512_rol_32( c, 3 ); \ + b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \ + d = _mm512_xor_si512( d, _mm512_xor_si512( c, \ + _mm512_slli_epi32( a, 3 ) ) ); \ + b = mm512_rol_32( b, 1 ); \ + d = mm512_rol_32( d, 7 ); \ + a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \ + c = _mm512_xor_si512( c, _mm512_xor_si512( d, \ + _mm512_slli_epi32( b, 7 ) ) ); \ + a = mm512_rol_32( a, 5 ); \ + c = mm512_rol_32( c, 22 ); \ +} while (0) + +#define DECL_STATE_BIG8 \ + __m512i c0, c1, c2, c3, c4, c5, c6, c7; \ + +#define READ_STATE_BIG8(sc) \ +do { \ + c0 = sc->h[0x0]; \ + c1 = sc->h[0x1]; \ + c2 = sc->h[0x2]; \ + c3 = sc->h[0x3]; \ + c4 = sc->h[0x4]; \ + c5 = sc->h[0x5]; \ + c6 = sc->h[0x6]; \ + c7 = sc->h[0x7]; \ +} while (0) + +#define WRITE_STATE_BIG8(sc) \ +do { \ + sc->h[0x0] = c0; \ + sc->h[0x1] = c1; \ + sc->h[0x2] = c2; \ + sc->h[0x3] = c3; \ + sc->h[0x4] = c4; \ + 
sc->h[0x5] = c5; \ + sc->h[0x6] = c6; \ + sc->h[0x7] = c7; \ +} while (0) + + +#define ROUND_BIG8(rc, alpha) \ +do { \ + __m512i t0, t1, t2, t3; \ + s0 = _mm512_xor_si512( s0, m512_const1_64( \ + ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ + s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ + s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ + s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ + s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ + s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ + s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ + s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ + s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ + s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ + sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ + sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ + sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ + sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ + sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ + sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ +\ + SBOX8( s0, s4, s8, sC ); \ + SBOX8( s1, s5, s9, sD ); \ + SBOX8( s2, s6, sA, sE ); \ + SBOX8( s3, s7, sB, sF ); \ +\ + t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \ + _mm512_bslli_epi128( s5, 4 ) ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \ + _mm512_bslli_epi128( sE, 4 ) ); \ + L8( s0, t1, s9, t3 ); \ + s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \ + s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \ + sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \ + sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \ +\ + t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ + _mm512_bslli_epi128( s6, 4 ) ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \ + _mm512_bslli_epi128( sF, 4 ) ); \ + L8( s1, t1, sA, t3 ); \ + s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ + s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \ + sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \ + sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \ +\ + t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \ + _mm512_bslli_epi128( s7, 4 ) ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \ + _mm512_bslli_epi128( sC, 4 ) ); \ + L8( s2, t1, sB, t3 ); \ + s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \ + s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \ + sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \ + sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \ +\ + t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \ + _mm512_bslli_epi128( s4, 4 ) ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \ + _mm512_bslli_epi128( sD, 4 ) ); \ + L8( s3, t1, s8, t3 ); \ + s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \ 
+ s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \ + sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \ + sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \ +\ + t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \ + _mm512_bslli_epi128( sB, 4 ) ); \ + L8( t0, t1, t2, t3 ); \ + s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \ + s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \ + s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \ + s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \ + s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \ + sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \ + s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \ + sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \ +\ + t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \ + _mm512_bslli_epi128( sD, 4 ) ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \ + t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \ + L8( t0, t1, t2, t3 ); \ + s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \ + sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \ + s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \ + sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \ + s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \ + sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \ + s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \ + sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \ +} while (0) + +#define P_BIG8 \ +do { \ + ROUND_BIG8(0, alpha_n); \ + ROUND_BIG8(1, alpha_n); \ + ROUND_BIG8(2, alpha_n); \ + ROUND_BIG8(3, alpha_n); \ + ROUND_BIG8(4, alpha_n); \ + ROUND_BIG8(5, alpha_n); \ +} while (0) + +#define PF_BIG8 \ +do { \ + ROUND_BIG8( 0, alpha_f); \ + ROUND_BIG8( 1, alpha_f); \ + ROUND_BIG8( 2, alpha_f); \ + ROUND_BIG8( 3, alpha_f); \ + ROUND_BIG8( 4, alpha_f); \ + ROUND_BIG8( 5, alpha_f); \ + ROUND_BIG8( 6, alpha_f); \ + ROUND_BIG8( 7, alpha_f); \ + ROUND_BIG8( 8, alpha_f); \ + ROUND_BIG8( 9, alpha_f); \ + ROUND_BIG8(10, alpha_f); \ + ROUND_BIG8(11, alpha_f); \ +} while (0) + +#define T_BIG8 \ +do { /* order is important */ \ + c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \ + c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \ + c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \ + c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \ + c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \ + c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \ + c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \ + c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \ +} while (0) + +void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) +{ + DECL_STATE_BIG8 + uint32_t tmp = num << 6; + + sc->count_low = SPH_T32( sc->count_low + tmp ); + sc->count_high += (sph_u32)( (num >> 13) >> 13 ); + if ( sc->count_low < tmp ) + sc->count_high++; + + READ_STATE_BIG8( sc ); + while ( num-- > 0 ) + { + __m512i m0, m1, m2, m3, m4, m5, m6, m7; + + INPUT_BIG8; + P_BIG8; + T_BIG8; + 
buf++; + } + WRITE_STATE_BIG8( sc ); +} + +void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf ) +{ + __m512i m0, m1, m2, m3, m4, m5, m6, m7; + DECL_STATE_BIG8 + READ_STATE_BIG8( sc ); + INPUT_BIG8; + PF_BIG8; + T_BIG8; + WRITE_STATE_BIG8( sc ); +} + + +void hamsi512_8way_init( hamsi_8way_big_context *sc ) +{ + sc->partial_len = 0; + sc->count_high = sc->count_low = 0; + + sc->h[0] = m512_const1_64( 0x6c70617273746565 ); + sc->h[1] = m512_const1_64( 0x656e62656b204172 ); + sc->h[2] = m512_const1_64( 0x302c206272672031 ); + sc->h[3] = m512_const1_64( 0x3434362c75732032 ); + sc->h[4] = m512_const1_64( 0x3030312020422d33 ); + sc->h[5] = m512_const1_64( 0x656e2d484c657576 ); + sc->h[6] = m512_const1_64( 0x6c65652c65766572 ); + sc->h[7] = m512_const1_64( 0x6769756d2042656c ); +} + +void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data, + size_t len ) +{ + __m512i *vdata = (__m512i*)data; + + hamsi_8way_big( sc, vdata, len>>3 ); + vdata += ( (len& ~(size_t)7) >> 3 ); + len &= (size_t)7; + memcpy_512( sc->buf, vdata, len>>3 ); + sc->partial_len = len; +} + +void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) +{ + __m512i pad[1]; + int ch, cl; + + sph_enc32be( &ch, sc->count_high ); + sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch, + cl, ch, cl, ch, cl, ch, cl, ch ); +// pad[0] = m512_const2_32( cl, ch ); + sc->buf[0] = m512_const1_64( 0x80 ); + hamsi_8way_big( sc, sc->buf, 1 ); + hamsi_8way_big_final( sc, pad ); + + mm512_block_bswap_32( (__m512i*)dst, sc->h ); +} + + +#endif // AVX512 + + +// Hamsi 4 way #define INPUT_BIG \ do { \ @@ -627,6 +967,7 @@ do { \ sc->h[0x7] = c7; \ } while (0) +/* #define s0 m0 #define s1 c0 #define s2 m1 @@ -643,42 +984,28 @@ do { \ #define sD m6 #define sE c7 #define sF m7 +*/ #define ROUND_BIG(rc, alpha) \ do { \ __m256i t0, t1, t2, t3; \ s0 = _mm256_xor_si256( s0, m256_const1_64( \ - ( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \ - s1 = _mm256_xor_si256( s1, m256_const1_64( \ - ( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \ - s2 = _mm256_xor_si256( s2, m256_const1_64( \ - ( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \ - s3 = _mm256_xor_si256( s3, m256_const1_64( \ - ( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \ - s4 = _mm256_xor_si256( s4, m256_const1_64( \ - ( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \ - s5 = _mm256_xor_si256( s5, m256_const1_64( \ - ( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \ - s6 = _mm256_xor_si256( s6, m256_const1_64( \ - ( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \ - s7 = _mm256_xor_si256( s7, m256_const1_64( \ - ( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \ - s8 = _mm256_xor_si256( s8, m256_const1_64( \ - ( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \ - s9 = _mm256_xor_si256( s9, m256_const1_64( \ - ( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \ - sA = _mm256_xor_si256( sA, m256_const1_64( \ - ( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \ - sB = _mm256_xor_si256( sB, m256_const1_64( \ - ( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \ - sC = _mm256_xor_si256( sC, m256_const1_64( \ - ( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \ - sD = _mm256_xor_si256( sD, m256_const1_64( \ - ( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \ - sE = _mm256_xor_si256( sE, m256_const1_64( \ - ( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \ - sF = 
_mm256_xor_si256( sF, m256_const1_64( \ - ( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \ + ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \ + s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \ + s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \ + s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \ + s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \ + s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \ + s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \ + s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \ + s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \ + s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \ + sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \ + sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \ + sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \ + sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \ + sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \ + sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \ \ SBOX( s0, s4, s8, sC ); \ SBOX( s1, s5, s9, sD ); \ diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index f70f3fe..4e57f10 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -60,9 +60,32 @@ typedef struct { typedef hamsi_4way_big_context hamsi512_4way_context; void hamsi512_4way_init( hamsi512_4way_context *sc ); -void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len ); +void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data, + size_t len ); +#define hamsi512_4way hamsi512_4way_update void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst ); +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +typedef struct { + __m512i h[8]; + __m512i buf[1]; + size_t partial_len; + sph_u32 count_high, count_low; +} hamsi_8way_big_context; + +typedef hamsi_8way_big_context hamsi512_8way_context; + +void hamsi512_8way_init( hamsi512_8way_context *sc ); +void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data, + size_t len ); +void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst ); + + + +#endif + + #ifdef __cplusplus } #endif diff --git a/algo/haval/haval-8way-helper.c b/algo/haval/haval-8way-helper.c new file mode 100644 index 0000000..82187f5 --- /dev/null +++ b/algo/haval/haval-8way-helper.c @@ -0,0 +1,115 @@ +/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */ +/* + * Helper code, included (three times !) by HAVAL implementation. + * + * TODO: try to merge this with md_helper.c. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#undef SPH_XCAT +#define SPH_XCAT(a, b) SPH_XCAT_(a, b) +#undef SPH_XCAT_ +#define SPH_XCAT_(a, b) a ## b + +static void +SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_update) +( haval_8way_context *sc, const void *data, size_t len ) +{ + __m256i *vdata = (__m256i*)data; + unsigned current; + + current = (unsigned)sc->count_low & 127U; + while ( len > 0 ) + { + unsigned clen; + uint32_t clow, clow2; + + clen = 128U - current; + if ( clen > len ) + clen = len; + memcpy_256( sc->buf + (current>>2), vdata, clen>>2 ); + vdata += clen>>2; + current += clen; + len -= clen; + if ( current == 128U ) + { + DSTATE_8W; + IN_PREPARE_8W(sc->buf); + RSTATE_8W; + SPH_XCAT(CORE_8W, PASSES)(INW_8W); + WSTATE_8W; + current = 0; + } + clow = sc->count_low; + clow2 = clow + clen; + sc->count_low = clow2; + if ( clow2 < clow ) + sc->count_high ++; + } +} + +static void +SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc, + void *dst) +{ + unsigned current; + DSTATE_8W; + + current = (unsigned)sc->count_low & 127UL; + + sc->buf[ current>>2 ] = m256_one_32; + current += 4; + RSTATE_8W; + if ( current > 116UL ) + { + memset_zero_256( sc->buf + ( current>>2 ), (128UL-current) >> 2 ); + do + { + IN_PREPARE_8W(sc->buf); + SPH_XCAT(CORE_8W, PASSES)(INW_8W); + } while (0); + current = 0; + } + + uint32_t t1, t2; + memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 ); + t1 = 0x01 | (PASSES << 3); + t2 = sc->olen << 3; + sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) ); + sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 ); + sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3) + | (sc->count_low >> 29) ); + do + { + IN_PREPARE_8W(sc->buf); + SPH_XCAT(CORE_8W, PASSES)(INW_8W); + } while (0); + WSTATE_8W; + haval_8way_out( sc, dst ); +} diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index 35cfd17..02df40f 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -40,7 +40,7 @@ #include #include "haval-hash-4way.h" -// won't compile with sse4.2 +// won't compile with sse4.2, not a problem, it's only used with AVX2 4 way. 
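/* Illustration only: a minimal sketch of how the new 8-way HAVAL-256 API
   declared in haval-hash-4way.h is expected to be driven, mirroring the
   existing 4-way usage. The function and buffer names here are hypothetical;
   input data must already be interleaved 8x32-bit. */
static inline void haval256_8way_example( const void *vdata, void *vhash )
{
   haval256_5_8way_context ctx;
   haval256_5_8way_init( &ctx );
   haval256_5_8way_update( &ctx, vdata, 64 );  // 64 bytes per lane
   haval256_5_8way_close( &ctx, vhash );       // 8 interleaved 256-bit digests
}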
//#if defined (__SSE4_2__) #if defined(__AVX__) @@ -518,6 +518,301 @@ do { \ #define INMSG(i) msg[i] +#if defined(__AVX2__) + +// Haval-256 8 way 32 bit avx2 + +#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \ + _mm256_xor_si256( x0, \ + _mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \ + _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \ + _mm256_and_si256( x3, x6 ) ) ) ) \ + +#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \ + _mm256_xor_si256( \ + _mm256_and_si256( x2, \ + _mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \ + _mm256_xor_si256( _mm256_and_si256( x4, x5 ), \ + _mm256_xor_si256( x6, x0 ) ) ) ), \ + _mm256_xor_si256( \ + _mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \ + _mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \ + +#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \ + _mm256_xor_si256( \ + _mm256_and_si256( x3, \ + _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ + _mm256_xor_si256( x6, x0 ) ) ), \ + _mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \ + _mm256_and_si256( x2, x5 ) ), x0 ) ) + +#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \ + _mm256_xor_si256( \ + _mm256_xor_si256( \ + _mm256_and_si256( x3, \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \ + _mm256_or_si256( x4, x6 ) ), x5 ) ), \ + _mm256_and_si256( x4, \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \ + _mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \ + _mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) ) + + +#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \ + _mm256_xor_si256( \ + _mm256_and_si256( x0, \ + mm256_not( _mm256_xor_si256( \ + _mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \ + _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \ + _mm256_and_si256( x2, x5 ) ), \ + _mm256_and_si256( x3, x6 ) ) ) + +#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \ + F1_8W(x1, x0, x3, x5, x6, x2, x4) +#define FP3_2_8W(x6, x5, x4, x3, x2, x1, x0) \ + F2_8W(x4, x2, x1, x0, x5, x3, x6) +#define FP3_3_8W(x6, x5, x4, x3, x2, x1, x0) \ + F3_8W(x6, x1, x2, x3, x4, x5, x0) + +#define FP4_1_8W(x6, x5, x4, x3, x2, x1, x0) \ + F1_8W(x2, x6, x1, x4, x5, x3, x0) +#define FP4_2_8W(x6, x5, x4, x3, x2, x1, x0) \ + F2_8W(x3, x5, x2, x0, x1, x6, x4) +#define FP4_3_8W(x6, x5, x4, x3, x2, x1, x0) \ + F3_8W(x1, x4, x3, x6, x0, x2, x5) +#define FP4_4_8W(x6, x5, x4, x3, x2, x1, x0) \ + F4_8W(x6, x4, x0, x5, x2, x1, x3) + +#define FP5_1_8W(x6, x5, x4, x3, x2, x1, x0) \ + F1_8W(x3, x4, x1, x0, x5, x2, x6) +#define FP5_2_8W(x6, x5, x4, x3, x2, x1, x0) \ + F2_8W(x6, x2, x1, x0, x3, x4, x5) +#define FP5_3_8W(x6, x5, x4, x3, x2, x1, x0) \ + F3_8W(x2, x6, x0, x4, x3, x1, x5) +#define FP5_4_8W(x6, x5, x4, x3, x2, x1, x0) \ + F4_8W(x1, x5, x3, x2, x0, x4, x6) +#define FP5_5_8W(x6, x5, x4, x3, x2, x1, x0) \ + F5_8W(x2, x5, x0, x6, x4, x3, x1) + +#define STEP_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \ +do { \ + __m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \ + x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \ + mm256_ror_32( x7, 11 ) ), \ + _mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \ +} while (0) + +#define PASS1_8W(n, in) do { \ + unsigned pass_count; \ + for (pass_count = 0; pass_count < 32; pass_count += 8) { \ + STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \ + in(pass_count + 0), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \ + in(pass_count + 1), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \ + in(pass_count + 2), SPH_C32(0x00000000)); \ + STEP_8W(n, 
1, s4, s3, s2, s1, s0, s7, s6, s5, \ + in(pass_count + 3), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \ + in(pass_count + 4), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \ + in(pass_count + 5), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \ + in(pass_count + 6), SPH_C32(0x00000000)); \ + STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \ + in(pass_count + 7), SPH_C32(0x00000000)); \ + } \ + } while (0) + +#define PASSG_8W(p, n, in) do { \ + unsigned pass_count; \ + for (pass_count = 0; pass_count < 32; pass_count += 8) { \ + STEP_8W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \ + in(MP ## p[pass_count + 0]), \ + RK ## p[pass_count + 0]); \ + STEP_8W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \ + in(MP ## p[pass_count + 1]), \ + RK ## p[pass_count + 1]); \ + STEP_8W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \ + in(MP ## p[pass_count + 2]), \ + RK ## p[pass_count + 2]); \ + STEP_8W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \ + in(MP ## p[pass_count + 3]), \ + RK ## p[pass_count + 3]); \ + STEP_8W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \ + in(MP ## p[pass_count + 4]), \ + RK ## p[pass_count + 4]); \ + STEP_8W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \ + in(MP ## p[pass_count + 5]), \ + RK ## p[pass_count + 5]); \ + STEP_8W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \ + in(MP ## p[pass_count + 6]), \ + RK ## p[pass_count + 6]); \ + STEP_8W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \ + in(MP ## p[pass_count + 7]), \ + RK ## p[pass_count + 7]); \ + } \ + } while (0) + +#define PASS2_8W(n, in) PASSG_8W(2, n, in) +#define PASS3_8W(n, in) PASSG_8W(3, n, in) +#define PASS4_8W(n, in) PASSG_8W(4, n, in) +#define PASS5_8W(n, in) PASSG_8W(5, n, in) + +#define SAVE_STATE_8W \ + __m256i u0, u1, u2, u3, u4, u5, u6, u7; \ + do { \ + u0 = s0; \ + u1 = s1; \ + u2 = s2; \ + u3 = s3; \ + u4 = s4; \ + u5 = s5; \ + u6 = s6; \ + u7 = s7; \ + } while (0) + +#define UPDATE_STATE_8W \ +do { \ + s0 = _mm256_add_epi32( s0, u0 ); \ + s1 = _mm256_add_epi32( s1, u1 ); \ + s2 = _mm256_add_epi32( s2, u2 ); \ + s3 = _mm256_add_epi32( s3, u3 ); \ + s4 = _mm256_add_epi32( s4, u4 ); \ + s5 = _mm256_add_epi32( s5, u5 ); \ + s6 = _mm256_add_epi32( s6, u6 ); \ + s7 = _mm256_add_epi32( s7, u7 ); \ +} while (0) + +#define CORE_8W5(in) do { \ + SAVE_STATE_8W; \ + PASS1_8W(5, in); \ + PASS2_8W(5, in); \ + PASS3_8W(5, in); \ + PASS4_8W(5, in); \ + PASS5_8W(5, in); \ + UPDATE_STATE_8W; \ + } while (0) + +#define DSTATE_8W __m256i s0, s1, s2, s3, s4, s5, s6, s7 + +#define RSTATE_8W \ +do { \ + s0 = sc->s0; \ + s1 = sc->s1; \ + s2 = sc->s2; \ + s3 = sc->s3; \ + s4 = sc->s4; \ + s5 = sc->s5; \ + s6 = sc->s6; \ + s7 = sc->s7; \ +} while (0) + +#define WSTATE_8W \ +do { \ + sc->s0 = s0; \ + sc->s1 = s1; \ + sc->s2 = s2; \ + sc->s3 = s3; \ + sc->s4 = s4; \ + sc->s5 = s5; \ + sc->s6 = s6; \ + sc->s7 = s7; \ +} while (0) + +static void +haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes ) +{ + sc->s0 = m256_const1_32( 0x243F6A88UL ); + sc->s1 = m256_const1_32( 0x85A308D3UL ); + sc->s2 = m256_const1_32( 0x13198A2EUL ); + sc->s3 = m256_const1_32( 0x03707344UL ); + sc->s4 = m256_const1_32( 0xA4093822UL ); + sc->s5 = m256_const1_32( 0x299F31D0UL ); + sc->s6 = m256_const1_32( 0x082EFA98UL ); + sc->s7 = m256_const1_32( 0xEC4E6C89UL ); + sc->olen = olen; + sc->passes = passes; + sc->count_high = 0; + sc->count_low = 0; + +} +#define IN_PREPARE_8W(indata) const __m256i *const load_ptr_8w = (indata) + +#define INW_8W(i) load_ptr_8w[ i ] + +static void +haval_8way_out( 
haval_8way_context *sc, void *dst ) +{ + __m256i *buf = (__m256i*)dst; + DSTATE_8W; + RSTATE_8W; + + buf[0] = s0; + buf[1] = s1; + buf[2] = s2; + buf[3] = s3; + buf[4] = s4; + buf[5] = s5; + buf[6] = s6; + buf[7] = s7; +} + +#undef PASSES +#define PASSES 5 +#include "haval-8way-helper.c" + +#define API_8W(xxx, y) \ +void \ +haval ## xxx ## _ ## y ## _8way_init(void *cc) \ +{ \ + haval_8way_init(cc, xxx >> 5, y); \ +} \ + \ +void \ +haval ## xxx ## _ ## y ## _8way_update (void *cc, const void *data, size_t len) \ +{ \ + haval ## y ## _8way_update(cc, data, len); \ +} \ + \ +void \ +haval ## xxx ## _ ## y ## _8way_close(void *cc, void *dst) \ +{ \ + haval ## y ## _8way_close(cc, dst); \ +} \ + +API_8W(256, 5) + +#define RVAL_8W \ +do { \ + s0 = val[0]; \ + s1 = val[1]; \ + s2 = val[2]; \ + s3 = val[3]; \ + s4 = val[4]; \ + s5 = val[5]; \ + s6 = val[6]; \ + s7 = val[7]; \ +} while (0) + +#define WVAL_8W \ +do { \ + val[0] = s0; \ + val[1] = s1; \ + val[2] = s2; \ + val[3] = s3; \ + val[4] = s4; \ + val[5] = s5; \ + val[6] = s6; \ + val[7] = s7; \ +} while (0) + +#define INMSG_8W(i) msg[i] + + + +#endif // AVX2 + #ifdef __cplusplus } #endif diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h index 47338ce..9bd37ba 100644 --- a/algo/haval/haval-hash-4way.h +++ b/algo/haval/haval-hash-4way.h @@ -59,7 +59,7 @@ */ #ifndef HAVAL_HASH_4WAY_H__ -#define HAVAL_HASH_4WAY_H__ +#define HAVAL_HASH_4WAY_H__ 1 #if defined(__AVX__) @@ -84,10 +84,30 @@ typedef haval_4way_context haval256_5_4way_context; void haval256_5_4way_init( void *cc ); -void haval256_5_4way( void *cc, const void *data, size_t len ); +void haval256_5_4way_update( void *cc, const void *data, size_t len ); +#define haval256_5_4way haval256_5_4way_update void haval256_5_4way_close( void *cc, void *dst ); +#if defined(__AVX2__) + +typedef struct { + __m256i buf[32]; + __m256i s0, s1, s2, s3, s4, s5, s6, s7; + unsigned olen, passes; + uint32_t count_high, count_low; +} haval_8way_context __attribute__ ((aligned (64))); + +typedef haval_8way_context haval256_5_8way_context; + +void haval256_5_8way_init( void *cc ); + +void haval256_5_8way_update( void *cc, const void *data, size_t len ); + +void haval256_5_8way_close( void *cc, void *dst ); + +#endif // AVX2 + #ifdef __cplusplus } #endif diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index b608ba2..4b1f7e6 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -44,8 +44,13 @@ bool lyra2rev3_thread_init() { const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + int size = ROW_LEN_BYTES * 4; // nRows; - int size = (int64_t)ROW_LEN_BYTES * 4; // nRows; +#if defined(LYRA2REV3_16WAY) +// l2v3_wholeMatrix = _mm_malloc( 2*size, 128 ); + l2v3_wholeMatrix = _mm_malloc( 2*size, 64 ); + init_lyra2rev3_16way_ctx();; +#else l2v3_wholeMatrix = _mm_malloc( size, 64 ); #if defined (LYRA2REV3_8WAY) init_lyra2rev3_8way_ctx();; @@ -53,13 +58,17 @@ bool lyra2rev3_thread_init() init_lyra2rev3_4way_ctx();; #else init_lyra2rev3_ctx(); +#endif #endif return l2v3_wholeMatrix; } bool register_lyra2rev3_algo( algo_gate_t* gate ) { -#if defined (LYRA2REV3_8WAY) +#if defined(LYRA2REV3_16WAY) + gate->scanhash = (void*)&scanhash_lyra2rev3_16way; + gate->hash = (void*)&lyra2rev3_16way_hash; +#elif defined (LYRA2REV3_8WAY) gate->scanhash = (void*)&scanhash_lyra2rev3_8way; gate->hash = (void*)&lyra2rev3_8way_hash; #elif defined (LYRA2REV3_4WAY) @@ -69,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate ) 
gate->scanhash = (void*)&scanhash_lyra2rev3; gate->hash = (void*)&lyra2rev3_hash; #endif - gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; gate->miner_thread_init = (void*)&lyra2rev3_thread_init; opt_target_factor = 256.0; return true; @@ -85,10 +94,14 @@ bool lyra2rev2_thread_init() const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; int size = (int64_t)ROW_LEN_BYTES * 4; // nRows; +#if defined (LYRA2REV2_8WAY) + l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 ); // 2 way + init_lyra2rev2_8way_ctx();; +#elif defined (LYRA2REV2_4WAY) l2v2_wholeMatrix = _mm_malloc( size, 64 ); -#if defined (LYRA2REV2_4WAY) init_lyra2rev2_4way_ctx();; #else + l2v2_wholeMatrix = _mm_malloc( size, 64 ); init_lyra2rev2_ctx(); #endif return l2v2_wholeMatrix; @@ -96,14 +109,17 @@ bool lyra2rev2_thread_init() bool register_lyra2rev2_algo( algo_gate_t* gate ) { -#if defined (LYRA2REV2_4WAY) +#if defined (LYRA2REV2_8WAY) + gate->scanhash = (void*)&scanhash_lyra2rev2_8way; + gate->hash = (void*)&lyra2rev2_8way_hash; +#elif defined (LYRA2REV2_4WAY) gate->scanhash = (void*)&scanhash_lyra2rev2_4way; gate->hash = (void*)&lyra2rev2_4way_hash; #else gate->scanhash = (void*)&scanhash_lyra2rev2; gate->hash = (void*)&lyra2rev2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; gate->miner_thread_init = (void*)&lyra2rev2_thread_init; opt_target_factor = 256.0; return true; diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h index 8a392ca..5c48bdc 100644 --- a/algo/lyra2/lyra2-gate.h +++ b/algo/lyra2/lyra2-gate.h @@ -5,18 +5,27 @@ #include #include "lyra2.h" -#if defined(__AVX2__) - #define LYRA2REV3_8WAY -#endif -#if defined(__SSE2__) - #define LYRA2REV3_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define LYRA2REV3_16WAY 1 +#elif defined(__AVX2__) + #define LYRA2REV3_8WAY 1 +#elif defined(__SSE2__) + #define LYRA2REV3_4WAY 1 #endif extern __thread uint64_t* l2v3_wholeMatrix; bool register_lyra2rev3_algo( algo_gate_t* gate ); -#if defined(LYRA2REV3_8WAY) + +#if defined(LYRA2REV3_16WAY) + +void lyra2rev3_16way_hash( void *state, const void *input ); +int scanhash_lyra2rev3_16way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +bool init_lyra2rev3_16way_ctx(); + +#elif defined(LYRA2REV3_8WAY) void lyra2rev3_8way_hash( void *state, const void *input ); int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce, @@ -41,15 +50,24 @@ bool init_lyra2rev3_ctx(); ////////////////////////////////// -#if defined(__AVX2__) - #define LYRA2REV2_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define LYRA2REV2_8WAY 1 +#elif defined(__AVX2__) + #define LYRA2REV2_4WAY 1 #endif extern __thread uint64_t* l2v2_wholeMatrix; bool register_lyra2rev2_algo( algo_gate_t* gate ); -#if defined(LYRA2REV2_4WAY) +#if defined(LYRA2REV2_8WAY) + +void lyra2rev2_8way_hash( void *state, const void *input ); +int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +bool init_lyra2rev2_8way_ctx(); + +#elif defined(LYRA2REV2_4WAY) void lyra2rev2_4way_hash( void *state, const void *input ); int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/lyra2/lyra2-hash-2way.c b/algo/lyra2/lyra2-hash-2way.c index 
b657af0..b69eb09 100644 --- a/algo/lyra2/lyra2-hash-2way.c +++ b/algo/lyra2/lyra2-hash-2way.c @@ -26,6 +26,19 @@ #include "lyra2.h" #include "sponge.h" +// LYRA2RE 8 cols 8 rows used by lyea2re, allium, phi2, x22i, x25x. +// +// LYRA2REV2 4 cols 4 rows used by lyra2rev2. +// +// LYRA2REV3 4 cols 4 rows with an extra twist in calculating +// rowa in the wandering phase. Used by lyra2rev3. +// +// LYRA2Z various cols & rows and supports 80 input. Used by lyra2z, +// lyra2z330, lyra2h, + + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + /** * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits, @@ -46,176 +59,137 @@ * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) */ -int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, - const uint64_t pwdlen, const void *salt, const uint64_t saltlen, - const uint64_t timeCost, const uint64_t nRows, - const uint64_t nCols ) +// For lyra2rev3. +// convert a simple offset to an index into interleaved data. +// good for state and 4 row matrix. +// index = ( int( off / 4 ) * 2 ) + ( off mod 4 ) + +#define offset_to_index( o ) \ + ( ( ( (uint64_t)( (o) & 0xf) / 4 ) * 8 ) + ( (o) % 4 ) ) + + +int LYRA2REV2_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen, + const void *pwd, const uint64_t pwdlen, const uint64_t timeCost, + const uint64_t nRows, const uint64_t nCols ) { //====================== Basic variables ============================// - uint64_t _ALIGN(256) state[16]; - int64_t row = 2; //index of row to be processed - int64_t prev = 1; //index of prev (last row ever computed/modified) - int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) - int64_t tau; //Time Loop iterator - int64_t step = 1; //Visitation step (used during Setup and Wandering phases) - int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) - int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 -// int64_t i; //auxiliary iteration counter - int64_t v64; // 64bit var for memcpy + uint64_t _ALIGN(256) state[32]; + int64_t row = 2; + int64_t prev = 1; + int64_t rowa0 = 0; + int64_t rowa1 = 0; + int64_t tau; + int64_t step = 1; + int64_t window = 2; + int64_t gap = 1; //====================================================================/ - //=== Initializing the Memory Matrix and pointers to it =============// - //Tries to allocate enough space for the whole memory matrix - const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; -// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 const int64_t BLOCK_LEN = (nCols == 4) ? 
BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; uint64_t *ptrWord = wholeMatrix; -// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows ); - - //=== Getting the password + salt + basil padded with 10*1 ==========// - //OBS.:The memory matrix will temporarily hold the password: not for saving memory, - //but this ensures that the password copied locally will be overwritten as soon as possible - - //First, we clean enough blocks for the password, salt, basil and padding - int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) ) + int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1; - byte *ptrByte = (byte*) wholeMatrix; + uint64_t *ptr = wholeMatrix; + uint64_t *pw = (uint64_t*)pwd; - //Prepends the password - memcpy(ptrByte, pwd, pwdlen); - ptrByte += pwdlen; + memcpy( ptr, pw, 2*pwdlen ); // password + ptr += pwdlen>>2; + memcpy( ptr, pw, 2*pwdlen ); // password lane 1 + ptr += pwdlen>>2; - //Concatenates the salt - memcpy(ptrByte, salt, saltlen); - ptrByte += saltlen; + // now build the rest interleaving on the fly. - memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - - (saltlen + pwdlen) ); + ptr[0] = ptr[ 4] = kLen; + ptr[1] = ptr[ 5] = pwdlen; + ptr[2] = ptr[ 6] = pwdlen; // saltlen + ptr[3] = ptr[ 7] = timeCost; + ptr[8] = ptr[12] = nRows; + ptr[9] = ptr[13] = nCols; + ptr[10] = ptr[14] = 0x80; + ptr[11] = ptr[15] = 0x0100000000000000; - //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = pwdlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = saltlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = timeCost; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nRows; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nCols; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - - //Now comes the padding - *ptrByte = 0x80; //first byte of padding: right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix - ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block - *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - -// from here on it's all simd acces to state and matrix -// define vector pointers and adjust sizes and pointer offsets - - //================= Initializing the Sponge State ====================// - //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - -// initState( state ); - - //========================= Setup Phase =============================// - //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits - ptrWord = wholeMatrix; - absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN ); -/* - for (i = 0; i < nBlocksInput; i++) - { - absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) - } -*/ + absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN ); //Initializes M[0] and M[1] - reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here + 
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); - reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], - nCols); + reducedDuplexRow1_2way( state, &wholeMatrix[0], + &wholeMatrix[ 2 * ROW_LEN_INT64 ], nCols ); do { - //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); + reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64], + &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64], + &wholeMatrix[ 2* row*ROW_LEN_INT64], + nCols ); - //updates the value of row* (deterministically picked during Setup)) - rowa = (rowa + step) & (window - 1); - //update prev: it now points to the last row ever computed + rowa0 = (rowa0 + step) & (window - 1); - prev = row; - //updates row: goes to the next row to be computed - row++; + prev = row; + row++; - //Checks if all rows in the window where visited. - if (rowa == 0) - { - step = window + gap; //changes the step: approximately doubles its value - window *= 2; //doubles the size of the re-visitation window - gap = -gap; //inverts the modifier to the step - } - - } while (row < nRows); + if ( rowa0 == 0 ) + { + step = window + gap; + window *= 2; + gap = -gap; + } + } while ( row < nRows ); //===================== Wandering Phase =============================// - row = 0; //Resets the visitation to the first row of the memory matrix - for (tau = 1; tau <= timeCost; tau++) + row = 0; + for ( tau = 1; tau <= timeCost; tau++ ) { - //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 - step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; - do - { - //Selects a pseudorandom index row* - //----------------------------------------------- - rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + step = ( (tau & 1) == 0 ) ? 
-1 : ( nRows >> 1 ) - 1;
+ do
+ {
+ rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
+ rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
- //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
- //-------------------------------------------
+ reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+ &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+ &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+ &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
+ nCols );
+ prev = row;
- //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
- reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
- &wholeMatrix[rowa*ROW_LEN_INT64],
- &wholeMatrix[row*ROW_LEN_INT64], nCols );
- //update prev: it now points to the last row ever computed
- prev = row;
+ row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
- //updates row: goes to the next row to be computed
- //----------------------------------------------------
- row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
- //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
- //----------------------------------------------------
-
- } while (row != 0);
+ } while (row != 0);
 }
 //===================== Wrap-up Phase ===============================//
 //Absorbs the last block of the memory matrix
- absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+ absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
+ &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
 //Squeezes the key
- squeeze(state, K, (unsigned int) kLen);
+ squeeze_2way( state, K, (unsigned int) kLen );
 return 0;
 }
+// This version is currently only used by REv3 and has some hard coding
+// specific to v3 such as input data size of 32 bytes.
+//
+// Similarly with REv2. The difference with REv3 isn't clear and maybe
+// they can be merged.
+//
+// RE is used by lyra2re and allium. The main difference between RE and REv2
+// is the matrix size.
+//
+// Z also needs to support 80 byte input as well as 32 byte, and odd
+// matrix sizes like 330 rows. It is used by lyra2z330, lyra2z, lyra2h.
+
+
 /////////////////////////////////////////////////
 // 2 way 256
@@ -223,22 +197,29 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
 // Data is interleaved 2x256.
 int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
- const void *pwd, const uint64_t pwdlen, const void *salt,
- const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
- const uint64_t nCols )
+ const void *pwd, uint64_t pwdlen, uint64_t timeCost,
+ uint64_t nRows, uint64_t nCols )
+
+// hard coded for 32 byte input as well as matrix size.
+// Other required versions include 80 byte input and different block
+// sizes
+
+//int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
+// const void *pwd, const uint64_t pwdlen, const void *salt,
+// const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
+// const uint64_t nCols )
 {
 //====================== Basic variables ============================//
- uint64_t _ALIGN(256) state[16];
- int64_t row = 2; //index of row to be processed
- int64_t prev = 1; //index of prev (last row ever computed/modified)
- int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
- int64_t tau; //Time Loop iterator
- int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
- int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
- int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-// int64_t i; //auxiliary iteration counter
- int64_t v64; // 64bit var for memcpy
- uint64_t instance0 = 0; // Seperate instance for each lane
+ uint64_t _ALIGN(256) state[32];
+ int64_t row = 2;
+ int64_t prev = 1;
+ int64_t rowa0 = 0;
+ int64_t rowa1 = 0;
+ int64_t tau;
+ int64_t step = 1;
+ int64_t window = 2;
+ int64_t gap = 1;
+ uint64_t instance0 = 0;
 uint64_t instance1 = 0;
 //====================================================================/
@@ -248,7 +229,9 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
 uint64_t *ptrWord = wholeMatrix;
 // 2 way 256 rewrite. Salt always == password, and data is interleaved,
-// need to build in parallel:
+// need to build in parallel as pw is already interleaved.
+
+
 // { password, (64 or 80 bytes)
 // salt, (64 or 80 bytes) = same as password
 // Klen, (u64) = 32 bytes
 // pwdlen, (u64)
 // saltlen, (u64)
 // timeCost, (u64)
 // nRows, (u64)
 // nCols, (u64)
 // 0x80, (byte)
 // { 0 .. 0 },
 // 1 (byte)
 // }
-// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+// input is usually 32 maybe 64, both are aligned to 256 bit vector.
+// 80 byte input is not aligned complicating matters for lyra2z.
- int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+ int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
 / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+ uint64_t *ptr = wholeMatrix;
+ uint64_t *pw = (uint64_t*)pwd;
- byte *ptrByte = (byte*) wholeMatrix;
+ memcpy( ptr, pw, 2*pwdlen ); // password
+ ptr += pwdlen>>2;
+ memcpy( ptr, pw, 2*pwdlen ); // password lane 1
+ ptr += pwdlen>>2;
+
+ // now build the rest interleaving on the fly.
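+ // Resulting prefix block layout, one copy of the basil per lane,
+ // interleaved in groups of 4 u64 (lane 0 then lane 1):
+ //   ptr[ 0.. 3] lane 0: kLen, pwdlen, saltlen (== pwdlen), timeCost
+ //   ptr[ 4.. 7] lane 1: the same four words
+ //   ptr[ 8..11] lane 0: nRows, nCols, 0x80 (first pad byte), 0x01 in the
+ //               top byte (last pad byte)
+ //   ptr[12..15] lane 1: the same four words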
- //Prepends the password - memcpy(ptrByte, pwd, pwdlen); - ptrByte += pwdlen; - - //Concatenates the salt - memcpy(ptrByte, salt, saltlen); - ptrByte += saltlen; - - memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - - (saltlen + pwdlen) ); - - //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = pwdlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = saltlen; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = timeCost; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nRows; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - v64 = nCols; - memcpy(ptrByte, &v64, sizeof(int64_t)); - ptrByte += sizeof(uint64_t); - - //Now comes the padding - *ptrByte = 0x80; //first byte of padding: right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix - ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block - *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block - -// from here on it's all simd acces to state and matrix -// define vector pointers and adjust sizes and pointer offsets + ptr[0] = ptr[ 4] = kLen; + ptr[1] = ptr[ 5] = pwdlen; + ptr[2] = ptr[ 6] = pwdlen; // saltlen + ptr[3] = ptr[ 7] = timeCost; + ptr[8] = ptr[12] = nRows; + ptr[9] = ptr[13] = nCols; + ptr[10] = ptr[14] = 0x80; + ptr[11] = ptr[15] = 0x0100000000000000; ptrWord = wholeMatrix; - absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN ); - reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); + absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN ); - reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], - nCols); + reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); + + reducedDuplexRow1_2way( state, &wholeMatrix[0], + &wholeMatrix[2*ROW_LEN_INT64], nCols ); do { - reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); + reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ], + &wholeMatrix[ 2* row*ROW_LEN_INT64 ], + nCols ); - rowa = (rowa + step) & (window - 1); + rowa0 = (rowa0 + step) & (window - 1); prev = row; row++; - if (rowa == 0) + if (rowa0 == 0) { step = window + gap; //changes the step: approximately doubles its value window *= 2; //doubles the size of the re-visitation window @@ -340,37 +304,22 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen, row = 0; for (tau = 1; tau <= timeCost; tau++) { - step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1; + step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1; do { - // This part is not parallel, rowa will be different for each lane. - // state (u64[16]) is interleaved 2x256, need to extract seperately. 
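+ // The lanes diverge here: instance and rowa are selected independently
+ // for each lane from the 2x256 interleaved state. offset_to_index maps
+ // a lane local word offset to its interleaved position, eg lane 0 word 5
+ // sits at state[9] ( (5/4)*8 + 5%4 ); lane 1 indexes from state+4.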
+ instance0 = state[ offset_to_index( instance0 ) ]; + instance1 = (&state[4])[ offset_to_index( instance1 ) ]; - // index = 2 * instance / 4 * 4 + instance % 4 - uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 ) - + ( instance0 & 0x3 ) - uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 ) - + ( instance1 & 0x3 ) + rowa0 = state[ offset_to_index( instance0 ) ] + & (unsigned int)(nRows-1); + rowa1 = (state+4)[ offset_to_index( instance1 ) ] + & (unsigned int)(nRows-1); - instance0 = state[ index0 ] & 0xf; - instance1 = (state+4)[ index1 ] & 0xf; - - rowa0 = state[ instance0 ]; - rowa1 = (state+4)[ instance1 ]; - - reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa0*ROW_LEN_INT64], - &wholeMatrix[rowa1*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); -/* - instance = state[instance & 0xF]; - rowa = state[instance & 0xF] & (unsigned int)(nRows-1); - - reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); -*/ - // End of divergence. + reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ], + &wholeMatrix[ 2* row*ROW_LEN_INT64 ], + nCols ); prev = row; row = (row + step) & (unsigned int)(nRows-1); @@ -378,13 +327,17 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen, } while ( row != 0 ); } - absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] ); - squeeze( state, K, (unsigned int) kLen ); + absorbBlock_2way( state, &wholeMatrix[2*rowa0*ROW_LEN_INT64], + &wholeMatrix[2*rowa1*ROW_LEN_INT64] ); + + squeeze_2way( state, K, (unsigned int) kLen ); return 0; } +#endif // AVX512 +#if 0 ////////////////////////////////////////////////// int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, @@ -532,22 +485,26 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, return 0; } +#endif + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + // Lyra2RE doesn't like the new wholeMatrix implementation -int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, - const void *salt, const uint64_t saltlen, const uint64_t timeCost, - const uint64_t nRows, const uint64_t nCols ) +int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, + const uint64_t pwdlen, const uint64_t timeCost, + const uint64_t nRows, const uint64_t nCols ) { //====================== Basic variables ============================// uint64_t _ALIGN(256) state[16]; int64_t row = 2; //index of row to be processed int64_t prev = 1; //index of prev (last row ever computed/modified) - int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t rowa0 = 0; + int64_t rowa1 = 0; int64_t tau; //Time Loop iterator int64_t step = 1; //Visitation step (used during Setup and Wandering phases) int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 int64_t i; //auxiliary iteration counter - int64_t v64; // 64bit var for memcpy //====================================================================/ //=== Initializing the Memory Matrix and pointers to it =============// @@ -573,15 +530,36 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, #endif uint64_t *ptrWord = wholeMatrix; + 
uint64_t *pw = (uint64_t*)pwd; //=== Getting the password + salt + basil padded with 10*1 ==========// //OBS.:The memory matrix will temporarily hold the password: not for saving memory, //but this ensures that the password copied locally will be overwritten as soon as possible //First, we clean enough blocks for the password, salt, basil and padding - int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) ) + int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1; + uint64_t *ptr = wholeMatrix; + + memcpy( ptr, pw, 2*pwdlen ); // password + ptr += pwdlen>>2; + memcpy( ptr, pw, 2*pwdlen ); // password lane 1 + ptr += pwdlen>>2; + + // now build the rest interleaving on the fly. + + ptr[0] = ptr[ 4] = kLen; + ptr[1] = ptr[ 5] = pwdlen; + ptr[2] = ptr[ 6] = pwdlen; // saltlen + ptr[3] = ptr[ 7] = timeCost; + ptr[8] = ptr[12] = nRows; + ptr[9] = ptr[13] = nCols; + ptr[10] = ptr[14] = 0x80; + ptr[11] = ptr[15] = 0x0100000000000000; + + +/* byte *ptrByte = (byte*) wholeMatrix; //Prepends the password @@ -630,7 +608,9 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, ptrWord = wholeMatrix; - absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN ); +*/ + + absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN ); /* for (i = 0; i < nBlocksInput; i++) { @@ -639,21 +619,22 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, } */ //Initializes M[0] and M[1] - reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here + reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here - reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], - nCols); + reducedDuplexRow1_2way( state, &wholeMatrix[0], + &wholeMatrix[ 2 * ROW_LEN_INT64], nCols ); do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); + reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ], + &wholeMatrix[ 2* row*ROW_LEN_INT64 ], + nCols ); //updates the value of row* (deterministically picked during Setup)) - rowa = (rowa + step) & (window - 1); + rowa0 = (rowa0 + step) & (window - 1); //update prev: it now points to the last row ever computed prev = row; @@ -661,7 +642,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, row++; //Checks if all rows in the window where visited. - if (rowa == 0) + if (rowa0 == 0) { step = window + gap; //changes the step: approximately doubles its value window *= 2; //doubles the size of the re-visitation window @@ -674,21 +655,18 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, row = 0; //Resets the visitation to the first row of the memory matrix for (tau = 1; tau <= timeCost; tau++) { - //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 - step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; - do - { - //Selects a pseudorandom index row* - //----------------------------------------------- - rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + step = ((tau & 1) == 0) ? 
-1 : (nRows >> 1) - 1; + do + { + rowa0 = state[ 0 ] & (unsigned int)(nRows-1); + rowa1 = state[ 4 ] & (unsigned int)(nRows-1); - //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) - //------------------------------------------- + reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ], + &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ], + &wholeMatrix[ 2* row *ROW_LEN_INT64 ], + nCols ); - //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64], - &wholeMatrix[rowa*ROW_LEN_INT64], - &wholeMatrix[row*ROW_LEN_INT64], nCols ); //update prev: it now points to the last row ever computed prev = row; @@ -703,9 +681,10 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, //===================== Wrap-up Phase ===============================// //Absorbs the last block of the memory matrix - absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]); + absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64], + &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] ); //Squeezes the key - squeeze(state, K, (unsigned int) kLen); + squeeze_2way( state, K, (unsigned int) kLen ); //================== Freeing the memory =============================// _mm_free(wholeMatrix); @@ -713,3 +692,4 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, return 0; } +#endif diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c index 8db05dc..970c612 100644 --- a/algo/lyra2/lyra2.c +++ b/algo/lyra2/lyra2.c @@ -327,7 +327,6 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols); - do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) diff --git a/algo/lyra2/lyra2.h b/algo/lyra2/lyra2.h index 3c2399e..483ca2f 100644 --- a/algo/lyra2/lyra2.h +++ b/algo/lyra2/lyra2.h @@ -60,4 +60,15 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd, int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + +int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd, + uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols ); + +int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd, + uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols ); + +#endif + #endif /* LYRA2_H_ */ diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c index 9832fb1..f2954c3 100644 --- a/algo/lyra2/lyra2rev2-4way.c +++ b/algo/lyra2/lyra2rev2-4way.c @@ -1,13 +1,150 @@ #include "lyra2-gate.h" #include - -#if defined (LYRA2REV2_4WAY) - #include "algo/blake/blake-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/skein/skein-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" + +#if defined (LYRA2REV2_8WAY) + +typedef struct { + blake256_8way_context blake; + keccak256_8way_context keccak; + cube_4way_context cube; + skein256_8way_context skein; + bmw256_8way_context bmw; +} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64))); + +static lyra2v2_8way_ctx_holder l2v2_8way_ctx; + +bool init_lyra2rev2_8way_ctx() +{ + keccak256_8way_init( &l2v2_8way_ctx.keccak ); + 
cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 ); + skein256_8way_init( &l2v2_8way_ctx.skein ); + bmw256_8way_init( &l2v2_8way_ctx.bmw ); + return true; +} + +void lyra2rev2_8way_hash( void *state, const void *input ) +{ + uint32_t vhash[8*8] __attribute__ ((aligned (128))); + uint32_t vhashA[8*8] __attribute__ ((aligned (64))); + uint32_t vhashB[8*8] __attribute__ ((aligned (64))); + uint32_t hash0[8] __attribute__ ((aligned (64))); + uint32_t hash1[8] __attribute__ ((aligned (64))); + uint32_t hash2[8] __attribute__ ((aligned (64))); + uint32_t hash3[8] __attribute__ ((aligned (64))); + uint32_t hash4[8] __attribute__ ((aligned (64))); + uint32_t hash5[8] __attribute__ ((aligned (64))); + uint32_t hash6[8] __attribute__ ((aligned (64))); + uint32_t hash7[8] __attribute__ ((aligned (64))); + lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64))); + memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) ); + + blake256_8way( &ctx.blake, input + (64<<3), 16 ); + blake256_8way_close( &ctx.blake, vhash ); + + rintrlv_8x32_8x64( vhashA, vhash, 256 ); + + keccak256_8way_update( &ctx.keccak, vhashA, 32 ); + keccak256_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 ); + + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 ); + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 ); + + intrlv_2x256( vhash, hash0, hash1, 256 ); + LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash0, hash1, vhash, 256 ); + intrlv_2x256( vhash, hash2, hash3, 256 ); + LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash2, hash3, vhash, 256 ); + intrlv_2x256( vhash, hash4, hash5, 256 ); + LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash4, hash5, vhash, 256 ); + intrlv_2x256( vhash, hash6, hash7, 256 ); + LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash6, hash7, vhash, 256 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, 256 ); + + skein256_8way_update( &ctx.skein, vhash, 32 ); + skein256_8way_close( &ctx.skein, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 ); + + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 ); + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 ); + + intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, 256 ); + + bmw256_8way_update( &ctx.bmw, vhash, 32 ); + bmw256_8way_close( &ctx.bmw, state ); +} + +int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + const uint32_t Htarg = ptarget[7]; + __m256i *noncev = (__m256i*)vdata + 19; // aligned + int thr_id = mythr->id; + + if ( opt_benchmark ) + ( 
(uint32_t*)ptarget )[7] = 0x0000ff; + + mm256_bswap32_intrlv80_8x32( vdata, pdata ); + + blake256_8way_init( &l2v2_8way_ctx.blake ); + blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 ); + + do + { + *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, + n+3, n+2, n+1, n ) ); + + lyra2rev2_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( (n < last_nonce) && !work_restart[thr_id].restart); + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined (LYRA2REV2_4WAY) typedef struct { blake256_4way_context blake; diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c index 467a1a6..6e560be 100644 --- a/algo/lyra2/lyra2rev3-4way.c +++ b/algo/lyra2/lyra2rev3-4way.c @@ -4,8 +4,180 @@ #include "algo/blake/blake-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" -#if defined (LYRA2REV3_8WAY) +#if defined (LYRA2REV3_16WAY) + +typedef struct { + blake256_16way_context blake; + cube_4way_context cube; + bmw256_16way_context bmw; +} lyra2v3_16way_ctx_holder; + +static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx; + +bool init_lyra2rev3_16way_ctx() +{ + blake256_16way_init( &l2v3_16way_ctx.blake ); + cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 ); + bmw256_16way_init( &l2v3_16way_ctx.bmw ); + return true; +} + +void lyra2rev3_16way_hash( void *state, const void *input ) +{ + uint32_t vhash[16*8] __attribute__ ((aligned (128))); + uint32_t hash0[8] __attribute__ ((aligned (64))); + uint32_t hash1[8] __attribute__ ((aligned (64))); + uint32_t hash2[8] __attribute__ ((aligned (64))); + uint32_t hash3[8] __attribute__ ((aligned (64))); + uint32_t hash4[8] __attribute__ ((aligned (64))); + uint32_t hash5[8] __attribute__ ((aligned (64))); + uint32_t hash6[8] __attribute__ ((aligned (64))); + uint32_t hash7[8] __attribute__ ((aligned (64))); + uint32_t hash8[8] __attribute__ ((aligned (64))); + uint32_t hash9[8] __attribute__ ((aligned (64))); + uint32_t hash10[8] __attribute__ ((aligned (64))); + uint32_t hash11[8] __attribute__ ((aligned (64))); + uint32_t hash12[8] __attribute__ ((aligned (64))); + uint32_t hash13[8] __attribute__ ((aligned (64))); + uint32_t hash14[8] __attribute__ ((aligned (64))); + uint32_t hash15[8] __attribute__ ((aligned (64))); + lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64))); + memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) ); + + blake256_16way_update( &ctx.blake, input + (64*16), 16 ); + blake256_16way_close( &ctx.blake, vhash ); + + dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15, + vhash, 256 ); + + intrlv_2x256( vhash, hash0, hash1, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash0, hash1, vhash, 256 ); + intrlv_2x256( vhash, hash2, hash3, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash2, hash3, vhash, 256 ); + intrlv_2x256( vhash, hash4, hash5, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash4, hash5, vhash, 256 ); + intrlv_2x256( vhash, hash6, hash7, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 
1, 4, 4 ); + dintrlv_2x256( hash6, hash7, vhash, 256 ); + intrlv_2x256( vhash, hash8, hash9, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash8, hash9, vhash, 256 ); + intrlv_2x256( vhash, hash10, hash11, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash10, hash11, vhash, 256 ); + intrlv_2x256( vhash, hash12, hash13, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash12, hash13, vhash, 256 ); + intrlv_2x256( vhash, hash14, hash15, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash14, hash15, vhash, 256 ); + + intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 32 ); + dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 ); + intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 ); + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 32 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 ); + intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 ); + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 32 ); + dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 ); + intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 ); + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 32 ); + dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 ); + + intrlv_2x256( vhash, hash0, hash1, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash0, hash1, vhash, 256 ); + intrlv_2x256( vhash, hash2, hash3, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash2, hash3, vhash, 256 ); + intrlv_2x256( vhash, hash4, hash5, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash4, hash5, vhash, 256 ); + intrlv_2x256( vhash, hash6, hash7, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash6, hash7, vhash, 256 ); + intrlv_2x256( vhash, hash8, hash9, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash8, hash9, vhash, 256 ); + intrlv_2x256( vhash, hash10, hash11, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash10, hash11, vhash, 256 ); + intrlv_2x256( vhash, hash12, hash13, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash12, hash13, vhash, 256 ); + intrlv_2x256( vhash, hash14, hash15, 256 ); + LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash14, hash15, vhash, 256 ); + + intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14, + hash15, 256 ); + + bmw256_16way_update( &ctx.bmw, vhash, 32 ); + bmw256_16way_close( &ctx.bmw, state ); +} + + +int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[20*16] __attribute__ ((aligned (64))); + uint32_t *hash7 = &hash[7<<4]; + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const 
uint32_t last_nonce = max_nonce - 16; + const uint32_t Htarg = ptarget[7]; + __m512i *noncev = (__m512i*)vdata + 19; // aligned + const int thr_id = mythr->id; + + if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; + + mm512_bswap32_intrlv80_16x32( vdata, pdata ); + + blake256_16way_init( &l2v3_16way_ctx.blake ); + blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 ); + + do + { + *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12, + n+11, n+10, n+ 9, n+ 8, + n+ 7, n+ 6, n+ 5, n+ 4, + n+ 3, n+ 2, n+ 1, n ) ); + + lyra2rev3_16way_hash( hash, vdata ); + pdata[19] = n; + + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash7[lane] <= Htarg ) ) + { + extr_lane_16x32( lane_hash, hash, lane, 256 ); + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 16; + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined (LYRA2REV3_8WAY) typedef struct { blake256_8way_context blake; diff --git a/algo/lyra2/sponge-2way.c b/algo/lyra2/sponge-2way.c index 35c20cc..bb92082 100644 --- a/algo/lyra2/sponge-2way.c +++ b/algo/lyra2/sponge-2way.c @@ -19,7 +19,7 @@ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "algo-gate.h" +//#include "algo-gate.h" #include #include #include @@ -40,19 +40,26 @@ inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len ) //Squeezes full blocks for ( i = 0; i < fullBlocks; i++ ) { - memcpy_512( out, state, BLOCK_LEN_M256I*2 ); - LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] ); - out += BLOCK_LEN_M256I*2; + memcpy_512( out, state, BLOCK_LEN_M256I ); + LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] ); + out += BLOCK_LEN_M256I; } //Squeezes remaining bytes - memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) ); + memcpy_512( out, state, len_m256i % BLOCK_LEN_M256I ); } -inline void absorbBlock_2way( uint64_t *State, const uint64_t *In ) +inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0, + const uint64_t *In1 ) { register __m512i state0, state1, state2, state3; - __m512i *in = (__m512i*)In; - + __m512i in[3]; + casti_m256i( in, 0 ) = casti_m256i( In0, 0 ); + casti_m256i( in, 1 ) = casti_m256i( In1, 1 ); + casti_m256i( in, 2 ) = casti_m256i( In0, 2 ); + casti_m256i( in, 3 ) = casti_m256i( In1, 3 ); + casti_m256i( in, 4 ) = casti_m256i( In0, 4 ); + casti_m256i( in, 5 ) = casti_m256i( In1, 5 ); + state0 = _mm512_load_si512( (__m512i*)State ); state1 = _mm512_load_si512( (__m512i*)State + 1 ); state2 = _mm512_load_si512( (__m512i*)State + 2 ); @@ -90,7 +97,7 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In, state1 = _mm512_xor_si512( state1, in[1] ); LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 ); - In += block_len * 2; + In += block_len*2; } _mm512_store_si512( (__m512i*)State, state0 ); @@ -109,7 +116,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut, register __m512i state0, state1, state2, state3; - __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 ); + __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I ); state0 = _mm512_load_si512( (__m512i*)State ); state1 = _mm512_load_si512( (__m512i*)State + 1 ); @@ -126,13 +133,13 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut, { _mm_prefetch( out - 9, _MM_HINT_T0 ); _mm_prefetch( out - 11, 
_MM_HINT_T0 ); - + out[0] = state0; out[1] = state1; out[2] = state2; //Goes to next block (column) that will receive the squeezed data - out -= BLOCK_LEN_M256I * 2; + out -= BLOCK_LEN_M256I; LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 ); } @@ -143,15 +150,14 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut, _mm512_store_si512( (__m512i*)State + 3, state3 ); } -// This function has to deal with gathering 2 256 bit rowin vectors from -// non-contiguous memory. Extra work and performance penalty. inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols ) { int i; register __m512i state0, state1, state2, state3; - __m512i *in = (__m256i*)rowIn; + __m512i *in = (__m512i*)rowIn; + __m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I ); state0 = _mm512_load_si512( (__m512i*)State ); state1 = _mm512_load_si512( (__m512i*)State + 1 ); @@ -171,28 +177,25 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn, out[2] = _mm512_xor_si512( state2, in[2] ); //Input: next column (i.e., next block in sequence) - in0 += BLOCK_LEN_M256I; - in1 += BLOCK_LEN_M256I; + in += BLOCK_LEN_M256I; //Output: goes to previous column - out -= BLOCK_LEN_M256I * 2; + out -= BLOCK_LEN_M256I; } - _mm512_store_si256( (__m512i*)State, state0 ); - _mm512_store_si256( (__m512i*)State + 1, state1 ); - _mm512_store_si256( (__m512i*)State + 2, state2 ); - _mm512_store_si256( (__m512i*)State + 3, state3 ); - } + _mm512_store_si512( (__m512i*)State, state0 ); + _mm512_store_si512( (__m512i*)State + 1, state1 ); + _mm512_store_si512( (__m512i*)State + 2, state2 ); + _mm512_store_si512( (__m512i*)State + 3, state3 ); } inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols ) { int i; - register __m512i state0, state1, state2, state3; __m512i* in = (__m512i*)rowIn; __m512i* inout = (__m512i*)rowInOut; - __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 ); + __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I ); __m512i t0, t1, t2; state0 = _mm512_load_si512( (__m512i*)State ); @@ -209,7 +212,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn, state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], inout[2] ) ); - LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 ); + LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 ); out[0] = _mm512_xor_si512( state0, in[0] ); out[1] = _mm512_xor_si512( state1, in[1] ); @@ -221,17 +224,18 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn, t2 = _mm512_permutex_epi64( state2, 0x93 ); inout[0] = _mm512_xor_si512( inout[0], - _mm512_mask_blend_epi32( t0, t2, 0x03 ) ); + _mm512_mask_blend_epi32( 0x0303, t0, t2 ) ); inout[1] = _mm512_xor_si512( inout[1], - _mm512_mask_blend_epi32( t1, t0, 0x03 ) ); + _mm512_mask_blend_epi32( 0x0303, t1, t0 ) ); inout[2] = _mm512_xor_si512( inout[2], - _mm512_mask_blend_epi32( t2, t1, 0x03 ) ); + _mm512_mask_blend_epi32( 0x0303, t2, t1 ) ); + //Inputs: next column (i.e., next block in sequence) - in += BLOCK_LEN_M256I * 2; - inout += BLOCK_LEN_M256I * 2; + in += BLOCK_LEN_M256I; + inout += BLOCK_LEN_M256I; //Output: goes to previous column - out -= BLOCK_LEN_M256I * 2; + out -= BLOCK_LEN_M256I; } _mm512_store_si512( (__m512i*)State, state0 ); @@ -240,49 +244,61 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn, _mm512_store_si512( (__m512i*)State + 3, state3 ); } -inline 
void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1, - uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, - uint64_t nCols ) +// big ugly workaound for pointer aliasing, use a union of pointers. +// Access matrix using m512i for in and out, m256i for inout + +inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn, + uint64_t *rowInOut0, uint64_t *rowInOut1, + uint64_t *rowOut, uint64_t nCols) { int i; - register __m512i state0, state1, state2, state3; - __m256i *in0 = (__m256i*)rowIn0; - __m256i *in0 = (__m256i*)rowIn0; - __m2512* in = (__m512i*)rowIn; - __m2512* inout = (__m512i*)rowInOut; - __m512i* out = (__m512i*)rowOut; - __m512i t0, t1, t2; + __m512i *in = (__m512i*)rowIn; + __m256i *inout0 = (__m256i*)rowInOut0; + __m256i *inout1 = (__m256i*)rowInOut1; + __m512i *out = (__m512i*)rowOut; + __m512i io[3]; + povly inout; + inout.v512 = &io[0]; + __m512i t0, t1, t2; - _mm_prefetch( in0, _MM_HINT_T0 ); - _mm_prefetch( in1, _MM_HINT_T0 ); - _mm_prefetch( in0 + 2, _MM_HINT_T0 ); - _mm_prefetch( in1 + 2, _MM_HINT_T0 ); - _mm_prefetch( in0 + 4, _MM_HINT_T0 ); - _mm_prefetch( in1 + 4, _MM_HINT_T0 ); - _mm_prefetch( in0 + 6, _MM_HINT_T0 ); - _mm_prefetch( in1 + 6, _MM_HINT_T0 ); - state0 = _mm512_load_si512( (__m512i*)State ); state1 = _mm512_load_si512( (__m512i*)State + 1 ); state2 = _mm512_load_si512( (__m512i*)State + 2 ); state3 = _mm512_load_si512( (__m512i*)State + 3 ); + + _mm_prefetch( in, _MM_HINT_T0 ); + _mm_prefetch( inout0, _MM_HINT_T0 ); + _mm_prefetch( inout1, _MM_HINT_T0 ); + _mm_prefetch( in + 2, _MM_HINT_T0 ); + _mm_prefetch( inout0 + 2, _MM_HINT_T0 ); + _mm_prefetch( inout1 + 2, _MM_HINT_T0 ); + _mm_prefetch( in + 4, _MM_HINT_T0 ); + _mm_prefetch( inout0 + 4, _MM_HINT_T0 ); + _mm_prefetch( inout1 + 4, _MM_HINT_T0 ); + _mm_prefetch( in + 6, _MM_HINT_T0 ); + _mm_prefetch( inout0 + 6, _MM_HINT_T0 ); + _mm_prefetch( inout1 + 6, _MM_HINT_T0 ); + + + for ( i = 0; i < nCols; i++ ) + { //Absorbing "M[prev] [+] M[row*]" + inout.v256[0] = inout0[0]; + inout.v256[1] = inout1[1]; + inout.v256[2] = inout0[2]; + inout.v256[3] = inout1[3]; + inout.v256[4] = inout0[4]; + inout.v256[5] = inout1[5]; -// state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] ); -// state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] ); -// state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] ); - t0 = mm512_concat_256( in1[0], in0[0] ); - t1 = mm512_concat_256( in1[1], in0[1] ); - t2 = mm512_concat_256( in1[2], in0[2] ); - state0 = _mm512_xor_si512( state0, - _mm512_add_epi64( t0, inout[0] ) ); + _mm512_add_epi64( in[0], inout.v512[0] ) ); state1 = _mm512_xor_si512( state1, - _mm512_add_epi64( t1, inout[1] ) ); + _mm512_add_epi64( in[1], inout.v512[1] ) ); state2 = _mm512_xor_si512( state2, - _mm512_add_epi64( t2, inout[2] ) ); + _mm512_add_epi64( in[2], inout.v512[2] ) ); + //Applies the reduced-round transformation f to the sponge's state LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 ); @@ -292,22 +308,44 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1, out[1] = _mm512_xor_si512( out[1], state1 ); out[2] = _mm512_xor_si512( out[2], state2 ); + // if inout is the same row as out it was just overwritten, reload. 
+ if ( rowOut == rowInOut0 ) + { + inout.v256[0] = inout0[0]; + inout.v256[2] = inout0[2]; + inout.v256[4] = inout0[4]; + } + if ( rowOut == rowInOut1 ) + { + inout.v256[1] = inout1[1]; + inout.v256[3] = inout1[3]; + inout.v256[5] = inout1[5]; + } + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) t0 = _mm512_permutex_epi64( state0, 0x93 ); t1 = _mm512_permutex_epi64( state1, 0x93 ); t2 = _mm512_permutex_epi64( state2, 0x93 ); - inout[0] = _mm512_xor_si512( inout[0], - _mm512_mask_blend_epi32( t0, t2, 0x03 ) ); - inout[1] = _mm512_xor_si512( inout[1], - _mm512_mask_blend_epi32( t1, t0, 0x03 ) ); - inout[2] = _mm512_xor_si512( inout[2], - _mm512_mask_blend_epi32( t2, t1, 0x03 ) ); + inout.v512[0] = _mm512_xor_si512( inout.v512[0], + _mm512_mask_blend_epi32( 0x0303, t0, t2 ) ); + inout.v512[1] = _mm512_xor_si512( inout.v512[1], + _mm512_mask_blend_epi32( 0x0303, t1, t0 ) ); + inout.v512[2] = _mm512_xor_si512( inout.v512[2], + _mm512_mask_blend_epi32( 0x0303, t2, t1 ) ); + + inout0[0] = inout.v256[0]; + inout1[1] = inout.v256[1]; + inout0[2] = inout.v256[2]; + inout1[3] = inout.v256[3]; + inout0[4] = inout.v256[4]; + inout1[5] = inout.v256[5]; //Goes to next block - in += BLOCK_LEN_M256I * 2; - out += BLOCK_LEN_M256I * 2; - inout += BLOCK_LEN_M256I * 2; + in += BLOCK_LEN_M256I; + inout0 += BLOCK_LEN_M256I * 2; + inout1 += BLOCK_LEN_M256I * 2; + out += BLOCK_LEN_M256I; } _mm512_store_si512( (__m512i*)State, state0 ); diff --git a/algo/lyra2/sponge.c b/algo/lyra2/sponge.c index 5a8e71b..9f400b5 100644 --- a/algo/lyra2/sponge.c +++ b/algo/lyra2/sponge.c @@ -375,7 +375,10 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut, { _mm_prefetch( out - 9, _MM_HINT_T0 ); _mm_prefetch( out - 11, _MM_HINT_T0 ); - + +//printf("S RSR0 col= %d, out= %x\n",i,out); + + out[0] = state0; out[1] = state1; out[2] = state2; @@ -706,11 +709,34 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn, out[1] = _mm256_xor_si256( state1, in[1] ); out[2] = _mm256_xor_si256( state2, in[2] ); +/* +printf("s duplexsetup col= %d\n",i); +uint64_t * o = (uint64_t*)out; +printf("S out %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]); +printf("S out %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]); +printf("S out %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]); +printf("S out %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]); +printf("S out %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]); +printf("S out %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]); +*/ + //M[row*][col] = M[row*][col] XOR rotW(rand) t0 = _mm256_permute4x64_epi64( state0, 0x93 ); t1 = _mm256_permute4x64_epi64( state1, 0x93 ); t2 = _mm256_permute4x64_epi64( state2, 0x93 ); +/* +uint64_t *t = (uint64_t*)&t0; +printf("S t0 %016lx %016lx %016lx %016lx\n",t[0],t[1],t[2],t[3]); + +o = (uint64_t*)inout; +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]); +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]); +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]); +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]); +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]); +printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]); +*/ inout[0] = _mm256_xor_si256( inout[0], _mm256_blend_epi32( t0, t2, 0x03 ) ); inout[1] = _mm256_xor_si256( inout[1], @@ -718,7 +744,17 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn, inout[2] = _mm256_xor_si256( inout[2], _mm256_blend_epi32( 
t2, t1, 0x03 ) ); - //Inputs: next column (i.e., next block in sequence) +/* +o = (uint64_t*)inout; +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]); +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]); +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]); +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]); +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]); +printf("S inout1 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]); +*/ + +//Inputs: next column (i.e., next block in sequence) in += BLOCK_LEN_M256I; inout += BLOCK_LEN_M256I; //Output: goes to previous column @@ -949,6 +985,22 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn, _mm_prefetch( inout + 9, _MM_HINT_T0 ); _mm_prefetch( inout + 11, _MM_HINT_T0 ); +/* +uint64_t *io = (uint64_t*)inout; +uint64_t *ii = (uint64_t*)in; + +printf("RDRS1 col= %d\n", i); +printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]); +printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]); +printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]); +printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]); +printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]); +printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]); +printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]); +printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]); +*/ + + //Absorbing "M[prev] [+] M[row*]" state0 = _mm256_xor_si256( state0, _mm256_add_epi64( in[0], inout[0] ) ); diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 80f1d4f..185181b 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -65,14 +65,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 ); #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ - G_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_ror_1x64( s1); \ - s2 = mm512_swap128_256( s2 ); \ - s3 = mm512_rol1x64_256( s3 ); \ - G_4X64( s0, s1, s2, s3 ); \ - s1 = mm512_rol1x64_256( s1 ); \ - s2 = mm512_swap128_256( s2 ); \ - s3 = mm512_ror1x64_256( s3 ); + G2W_4X64( s0, s1, s2, s3 ); \ + s1 = mm512_ror256_64( s1); \ + s2 = mm512_swap256_128( s2 ); \ + s3 = mm512_rol256_64( s3 ); \ + G2W_4X64( s0, s1, s2, s3 ); \ + s1 = mm512_rol256_64( s1 ); \ + s2 = mm512_swap256_128( s2 ); \ + s3 = mm512_ror256_64( s3 ); #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \ LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ @@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_ror1x64_256( s2, s3 ); \ - mm128_swap128_256( s4, s5 ); \ - mm128_rol1x64_256( s6, s7 ); \ + mm128_ror256_64( s2, s3 ); \ + mm128_swap256_128( s4, s5 ); \ + mm128_rol256_64( s6, s7 ); \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_rol1x64_256( s2, s3 ); \ - mm128_swap128_256( s4, s5 ); \ - mm128_ror1x64_256( s6, s7 ); + mm128_rol256_64( s2, s3 ); \ + mm128_swap256_128( s4, s5 ); \ + mm128_ror256_64( s6, s7 ); #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ @@ -203,24 +203,36 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +union 
_povly +{ + __m512i *v512; + __m256i *v256; + uint64_t *u64; +}; +typedef union _povly povly; + //---- Housekeeping -void initState_2way( uint64_t state[/*16*/] ); +void initState_2way( uint64_t State[/*16*/] ); //---- Squeezes -void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len ); +void squeeze_2way( uint64_t *State, unsigned char *out, unsigned int len ); void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols ); //---- Absorbs -void absorbBlock_2way( uint64_t *state, const uint64_t *in ); -void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in, +void absorbBlock_2way( uint64_t *State, const uint64_t *In0, + const uint64_t *In1 ); +void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In, const uint64_t nBlocks, const uint64_t block_len ); //---- Duplexes -void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn, +void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols); -void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn, +void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols ); -void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); + +void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn, + uint64_t *rowInOut0, uint64_t *rowInOut1, + uint64_t *rowOut, uint64_t nCols); #endif diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c index 101a5c2..9f22d29 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -1,7 +1,4 @@ #include "hmq1725-gate.h" - -#if defined(HMQ1725_4WAY) - #include #include #include "algo/blake/blake-hash-4way.h" @@ -11,6 +8,8 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #include "algo/shavite/sph_shavite.h" @@ -23,6 +22,772 @@ #include "algo/haval/haval-hash-4way.h" #include "algo/sha/sha-hash-4way.h" +#if defined(HMQ1725_8WAY) + +union _hmq1725_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + haval256_5_8way_context haval; +} __attribute__ ((aligned (64))); + +typedef union _hmq1725_8way_context_overlay hmq1725_8way_context_overlay; + +extern void hmq1725_8way_hash(void *state, const void *input) +{ + uint32_t vhash [16<<3] __attribute__ ((aligned (128))); + uint32_t vhashA[16<<3] __attribute__ ((aligned (64))); + uint32_t vhashB[16<<3] __attribute__ ((aligned (64))); + uint32_t hash0 [16] __attribute__ ((aligned (64))); + uint32_t hash1 [16] __attribute__ ((aligned (64))); + uint32_t hash2 [16] __attribute__ ((aligned (64))); + uint32_t hash3 [16] __attribute__ ((aligned (64))); + uint32_t hash4 [16] __attribute__ ((aligned (64))); + uint32_t hash5 [16] __attribute__ ((aligned (64))); + uint32_t hash6 [16] __attribute__ ((aligned (64))); + uint32_t hash7 [16] __attribute__ ((aligned (64))); + 
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64))); + __mmask8 vh_mask; + const __m512i vmask = m512_const1_64( 24 ); + const uint32_t mask = 24; + __m512i* vh = (__m512i*)vhash; + __m512i* vhA = (__m512i*)vhashA; + __m512i* vhB = (__m512i*)vhashB; + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, input, 80 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + + // A + if ( hash0[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (char*)hash0, 512 ); + } + if ( hash1[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (char*)hash1, 512 ); + } + if ( hash2[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (char*)hash2, 512 ); + } + if ( hash3[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (char*)hash3, 512 ); + } + if ( hash4[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (char*)hash4, 512 ); + } + if ( hash5[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (char*)hash5, 512 ); + } + if ( hash6[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (char*)hash6, 512 ); + } + if ( hash7[0] & mask ) + { + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (char*)hash7, 512 ); + } + + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + // B + if ( likely( vh_mask & 0xff ) ) + { + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhashB ); + } + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask 
), + m512_zero ); + + if ( likely( ( vh_mask & 0xff ) != 0xff ) ) + { + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, vhash, 64 ); + blake512_8way_close( &ctx.blake, vhashA ); + } + + if ( likely( vh_mask & 0xff ) ) + { + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhashB ); + } + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + + if ( likely( ( vh_mask & 0xff ) != 0xff ) ) + { + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhashA ); + } + + if ( likely( vh_mask & 0xff ) ) + { + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhashB ); + } + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512 ( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + // 4x32 for haval + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + // A + if ( hash0[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + } + if ( hash1[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + } + 
if ( hash2[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + } + if ( hash3[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + if ( hash4[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + } + if ( hash5[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + } + if ( hash6[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + } + if ( hash7[0] & mask ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + // B + if ( likely( vh_mask & 0xff ) ) + { + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhash, 64 ); + haval256_5_8way_close( &ctx.haval, vhash ); + memset( &vhash[8<<3], 0, 32<<3 ); + rintrlv_8x32_8x64( vhashB, vhash, 512 ); + } + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *)hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *)hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *)hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, vhash, 64 ); + blake512_8way_close( &ctx.blake, vhash ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + // A + if ( hash0[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); // + sph_shavite512_close( &ctx.shavite, hash0 ); //8 + } + if ( hash1[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); // + sph_shavite512_close( &ctx.shavite, hash1 ); //8 + } + if ( hash2[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); // + sph_shavite512_close( &ctx.shavite, hash2 ); //8 + } + if ( hash3[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); // 
+ sph_shavite512_close( &ctx.shavite, hash3 ); //8 + } + if ( hash4[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); // + sph_shavite512_close( &ctx.shavite, hash4 ); //8 + } + if ( hash5[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); // + sph_shavite512_close( &ctx.shavite, hash5 ); //8 + } + if ( hash6[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); // + sph_shavite512_close( &ctx.shavite, hash6 ); //8 + } + if ( hash7[0] & mask ) + { + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); // + sph_shavite512_close( &ctx.shavite, hash7 ); //8 + } + + // B + if ( likely( vh_mask & 0xff ) ) + { + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhashB, 64 ); + rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); + } + + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( hash0[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *)hash0, 512 ); + } + if ( hash1[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *)hash1, 512 ); + } + if ( hash2[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *)hash2, 512 ); + } + if ( hash3[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *)hash3, 512 ); + } + if ( hash4[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *)hash4, 512 ); + } + if ( hash5[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *)hash5, 512 ); + } + if ( hash6[0] & 
mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *)hash6, 512 ); + } + if ( hash7[0] & mask ) //4 + { + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *)hash7, 512 ); + } + + // B + if ( likely( vh_mask & 0xff ) ) + { + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhashB, 512 ); + rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); + } + + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + rintrlv_8x64_8x32( vhashA, vhash, 512 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhashA, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + // A + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + + if ( hash0[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + } + if ( hash1[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + } + if ( hash2[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + } + if ( hash3[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + } + if ( hash4[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + } + if ( hash5[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + } + if ( hash6[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + } + if ( hash7[0] & mask ) + { + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + } + + intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, 
hash6, + hash7 ); + + // B + if ( likely( vh_mask & 0xff ) ) + { + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhashB ); + } + + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + + vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), + m512_zero ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // A + if ( likely( ( vh_mask & 0xff ) != 0xff ) ) + { + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhash, 64 ); + haval256_5_8way_close( &ctx.haval, vhash ); + memset( &vhash[8<<3], 0, 32<<3 ); + rintrlv_8x32_8x64( vhashA, vhash, 512 ); + } + + // B + if ( !( hash0[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + } + if ( !( hash1[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + } + if ( !( hash2[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + } + if ( !( hash3[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + if ( !( hash4[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + } + if ( !( hash5[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + } + if ( !( hash6[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + } + if ( !( hash7[0] & mask ) ) + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + + intrlv_8x64_512( vhashB, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + + bmw512_8way_init( &ctx.bmw ); + 
bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, state ); +} + +int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[49]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 4; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + hmq1725_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane<<1 ] <= Htarg ) + { + extr_lane_8x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(HMQ1725_4WAY) + union _hmq1725_4way_context_overlay { blake512_4way_context blake; @@ -34,7 +799,8 @@ union _hmq1725_4way_context_overlay hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + hashState_sd sd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -42,19 +808,19 @@ union _hmq1725_4way_context_overlay sph_whirlpool_context whirlpool; sha512_4way_context sha512; haval256_5_4way_context haval; -}; +} __attribute__ ((aligned (64))); + typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay; extern void hmq1725_4way_hash(void *state, const void *input) { -// why so big? only really need 16. 
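/* The per-lane buffers are now sized to 16 uint32_t (64 bytes), i.e. one
   512-bit hash per lane, and the interleaved vhash buffers to four lanes'
   worth (16<<2 words), replacing the oversized 32-word buffers questioned
   in the comment above. */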
- uint32_t hash0 [32] __attribute__ ((aligned (64))); - uint32_t hash1 [32] __attribute__ ((aligned (64))); - uint32_t hash2 [32] __attribute__ ((aligned (64))); - uint32_t hash3 [32] __attribute__ ((aligned (64))); - uint32_t vhash [32<<2] __attribute__ ((aligned (64))); - uint32_t vhashA[32<<2] __attribute__ ((aligned (64))); - uint32_t vhashB[32<<2] __attribute__ ((aligned (64))); + uint32_t hash0 [16] __attribute__ ((aligned (64))); + uint32_t hash1 [16] __attribute__ ((aligned (64))); + uint32_t hash2 [16] __attribute__ ((aligned (64))); + uint32_t hash3 [16] __attribute__ ((aligned (64))); + uint32_t vhash [16<<2] __attribute__ ((aligned (64))); + uint32_t vhashA[16<<2] __attribute__ ((aligned (64))); + uint32_t vhashB[16<<2] __attribute__ ((aligned (64))); hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64))); __m256i vh_mask; const __m256i vmask = m256_const1_64( 24 ); @@ -91,41 +857,41 @@ extern void hmq1725_4way_hash(void *state, const void *input) // A -// if ( hash0[0] & mask ) -// { + if ( hash0[0] & mask ) + { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); -// } -// if ( hash1[0] & mask ) -// { + } + if ( hash1[0] & mask ) + { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); -// } -// if ( hash2[0] & mask ) -// { + } + if ( hash2[0] & mask ) + { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); -// } -// if ( hash3[0] & mask ) -// { + } + if ( hash3[0] & mask ) + { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); -// } + } intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); // B -// if ( mm256_any_clr_256( vh_mask ) ) -// { + if ( mm256_anybits1( vh_mask ) ) + { skein512_4way_init( &ctx.skein ); skein512_4way( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhashB ); -// } + } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -142,19 +908,19 @@ extern void hmq1725_4way_hash(void *state, const void *input) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); -// if ( mm256_any_set_256( vh_mask ) ) -// { + if ( mm256_anybits0( vh_mask ) ) + { blake512_4way_init( &ctx.blake ); blake512_4way( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhashA ); -// } + } -// if ( mm256_any_clr_256( vh_mask ) ) -// { + if ( mm256_anybits1( vh_mask ) ) + { bmw512_4way_init( &ctx.bmw ); bmw512_4way( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); -// } + } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -193,19 +959,19 @@ extern void hmq1725_4way_hash(void *state, const void *input) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); -// if ( mm256_any_set_256( vh_mask ) ) -// { + if ( mm256_anybits0( vh_mask ) ) + { keccak512_4way_init( &ctx.keccak ); keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); -// } + } -// if ( mm256_any_clr_256( vh_mask ) ) -// { + if ( mm256_anybits1( vh_mask ) ) + { jh512_4way_init( &ctx.jh ); jh512_4way( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); -// } + } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -224,64 +990,63 @@ extern void hmq1725_4way_hash(void *state, const void *input) sph_shavite512 ( &ctx.shavite, hash3, 64 ); sph_shavite512_close( &ctx.shavite, hash3 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence 
*)hash0, 512 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); -// A is whirlpool serial, B is haval parallel. - + intrlv_2x128_512( vhashA, hash0, hash1 ); + intrlv_2x128_512( vhashB, hash2, hash3 ); - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); + + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + + // 4x32 for haval + intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); + // A -// if ( hash0[0] & mask ) -// { + if ( hash0[0] & mask ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash0, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); -// } -// if ( hash1[0] & mask ) -// { + } + if ( hash1[0] & mask ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash1, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash1 ); -// } -// if ( hash2[0] & mask ) -// { + } + if ( hash2[0] & mask ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash2, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash2 ); -// } -// if ( hash3[0] & mask ) -// { + } + if ( hash3[0] & mask ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash3, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); -// } + } intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); // B - -// if ( mm256_any_clr_256( vh_mask ) ) -// { + if ( mm256_anybits1( vh_mask ) ) + { haval256_5_4way_init( &ctx.haval ); haval256_5_4way( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhashB ); - memset( &vhashB[8<<2], 0, 32<<2); -// } + haval256_5_4way_close( &ctx.haval, vhash ); + memset( &vhash[8<<2], 0, 32<<2 ); + rintrlv_4x32_4x64( vhashB, vhash, 512 ); + } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -383,19 +1148,24 @@ extern void hmq1725_4way_hash(void *state, const void *input) sph_fugue512( &ctx.fugue, hash3, 64 ); sph_fugue512_close( &ctx.fugue, hash3 ); + // In this situation serial simd seems to be faster. 
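/* Each lane below picks either echo or SIMD from its own hash bits, so the
   lanes cannot be paired up for the 2-way SIMD context here; the scalar
   hashState_sd path (ctx.sd) is kept for the SIMD side of this branch. */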
-// A echo, B sd both serial + intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), + m256_zero ); + if ( hash0[0] & mask ) //4 { init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *)hash0, 512 ); } + else { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, + init_sd( &ctx.sd, 512 ); + update_final_sd( &ctx.sd, (BitSequence *)hash0, (const BitSequence *)hash0, 512 ); } @@ -405,10 +1175,11 @@ extern void hmq1725_4way_hash(void *state, const void *input) update_final_echo( &ctx.echo, (BitSequence *)hash1, (const BitSequence *)hash1, 512 ); } + else { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, + init_sd( &ctx.sd, 512 ); + update_final_sd( &ctx.sd, (BitSequence *)hash1, (const BitSequence *)hash1, 512 ); } @@ -418,10 +1189,11 @@ extern void hmq1725_4way_hash(void *state, const void *input) update_final_echo( &ctx.echo, (BitSequence *)hash2, (const BitSequence *)hash2, 512 ); } + else { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, + init_sd( &ctx.sd, 512 ); + update_final_sd( &ctx.sd, (BitSequence *)hash2, (const BitSequence *)hash2, 512 ); } @@ -431,10 +1203,11 @@ extern void hmq1725_4way_hash(void *state, const void *input) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *)hash3, 512 ); } + else { - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, + init_sd( &ctx.sd, 512 ); + update_final_sd( &ctx.sd, (BitSequence *)hash3, (const BitSequence *)hash3, 512 ); } @@ -466,39 +1239,39 @@ extern void hmq1725_4way_hash(void *state, const void *input) vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ), m256_zero ); -// if ( hash0[0] & mask ) -// { + if ( hash0[0] & mask ) + { sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); -// } -// if ( hash1[0] & mask ) -// { + } + if ( hash1[0] & mask ) + { sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash1, 64 ); sph_fugue512_close( &ctx.fugue, hash1 ); -// } -// if ( hash2[0] & mask ) -// { + } + if ( hash2[0] & mask ) + { sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash2, 64 ); sph_fugue512_close( &ctx.fugue, hash2 ); -// } -// if ( hash3[0] & mask ) -// { + } + if ( hash3[0] & mask ) + { sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash3, 64 ); sph_fugue512_close( &ctx.fugue, hash3 ); -// } + } intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); -// if ( mm256_any_clr_256( vh_mask ) ) -// { + if ( mm256_anybits1( vh_mask ) ) + { sha512_4way_init( &ctx.sha512 ); sha512_4way( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhashB ); -// } + } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -525,39 +1298,43 @@ extern void hmq1725_4way_hash(void *state, const void *input) m256_zero ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - -// if ( mm256_any_set_256( vh_mask ) ) //4 -// { + + // 4x32 for haval + intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); + + if ( mm256_anybits0( vh_mask ) ) + { haval256_5_4way_init( &ctx.haval ); haval256_5_4way( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhashA ); - memset( &vhashA[8<<2], 0, 32<<2 ); -// } + haval256_5_4way_close( &ctx.haval, vhash ); + memset( &vhash[8<<2], 0, 32<<2 ); + rintrlv_4x32_4x64( vhashA, vhash, 512 ); + } -// if ( !( hash0[0] & mask ) ) -// { + if ( !( hash0[0] & mask 
) ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash0, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); -// } -// if ( !( hash2[0] & mask ) ) -// { + } + if ( !( hash1[0] & mask ) ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash1, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash1 ); -// } -// if ( !( hash2[0] & mask ) ) -// { + } + if ( !( hash2[0] & mask ) ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash2, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash2 ); -// } -// if ( !( hash3[0] & mask ) ) -// { + } + if ( !( hash3[0] & mask ) ) + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash3, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); -// } + } intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 ); @@ -573,46 +1350,42 @@ extern void hmq1725_4way_hash(void *state, const void *input) int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (64))); -// uint32_t *hash7 = &(hash[25]); -// uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; + uint32_t hash[16*4] __attribute__ ((aligned (64))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[25]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 4; + __m256i *noncev = (__m256i*)vdata + 9; // aligned + int thr_id = mythr->id; - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[ m ]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - hmq1725_4way_hash( hash, vdata ); - for ( int i = 0; i < 4; i++ ) - if ( ( (hash+(i<<3))[7] & mask ) == 0 ) - { - if ( fulltest( (hash+(i<<3)), ptarget ) && !opt_benchmark ) - { - pdata[19] = n + i; - submit_lane_solution( work, (hash+(i<<3)), mythr, i ); - } - } - n += 4; - } while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; - return 0; + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + do + { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + hmq1725_4way_hash( hash, vdata ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( hash7[ lane<<1 ] <= Htarg ) + { + extr_lane_4x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 4; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce; + return 0; } #endif // HMQ1725_4WAY diff --git a/algo/quark/hmq1725-gate.c b/algo/quark/hmq1725-gate.c index 
a0ccf1b..9cc2784 100644 --- a/algo/quark/hmq1725-gate.c +++ b/algo/quark/hmq1725-gate.c @@ -2,7 +2,10 @@ bool register_hmq1725_algo( algo_gate_t* gate ) { -#if defined(HMQ1725_4WAY) +#if defined(HMQ1725_8WAY) + gate->scanhash = (void*)&scanhash_hmq1725_8way; + gate->hash = (void*)&hmq1725_8way_hash; +#elif defined(HMQ1725_4WAY) gate->scanhash = (void*)&scanhash_hmq1725_4way; gate->hash = (void*)&hmq1725_4way_hash; #else @@ -10,7 +13,7 @@ bool register_hmq1725_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_hmq1725; gate->hash = (void*)&hmq1725hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; opt_target_factor = 65536.0; return true; }; diff --git a/algo/quark/hmq1725-gate.h b/algo/quark/hmq1725-gate.h index 23f51f6..faef6fc 100644 --- a/algo/quark/hmq1725-gate.h +++ b/algo/quark/hmq1725-gate.h @@ -4,13 +4,21 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) -// #define HMQ1725_4WAY 1 +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define HMQ1725_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define HMQ1725_4WAY 1 #endif bool register_hmq1725_algo( algo_gate_t* gate ); -#if defined(HMQ1725_4WAY) +#if defined(HMQ1725_8WAY) + +void hmq1725_8way_hash( void *state, const void *input ); +int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(HMQ1725_4WAY) void hmq1725_4way_hash( void *state, const void *input ); int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/quark/hmq1725.c b/algo/quark/hmq1725.c index 4b065ef..aaea14a 100644 --- a/algo/quark/hmq1725.c +++ b/algo/quark/hmq1725.c @@ -333,6 +333,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, if (((hash64[7]&0xFFFFFFFF)==0) && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -346,6 +347,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, if (((hash64[7]&0xFFFFFFF0)==0) && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -359,6 +361,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, if (((hash64[7]&0xFFFFFF00)==0) && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -372,6 +375,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, if (((hash64[7]&0xFFFFF000)==0) && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -386,6 +390,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, if (((hash64[7]&0xFFFF0000)==0) && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && !work_restart[thr_id].restart); @@ -399,6 +404,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce, hmq1725hash(hash64, endiandata); if (fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; + work_set_target_ratio( work, hash64 ); return true; } } while (n < max_nonce && 
!work_restart[thr_id].restart); diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index 9c617ba..180d636 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -63,20 +63,6 @@ void quark_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); -// AVX 512 cmpeq returns a bit mask instead of a vector mask. -// This should simplify things but the logic doesn't seem to be working. -// The problem appears to be related to the test to skip a hash if it isn't -// to be used. Skipping the test for all 8 way hashes seems to have -// fixed it. The hash selection blending works if the hash is produced -// but the hash wasn't being produced when it should. -// Both decisions are based on the same data, the __mmask8. It works -// as a blend mask but not in a logical comparison, maybe the type is the -// problem. Maybe a cast to int or movm is needed to make it work. -// It's now moot because the hash can only be skipped 1 in 256 iterations -// when hashing parallel 8 ways. -// The performance impact of the workaround should be negligible. -// It's a problem for another day. - vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), zero ); diff --git a/algo/qubit/qubit-2way.c b/algo/qubit/qubit-2way.c index 8aee162..2b5d603 100644 --- a/algo/qubit/qubit-2way.c +++ b/algo/qubit/qubit-2way.c @@ -92,7 +92,6 @@ int scanhash_qubit_4way( struct work *work,uint32_t max_nonce, { uint32_t hash[4*8] __attribute__ ((aligned (128))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t n = pdata[19]; diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 5be93d4..2791877 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -56,7 +56,7 @@ typedef struct { __m128i val[8]; uint32_t count_high, count_low; bool initialized; -} sha256_4way_context; +} sha256_4way_context __attribute__ ((aligned (64))); void sha256_4way_init( sha256_4way_context *sc ); void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ); @@ -71,7 +71,7 @@ typedef struct { __m256i val[8]; uint32_t count_high, count_low; bool initialized; -} sha256_8way_context; +} sha256_8way_context __attribute__ ((aligned (128))); void sha256_8way_init( sha256_8way_context *sc ); void sha256_8way( sha256_8way_context *sc, const void *data, size_t len ); @@ -86,30 +86,32 @@ typedef struct { __m256i val[8]; uint64_t count; bool initialized; -} sha512_4way_context; +} sha512_4way_context __attribute__ ((aligned (128))); void sha512_4way_init( sha512_4way_context *sc); -void sha512_4way( sha512_4way_context *sc, const void *data, size_t len ); +void sha512_4way_update( sha512_4way_context *sc, const void *data, + size_t len ); +#define sha512_4way sha512_4way_update void sha512_4way_close( sha512_4way_context *sc, void *dst ); -// SHA-256 11 way hybrid -// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel. 
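The note removed from quark-4way.c above describes earlier trouble treating the __mmask8 returned by AVX-512 compares as a test condition; the new hmq1725 8-way code simply tests the mask as an integer, e.g. ( vh_mask & 0xff ) != 0xff. A minimal standalone sketch of that idea, assuming an AVX-512F capable CPU and a compiler invoked with -mavx512f (not taken from the patch itself):

#include <immintrin.h>
#include <stdio.h>

int main()
{
    /* Two vectors of eight 64-bit lanes; lanes 0 and 5 are equal. */
    __m512i a = _mm512_set_epi64( 7, 6, 0, 4, 3, 2, 1, 0 );
    __m512i b = _mm512_set_epi64( 9, 9, 0, 9, 9, 9, 9, 0 );

    /* AVX-512 compares return an 8-bit lane mask, not a vector mask. */
    __mmask8 m = _mm512_cmpeq_epi64_mask( a, b );

    /* The mask is an ordinary small integer and can be tested directly,
       the same way hmq1725_8way_hash tests ( vh_mask & 0xff ). */
    if ( ( m & 0xff ) != 0 )    printf( "some lanes equal: 0x%02x\n", (unsigned)m );
    if ( ( m & 0xff ) != 0xff ) printf( "some lanes differ\n" );

    return 0;
}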
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-512 8 way + typedef struct { - __m256i bufx[64>>2]; - __m256i valx[8]; - __m64 bufy[64>>2]; - __m64 valy[8]; - uint32_t bufz[64>>2]; - uint32_t valz[8]; - uint32_t count_high, count_low; -} sha256_11way_context; + __m512i buf[128>>3]; + __m512i val[8]; + uint64_t count; + bool initialized; +} sha512_8way_context __attribute__ ((aligned (128))); -void sha256_11way_init( sha256_11way_context *ctx ); -void sha256_11way_update( sha256_11way_context *ctx, const void *datax, - const void *datay, const void *dataz, size_t len ); -void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx, - void *dstz ); +void sha512_8way_init( sha512_8way_context *sc); +void sha512_8way_update( sha512_8way_context *sc, const void *data, + size_t len ); +void sha512_8way_close( sha512_8way_context *sc, void *dst ); + +#endif // AVX512 #endif // __AVX2__ #endif // __SSE2__ #endif // SHA256_4WAY_H__ diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 1c30074..3ee8194 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -36,8 +36,6 @@ #include #include "sha-hash-4way.h" -// SHA-512 4 way 64 bit - /* static const sph_u64 H512[8] = { SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), @@ -90,6 +88,236 @@ static const sph_u64 K512[80] = { SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) }; + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-512 8 way 64 bit + +#define CH8W(X, Y, Z) \ + _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) + +#define MAJ8W(X, Y, Z) \ + _mm512_or_si512( _mm512_and_si512( X, Y ), \ + _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) + +#define BSG8W_5_0(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) ) + +#define BSG8W_5_1(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) ) + +#define SSG8W_5_0(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) ) + +#define SSG8W_5_1(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) ) + +static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 ) +{ + __m512i w0a, w1a, w0b, w1b; + w0a = mm512_ror_64( w0, 1 ); + w1a = mm512_ror_64( w1,19 ); + w0b = mm512_ror_64( w0, 8 ); + w1b = mm512_ror_64( w1,61 ); + w0a = _mm512_xor_si512( w0a, w0b ); + w1a = _mm512_xor_si512( w1a, w1b ); + w0b = _mm512_srli_epi64( w0, 7 ); + w1b = _mm512_srli_epi64( w1, 6 ); + w0a = _mm512_xor_si512( w0a, w0b ); + w1a = _mm512_xor_si512( w1a, w1b ); + return _mm512_add_epi64( w0a, w1a ); +} + + +#define SSG8W_512x2_0( w0, w1, i ) do \ +{ \ + __m512i X0a, X1a, X0b, X1b; \ + X0a = mm512_ror_64( W[i-15], 1 ); \ + X1a = mm512_ror_64( W[i-14], 1 ); \ + X0b = mm512_ror_64( W[i-15], 8 ); \ + X1b = mm512_ror_64( W[i-14], 8 ); \ + X0a = _mm512_xor_si512( X0a, X0b ); \ + X1a = _mm512_xor_si512( X1a, X1b ); \ + X0b = _mm512_srli_epi64( W[i-15], 7 ); \ + X1b = _mm512_srli_epi64( W[i-14], 7 ); \ + w0 = _mm512_xor_si512( X0a, X0b ); \ + w1 = _mm512_xor_si512( X1a, X1b ); \ +} while(0) + +#define SSG8W_512x2_1( w0, w1, i ) do \ +{ \ + __m512i X0a, X1a, X0b, X1b; \ + X0a = mm512_ror_64( W[i-2],19 ); \ + X1a = mm512_ror_64( W[i-1],19 ); \ + X0b = mm512_ror_64( W[i-2],61 ); \ + X1b = 
mm512_ror_64( W[i-1],61 ); \ + X0a = _mm512_xor_si512( X0a, X0b ); \ + X1a = _mm512_xor_si512( X1a, X1b ); \ + X0b = _mm512_srli_epi64( W[i-2], 6 ); \ + X1b = _mm512_srli_epi64( W[i-1], 6 ); \ + w0 = _mm512_xor_si512( X0a, X0b ); \ + w1 = _mm512_xor_si512( X1a, X1b ); \ +} while(0) + +#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \ +do { \ + __m512i T1, T2; \ + __m512i K = _mm512_set1_epi64( K512[ i ] ); \ + T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \ + K, W[i] ) ); \ + T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \ + D = _mm512_add_epi64( D, T1 ); \ + H = _mm512_add_epi64( T1, T2 ); \ +} while (0) + +static void +sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] ) +{ + int i; + register __m512i A, B, C, D, E, F, G, H; + __m512i W[80]; + + mm512_block_bswap_64( W , in ); + mm512_block_bswap_64( W+8, in+8 ); + + for ( i = 16; i < 80; i++ ) + W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ), + _mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) ); + + if ( ctx->initialized ) + { + A = r[0]; + B = r[1]; + C = r[2]; + D = r[3]; + E = r[4]; + F = r[5]; + G = r[6]; + H = r[7]; + } + else + { + A = m512_const1_64( 0x6A09E667F3BCC908 ); + B = m512_const1_64( 0xBB67AE8584CAA73B ); + C = m512_const1_64( 0x3C6EF372FE94F82B ); + D = m512_const1_64( 0xA54FF53A5F1D36F1 ); + E = m512_const1_64( 0x510E527FADE682D1 ); + F = m512_const1_64( 0x9B05688C2B3E6C1F ); + G = m512_const1_64( 0x1F83D9ABFB41BD6B ); + H = m512_const1_64( 0x5BE0CD19137E2179 ); + } + + for ( i = 0; i < 80; i += 8 ) + { + SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); + SHA3_8WAY_STEP( H, A, B, C, D, E, F, G, i + 1 ); + SHA3_8WAY_STEP( G, H, A, B, C, D, E, F, i + 2 ); + SHA3_8WAY_STEP( F, G, H, A, B, C, D, E, i + 3 ); + SHA3_8WAY_STEP( E, F, G, H, A, B, C, D, i + 4 ); + SHA3_8WAY_STEP( D, E, F, G, H, A, B, C, i + 5 ); + SHA3_8WAY_STEP( C, D, E, F, G, H, A, B, i + 6 ); + SHA3_8WAY_STEP( B, C, D, E, F, G, H, A, i + 7 ); + } + + if ( ctx->initialized ) + { + r[0] = _mm512_add_epi64( r[0], A ); + r[1] = _mm512_add_epi64( r[1], B ); + r[2] = _mm512_add_epi64( r[2], C ); + r[3] = _mm512_add_epi64( r[3], D ); + r[4] = _mm512_add_epi64( r[4], E ); + r[5] = _mm512_add_epi64( r[5], F ); + r[6] = _mm512_add_epi64( r[6], G ); + r[7] = _mm512_add_epi64( r[7], H ); + } + else + { + ctx->initialized = true; + r[0] = _mm512_add_epi64( A, m512_const1_64( 0x6A09E667F3BCC908 ) ); + r[1] = _mm512_add_epi64( B, m512_const1_64( 0xBB67AE8584CAA73B ) ); + r[2] = _mm512_add_epi64( C, m512_const1_64( 0x3C6EF372FE94F82B ) ); + r[3] = _mm512_add_epi64( D, m512_const1_64( 0xA54FF53A5F1D36F1 ) ); + r[4] = _mm512_add_epi64( E, m512_const1_64( 0x510E527FADE682D1 ) ); + r[5] = _mm512_add_epi64( F, m512_const1_64( 0x9B05688C2B3E6C1F ) ); + r[6] = _mm512_add_epi64( G, m512_const1_64( 0x1F83D9ABFB41BD6B ) ); + r[7] = _mm512_add_epi64( H, m512_const1_64( 0x5BE0CD19137E2179 ) ); + } +} + +void sha512_8way_init( sha512_8way_context *sc ) +{ + sc->initialized = false; + sc->count = 0; +} + +void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len ) +{ + __m512i *vdata = (__m512i*)data; + size_t ptr; + const int buf_size = 128; + + ptr = (unsigned)sc->count & (buf_size - 1U); + while ( len > 0 ) + { + size_t clen; + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_512( sc->buf + (ptr>>3), vdata, clen>>3 ); + vdata = vdata + (clen>>3); + ptr += clen; + len -= clen; + if ( ptr == buf_size ) + { + sha512_8way_round( sc, sc->buf, sc->val ); + ptr = 0; + } + sc->count += clen; + } 
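/* count accumulates the total number of input bytes; its low seven bits
   ( count & (buf_size - 1), buf_size == 128 ) give the offset into the
   128-byte block buffer on the next update and during close. */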
+} + +void sha512_8way_close( sha512_8way_context *sc, void *dst ) +{ + unsigned ptr; + const int buf_size = 128; + const int pad = buf_size - 16; + const __m512i shuff_bswap64 = m512_const_64( + 0x38393a3b3c3d3e3f, 0x3031323334353637, + 0x28292a2b2c2d2e2f, 0x2021222324252627, + 0x18191a1b1c1d1e1f, 0x1011121314151617, + 0x08090a0b0c0d0e0f, 0x0001020304050607 ); + + ptr = (unsigned)sc->count & (buf_size - 1U); + sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 ); + ptr += 8; + if ( ptr > pad ) + { + memset_zero_512( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 ); + sha512_8way_round( sc, sc->buf, sc->val ); + memset_zero_512( sc->buf, pad >> 3 ); + } + else + memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); + + sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8( + _mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 ); + sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8( + _mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 ); + sha512_8way_round( sc, sc->buf, sc->val ); + + mm512_block_bswap_64( dst, sc->val ); +} + + +#endif // AVX512 + +// SHA-512 4 way 64 bit + + #define CH(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) @@ -254,7 +482,7 @@ void sha512_4way_init( sha512_4way_context *sc ) sc->count = 0; } -void sha512_4way( sha512_4way_context *sc, const void *data, size_t len ) +void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len ) { __m256i *vdata = (__m256i*)data; size_t ptr; diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index b84246b..dffa18d 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -33,7 +33,7 @@ #include #include -#ifdef __AVX2__ +#ifdef __SSE4_1__ #include "shabal-hash-4way.h" #ifdef __cplusplus @@ -58,6 +58,599 @@ extern "C"{ #define O2 9 #define O3 6 + +#if defined(__AVX2__) + +#define DECL_STATE8 \ + __m256i A00, A01, A02, A03, A04, A05, A06, A07, \ + A08, A09, A0A, A0B; \ + __m256i B0, B1, B2, B3, B4, B5, B6, B7, \ + B8, B9, BA, BB, BC, BD, BE, BF; \ + __m256i C0, C1, C2, C3, C4, C5, C6, C7, \ + C8, C9, CA, CB, CC, CD, CE, CF; \ + __m256i M0, M1, M2, M3, M4, M5, M6, M7, \ + M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 Wlow, Whigh; + +#define READ_STATE8(state) do \ +{ \ + if ( (state)->state_loaded ) \ + { \ + A00 = (state)->A[0]; \ + A01 = (state)->A[1]; \ + A02 = (state)->A[2]; \ + A03 = (state)->A[3]; \ + A04 = (state)->A[4]; \ + A05 = (state)->A[5]; \ + A06 = (state)->A[6]; \ + A07 = (state)->A[7]; \ + A08 = (state)->A[8]; \ + A09 = (state)->A[9]; \ + A0A = (state)->A[10]; \ + A0B = (state)->A[11]; \ + B0 = (state)->B[0]; \ + B1 = (state)->B[1]; \ + B2 = (state)->B[2]; \ + B3 = (state)->B[3]; \ + B4 = (state)->B[4]; \ + B5 = (state)->B[5]; \ + B6 = (state)->B[6]; \ + B7 = (state)->B[7]; \ + B8 = (state)->B[8]; \ + B9 = (state)->B[9]; \ + BA = (state)->B[10]; \ + BB = (state)->B[11]; \ + BC = (state)->B[12]; \ + BD = (state)->B[13]; \ + BE = (state)->B[14]; \ + BF = (state)->B[15]; \ + C0 = (state)->C[0]; \ + C1 = (state)->C[1]; \ + C2 = (state)->C[2]; \ + C3 = (state)->C[3]; \ + C4 = (state)->C[4]; \ + C5 = (state)->C[5]; \ + C6 = (state)->C[6]; \ + C7 = (state)->C[7]; \ + C8 = (state)->C[8]; \ + C9 = (state)->C[9]; \ + CA = (state)->C[10]; \ + CB = (state)->C[11]; \ + CC = (state)->C[12]; \ + CD = (state)->C[13]; \ + CE = (state)->C[14]; \ + CF = (state)->C[15]; \ + } \ + else \ + { \ + (state)->state_loaded = true; \ + A00 = m256_const1_64( 0x20728DFD20728DFD ); \ + A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \ + A02 = m256_const1_64( 0xE782B699E782B699 
); \ + A03 = m256_const1_64( 0x5530463255304632 ); \ + A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \ + A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \ + A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \ + A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \ + A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \ + A09 = m256_const1_64( 0x8BD144108BD14410 ); \ + A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \ + A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \ + B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \ + B1 = m256_const1_64( 0x07B385F307B385F3 ); \ + B2 = m256_const1_64( 0xE7442C26E7442C26 ); \ + B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \ + B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \ + B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \ + B6 = m256_const1_64( 0x73B9D31473B9D314 ); \ + B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \ + B8 = m256_const1_64( 0x48910A5A48910A5A ); \ + B9 = m256_const1_64( 0x893B22DB893B22DB ); \ + BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \ + BB = m256_const1_64( 0xBBC4324EBBC4324E ); \ + BC = m256_const1_64( 0x72D2F24072D2F240 ); \ + BD = m256_const1_64( 0x75941D9975941D99 ); \ + BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \ + BF = m256_const1_64( 0xA1A7502BA1A7502B ); \ + C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \ + C1 = m256_const1_64( 0x58BAD75058BAD750 ); \ + C2 = m256_const1_64( 0x56028CB256028CB2 ); \ + C3 = m256_const1_64( 0x8134F3598134F359 ); \ + C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \ + C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \ + C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \ + C7 = m256_const1_64( 0x0405278004052780 ); \ + C8 = m256_const1_64( 0x7F07D7877F07D787 ); \ + C9 = m256_const1_64( 0x5194358F5194358F ); \ + CA = m256_const1_64( 0x3C60D6653C60D665 ); \ + CB = m256_const1_64( 0xBE97D79ABE97D79A ); \ + CC = m256_const1_64( 0x950C3434950C3434 ); \ + CD = m256_const1_64( 0xAED9A06DAED9A06D ); \ + CE = m256_const1_64( 0x2537DC8D2537DC8D ); \ + CF = m256_const1_64( 0x7CDB59697CDB5969 ); \ + } \ + Wlow = (state)->Wlow; \ + Whigh = (state)->Whigh; \ +} while (0) + +#define WRITE_STATE8(state) do { \ + (state)->A[0] = A00; \ + (state)->A[1] = A01; \ + (state)->A[2] = A02; \ + (state)->A[3] = A03; \ + (state)->A[4] = A04; \ + (state)->A[5] = A05; \ + (state)->A[6] = A06; \ + (state)->A[7] = A07; \ + (state)->A[8] = A08; \ + (state)->A[9] = A09; \ + (state)->A[10] = A0A; \ + (state)->A[11] = A0B; \ + (state)->B[0] = B0; \ + (state)->B[1] = B1; \ + (state)->B[2] = B2; \ + (state)->B[3] = B3; \ + (state)->B[4] = B4; \ + (state)->B[5] = B5; \ + (state)->B[6] = B6; \ + (state)->B[7] = B7; \ + (state)->B[8] = B8; \ + (state)->B[9] = B9; \ + (state)->B[10] = BA; \ + (state)->B[11] = BB; \ + (state)->B[12] = BC; \ + (state)->B[13] = BD; \ + (state)->B[14] = BE; \ + (state)->B[15] = BF; \ + (state)->C[0] = C0; \ + (state)->C[1] = C1; \ + (state)->C[2] = C2; \ + (state)->C[3] = C3; \ + (state)->C[4] = C4; \ + (state)->C[5] = C5; \ + (state)->C[6] = C6; \ + (state)->C[7] = C7; \ + (state)->C[8] = C8; \ + (state)->C[9] = C9; \ + (state)->C[10] = CA; \ + (state)->C[11] = CB; \ + (state)->C[12] = CC; \ + (state)->C[13] = CD; \ + (state)->C[14] = CE; \ + (state)->C[15] = CF; \ + (state)->Wlow = Wlow; \ + (state)->Whigh = Whigh; \ + } while (0) + +#define DECODE_BLOCK8 \ +do { \ + M0 = buf[ 0]; \ + M1 = buf[ 1]; \ + M2 = buf[ 2]; \ + M3 = buf[ 3]; \ + M4 = buf[ 4]; \ + M5 = buf[ 5]; \ + M6 = buf[ 6]; \ + M7 = buf[ 7]; \ + M8 = buf[ 8]; \ + M9 = buf[ 9]; \ + MA = buf[10]; \ + MB = buf[11]; \ + MC = buf[12]; \ + MD = buf[13]; \ + ME = buf[14]; \ + MF = buf[15]; \ +} while (0) + +#define 
INPUT_BLOCK_ADD8 \ +do { \ + B0 = _mm256_add_epi32( B0, M0 );\ + B1 = _mm256_add_epi32( B1, M1 );\ + B2 = _mm256_add_epi32( B2, M2 );\ + B3 = _mm256_add_epi32( B3, M3 );\ + B4 = _mm256_add_epi32( B4, M4 );\ + B5 = _mm256_add_epi32( B5, M5 );\ + B6 = _mm256_add_epi32( B6, M6 );\ + B7 = _mm256_add_epi32( B7, M7 );\ + B8 = _mm256_add_epi32( B8, M8 );\ + B9 = _mm256_add_epi32( B9, M9 );\ + BA = _mm256_add_epi32( BA, MA );\ + BB = _mm256_add_epi32( BB, MB );\ + BC = _mm256_add_epi32( BC, MC );\ + BD = _mm256_add_epi32( BD, MD );\ + BE = _mm256_add_epi32( BE, ME );\ + BF = _mm256_add_epi32( BF, MF );\ +} while (0) + +#define INPUT_BLOCK_SUB8 \ +do { \ + C0 = _mm256_sub_epi32( C0, M0 ); \ + C1 = _mm256_sub_epi32( C1, M1 ); \ + C2 = _mm256_sub_epi32( C2, M2 ); \ + C3 = _mm256_sub_epi32( C3, M3 ); \ + C4 = _mm256_sub_epi32( C4, M4 ); \ + C5 = _mm256_sub_epi32( C5, M5 ); \ + C6 = _mm256_sub_epi32( C6, M6 ); \ + C7 = _mm256_sub_epi32( C7, M7 ); \ + C8 = _mm256_sub_epi32( C8, M8 ); \ + C9 = _mm256_sub_epi32( C9, M9 ); \ + CA = _mm256_sub_epi32( CA, MA ); \ + CB = _mm256_sub_epi32( CB, MB ); \ + CC = _mm256_sub_epi32( CC, MC ); \ + CD = _mm256_sub_epi32( CD, MD ); \ + CE = _mm256_sub_epi32( CE, ME ); \ + CF = _mm256_sub_epi32( CF, MF ); \ +} while (0) + +#define XOR_W8 \ +do { \ + A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \ + A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \ +} while (0) + +#define SWAP_BC8 \ +do { \ + mm256_swap512_256( B0, C0 ); \ + mm256_swap512_256( B1, C1 ); \ + mm256_swap512_256( B2, C2 ); \ + mm256_swap512_256( B3, C3 ); \ + mm256_swap512_256( B4, C4 ); \ + mm256_swap512_256( B5, C5 ); \ + mm256_swap512_256( B6, C6 ); \ + mm256_swap512_256( B7, C7 ); \ + mm256_swap512_256( B8, C8 ); \ + mm256_swap512_256( B9, C9 ); \ + mm256_swap512_256( BA, CA ); \ + mm256_swap512_256( BB, CB ); \ + mm256_swap512_256( BC, CC ); \ + mm256_swap512_256( BD, CD ); \ + mm256_swap512_256( BE, CE ); \ + mm256_swap512_256( BF, CF ); \ +} while (0) + +#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ +do { \ + xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \ + _mm256_andnot_si256( xb3, xb2 ), \ + _mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \ + _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \ + ) ), _mm256_set1_epi32(3UL) ) ) ) ); \ + xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \ +} while (0) + +#define PERM_STEP_0_8 do { \ + PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_1_8 do { \ + PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \ + 
PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_2_8 do { \ + PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \ + PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \ + PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \ + PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \ + PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \ + PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \ + PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \ + PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \ + PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \ + PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \ + PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \ + PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \ + PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \ + PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \ + PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \ + PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define APPLY_P8 \ +do { \ + B0 = mm256_ror_32( B0, 15 ); \ + B1 = mm256_ror_32( B1, 15 ); \ + B2 = mm256_ror_32( B2, 15 ); \ + B3 = mm256_ror_32( B3, 15 ); \ + B4 = mm256_ror_32( B4, 15 ); \ + B5 = mm256_ror_32( B5, 15 ); \ + B6 = mm256_ror_32( B6, 15 ); \ + B7 = mm256_ror_32( B7, 15 ); \ + B8 = mm256_ror_32( B8, 15 ); \ + B9 = mm256_ror_32( B9, 15 ); \ + BA = mm256_ror_32( BA, 15 ); \ + BB = mm256_ror_32( BB, 15 ); \ + BC = mm256_ror_32( BC, 15 ); \ + BD = mm256_ror_32( BD, 15 ); \ + BE = mm256_ror_32( BE, 15 ); \ + BF = mm256_ror_32( BF, 15 ); \ + PERM_STEP_0_8; \ + PERM_STEP_1_8; \ + PERM_STEP_2_8; \ + A0B = _mm256_add_epi32( A0B, C6 ); \ + A0A = _mm256_add_epi32( A0A, C5 ); \ + A09 = _mm256_add_epi32( A09, C4 ); \ + A08 = _mm256_add_epi32( A08, C3 ); \ + A07 = _mm256_add_epi32( A07, C2 ); \ + A06 = _mm256_add_epi32( A06, C1 ); \ + A05 = _mm256_add_epi32( A05, C0 ); \ + A04 = _mm256_add_epi32( A04, CF ); \ + A03 = _mm256_add_epi32( A03, CE ); \ + A02 = _mm256_add_epi32( A02, CD ); \ + A01 = _mm256_add_epi32( A01, CC ); \ + A00 = _mm256_add_epi32( A00, CB ); \ + A0B = _mm256_add_epi32( A0B, CA ); \ + A0A = _mm256_add_epi32( A0A, C9 ); \ + A09 = _mm256_add_epi32( A09, C8 ); \ + A08 = _mm256_add_epi32( A08, C7 ); \ + A07 = _mm256_add_epi32( A07, C6 ); \ + A06 = _mm256_add_epi32( A06, C5 ); \ + A05 = _mm256_add_epi32( A05, C4 ); \ + A04 = _mm256_add_epi32( A04, C3 ); \ + A03 = _mm256_add_epi32( A03, C2 ); \ + A02 = _mm256_add_epi32( A02, C1 ); \ + A01 = _mm256_add_epi32( A01, C0 ); \ + A00 = _mm256_add_epi32( A00, CF ); \ + A0B = _mm256_add_epi32( A0B, CE ); \ + A0A = _mm256_add_epi32( A0A, CD ); \ + A09 = _mm256_add_epi32( A09, CC ); \ + A08 = _mm256_add_epi32( A08, CB ); \ + A07 = _mm256_add_epi32( A07, CA ); \ + A06 = _mm256_add_epi32( A06, C9 ); \ + A05 = _mm256_add_epi32( A05, C8 ); \ + A04 = _mm256_add_epi32( A04, C7 ); \ + A03 = _mm256_add_epi32( A03, C6 ); \ + A02 = _mm256_add_epi32( A02, C5 ); \ + A01 = _mm256_add_epi32( A01, C4 ); \ + A00 = _mm256_add_epi32( A00, C3 ); \ +} while (0) + +#define INCR_W8 do { \ + if ((Wlow = T32(Wlow + 1)) == 0) \ + Whigh = T32(Whigh + 1); \ + } while (0) + +static void +shabal_8way_init( 
void *cc, unsigned size ) +{ + shabal_8way_context *sc = (shabal_8way_context*)cc; + + if ( size == 512 ) + { // copy immediate constants directly to working registers later. + sc->state_loaded = false; + } + else + { // No users + sc->state_loaded = true; + sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 ); + sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 ); + sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC ); + sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 ); + sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 ); + sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 ); + sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA ); + sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C ); + sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 ); + sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC ); + sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B ); + sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A ); + + sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE ); + sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 ); + sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F ); + sc->B[ 3] = m256_const1_64( 0x9301515F9301515F ); + sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA ); + sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 ); + sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 ); + sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 ); + sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 ); + sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 ); + sc->B[10] = m256_const1_64( 0xBE216306BE216306 ); + sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 ); + sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B ); + sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 ); + sc->B[14] = m256_const1_64( 0x30924DD430924DD4 ); + sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 ); + + sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 ); + sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA ); + sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 ); + sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 ); + sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE ); + sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 ); + sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 ); + sc->C[ 7] = m256_const1_64( 0xED614433ED614433 ); + sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 ); + sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA ); + sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B ); + sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F ); + sc->C[12] = m256_const1_64( 0xBC968828BC968828 ); + sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 ); + sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 ); + sc->C[15] = m256_const1_64( 0x9B491C609B491C60 ); + } + sc->Wlow = 1; + sc->Whigh = 0; + sc->ptr = 0; +} + +static void +shabal_8way_core( void *cc, const unsigned char *data, size_t len ) +{ + shabal_8way_context *sc = (shabal_8way_context*)cc; + __m256i *buf; + __m256i *vdata = (__m256i*)data; + const int buf_size = 64; + size_t ptr; + DECL_STATE8 + + buf = sc->buf; + ptr = sc->ptr; + + if ( len < (buf_size - ptr ) ) + { + memcpy_256( buf + (ptr>>2), vdata, len>>2 ); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE8( sc ); + + while ( len > 0 ) + { + size_t clen; + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_256( buf + (ptr>>2), vdata, clen>>2 ); + + ptr += clen; + vdata += clen>>2; + len -= clen; + if ( ptr == buf_size ) + { + DECODE_BLOCK8; + INPUT_BLOCK_ADD8; + XOR_W8; + APPLY_P8; + INPUT_BLOCK_SUB8; + SWAP_BC8; + INCR_W8; + ptr = 0; + } + } + WRITE_STATE8(sc); + sc->ptr = ptr; +} + +static void +shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst, + unsigned size_words ) +{ + 
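+   /* size_words selects the output width: 16 words per lane for Shabal-512,
+      8 for Shabal-256; ub/n carry a final partial byte and are passed as 0,0
+      by the plain _close wrappers.  The body pads the 64-byte block, runs one
+      compression plus the three extra SWAP_BC8/XOR_W8/APPLY_P8 rounds of
+      Shabal finalization, then copies the B words (all 16, or the upper 8)
+      to dst. */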
shabal_8way_context *sc = (shabal_8way_context*)cc; + __m256i *buf; + const int buf_size = 64; + size_t ptr; + int i; + unsigned z, zz; + DECL_STATE8 + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + zz = ((ub & -z) | z) & 0xFF; + buf[ptr>>2] = _mm256_set1_epi32( zz ); + memset_zero_256( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 ); + READ_STATE8(sc); + DECODE_BLOCK8; + INPUT_BLOCK_ADD8; + XOR_W8; + APPLY_P8; + + for ( i = 0; i < 3; i ++ ) + { + SWAP_BC8; + XOR_W8; + APPLY_P8; + } + + __m256i *d = (__m256i*)dst; + if ( size_words == 16 ) // 512 + { + d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3; + d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7; + d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB; + d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF; + } + else // 256 + { + d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB; + d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF; + } +} + +void +shabal256_8way_init( void *cc ) +{ + shabal_8way_init(cc, 256); +} + +void +shabal256_8way_update( void *cc, const void *data, size_t len ) +{ + shabal_8way_core( cc, data, len ); +} + +void +shabal256_8way_close( void *cc, void *dst ) +{ + shabal_8way_close(cc, 0, 0, dst, 8); +} + +void +shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, + void *dst ) +{ + shabal_8way_close(cc, ub, n, dst, 8); +} + +void +shabal512_8way_init(void *cc) +{ + shabal_8way_init(cc, 512); +} + +void +shabal512_8way_update(void *cc, const void *data, size_t len) +{ + shabal_8way_core(cc, data, len); +} + +void +shabal512_8way_close(void *cc, void *dst) +{ + shabal_8way_close(cc, 0, 0, dst, 16); +} + +void +shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shabal_8way_close(cc, ub, n, dst, 16); +} + + +#endif // AVX2 + /* * We copy the state into local variables, so that the compiler knows * that it can optimize them at will. 
@@ -290,6 +883,8 @@ do { \ A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \ A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \ } while (0) + + /* #define SWAP(v1, v2) do { \ sph_u32 tmp = (v1); \ @@ -297,26 +892,39 @@ do { \ (v2) = tmp; \ } while (0) */ + #define SWAP_BC \ do { \ - mm128_swap128_256( B0, C0 ); \ - mm128_swap128_256( B1, C1 ); \ - mm128_swap128_256( B2, C2 ); \ - mm128_swap128_256( B3, C3 ); \ - mm128_swap128_256( B4, C4 ); \ - mm128_swap128_256( B5, C5 ); \ - mm128_swap128_256( B6, C6 ); \ - mm128_swap128_256( B7, C7 ); \ - mm128_swap128_256( B8, C8 ); \ - mm128_swap128_256( B9, C9 ); \ - mm128_swap128_256( BA, CA ); \ - mm128_swap128_256( BB, CB ); \ - mm128_swap128_256( BC, CC ); \ - mm128_swap128_256( BD, CD ); \ - mm128_swap128_256( BE, CE ); \ - mm128_swap128_256( BF, CF ); \ + mm128_swap256_128( B0, C0 ); \ + mm128_swap256_128( B1, C1 ); \ + mm128_swap256_128( B2, C2 ); \ + mm128_swap256_128( B3, C3 ); \ + mm128_swap256_128( B4, C4 ); \ + mm128_swap256_128( B5, C5 ); \ + mm128_swap256_128( B6, C6 ); \ + mm128_swap256_128( B7, C7 ); \ + mm128_swap256_128( B8, C8 ); \ + mm128_swap256_128( B9, C9 ); \ + mm128_swap256_128( BA, CA ); \ + mm128_swap256_128( BB, CB ); \ + mm128_swap256_128( BC, CC ); \ + mm128_swap256_128( BD, CD ); \ + mm128_swap256_128( BE, CE ); \ + mm128_swap256_128( BF, CF ); \ } while (0) +/* +#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ +do { \ + __m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\ + _mm_set1_epi32(5UL) ) \ + __m128i t2 = _mm_xor_si128( xa0, xc ); \ + xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \ + xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \ + _mm_xor_si128( t2, \ + _mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \ +*/ + #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ do { \ xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \ @@ -706,7 +1314,7 @@ shabal256_4way_init( void *cc ) } void -shabal256_4way( void *cc, const void *data, size_t len ) +shabal256_4way_update( void *cc, const void *data, size_t len ) { shabal_4way_core( cc, data, len ); } @@ -731,7 +1339,7 @@ shabal512_4way_init(void *cc) } void -shabal512_4way(void *cc, const void *data, size_t len) +shabal512_4way_update(void *cc, const void *data, size_t len) { shabal_4way_core(cc, data, len); } diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index bf54b59..c296f8c 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -36,7 +36,7 @@ #ifndef SHABAL_HASH_4WAY_H__ #define SHABAL_HASH_4WAY_H__ 1 -#ifdef __AVX2__ +#ifdef __SSE4_1__ #include #include "algo/sha/sph_types.h" @@ -50,6 +50,34 @@ extern "C"{ #define SPH_SIZE_shabal512 512 +#if defined(__AVX2__) + +typedef struct { + __m256i buf[16]; + __m256i A[12], B[16], C[16]; + sph_u32 Whigh, Wlow; + size_t ptr; + bool state_loaded; +} shabal_8way_context __attribute__ ((aligned (64))); + +typedef shabal_8way_context shabal256_8way_context; +typedef shabal_8way_context shabal512_8way_context; + +void shabal256_8way_init( void *cc ); +void shabal256_8way_update( void *cc, const void *data, size_t len ); +void shabal256_8way_close( void *cc, void *dst ); +void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, + void *dst ); + +void shabal512_8way_init( void *cc ); +void shabal512_8way_update( void *cc, const void *data, size_t len ); +void shabal512_8way_close( void *cc, void *dst ); +void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, + void *dst ); + + +#endif + typedef 
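+/* The 4-way (__m128i) and 8-way (__m256i) contexts share the same sph-style
+   call sequence: init, update (any number of times), close.  Usage sketch,
+   caller names illustrative only:
+       shabal512_8way_context c;
+       shabal512_8way_init( &c );
+       shabal512_8way_update( &c, vhash, 64 );   // 8x32-interleaved input
+       shabal512_8way_close( &c, vhash );        // 64 bytes per lane out
+   The 4-way versions take 4x32-interleaved data instead. */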
struct { __m128i buf[16] __attribute__ ((aligned (64))); __m128i A[12], B[16], C[16]; @@ -62,13 +90,14 @@ typedef shabal_4way_context shabal256_4way_context; typedef shabal_4way_context shabal512_4way_context; void shabal256_4way_init( void *cc ); -void shabal256_4way( void *cc, const void *data, size_t len ); +void shabal256_4way_update( void *cc, const void *data, size_t len ); void shabal256_4way_close( void *cc, void *dst ); void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst ); void shabal512_4way_init( void *cc ); -void shabal512_4way( void *cc, const void *data, size_t len ); +void shabal512_4way_update( void *cc, const void *data, size_t len ); +#define shabal512_4way shabal512_4way_update void shabal512_4way_close( void *cc, void *dst ); void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst ); diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index d061ef0..25fe8a6 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -3,6 +3,12 @@ #include +// This implementation is deprecated, superseded by VAES in Icelake +// which provides HW based 4 way aes. +// It was created for AVX2 to eliminate interleaving between the +// preceding and following function. +// This code can be removed when current users have reverted to one way. + #if defined(__AVX2__) @@ -16,8 +22,8 @@ static const uint32_t IV512[] = #define mm256_ror2x256hi_1x32( a, b ) \ - _mm256_blend_epi32( mm256_ror1x32_128( a ), \ - mm256_ror1x32_128( b ), 0x88 ) + _mm256_blend_epi32( mm256_ror128_32( a ), \ + mm256_ror128_32( b ), 0x88 ) static void c512_2way( shavite512_2way_context *ctx, const void *msg ) @@ -61,7 +67,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) { // round 1, 5, 9 - k00 = _mm256_xor_si256( k13, mm256_ror1x32_128( + k00 = _mm256_xor_si256( k13, mm256_ror128_32( mm256_aesenc_2x128( k00, zero ) ) ); if ( r == 0 ) @@ -71,7 +77,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); k01 = _mm256_xor_si256( k00, - mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) ); if ( r == 1 ) k01 = _mm256_xor_si256( k01, _mm256_set_epi32( @@ -80,25 +86,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); k02 = _mm256_xor_si256( k01, - mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); k03 = _mm256_xor_si256( k02, - mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); k10 = _mm256_xor_si256( k03, - mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); k11 = _mm256_xor_si256( k10, - mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); k12 = _mm256_xor_si256( k11, - mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); k13 = _mm256_xor_si256( k12, - mm256_ror1x32_128( 
mm256_aesenc_2x128( k13, zero ) ) ); + mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) ); if ( r == 2 ) k13 = _mm256_xor_si256( k13, _mm256_set_epi32( @@ -134,31 +140,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 3, 7, 11 - k00 = _mm256_xor_si256( mm256_ror1x32_128( + k00 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror1x32_128( + k01 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror1x32_128( + k02 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror1x32_128( + k03 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p1 = _mm256_xor_si256( p1, x ); - k10 = _mm256_xor_si256( mm256_ror1x32_128( + k10 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror1x32_128( + k11 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = _mm256_xor_si256( mm256_ror1x32_128( + k12 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ), k11 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror1x32_128( + k13 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); @@ -192,35 +198,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) // round 13 - k00 = _mm256_xor_si256( mm256_ror1x32_128( + k00 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k00, zero ) ), k13 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero ); - k01 = _mm256_xor_si256( mm256_ror1x32_128( + k01 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ), k00 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero ); - k02 = _mm256_xor_si256( mm256_ror1x32_128( + k02 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ), k01 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero ); - k03 = _mm256_xor_si256( mm256_ror1x32_128( + k03 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ), k02 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero ); p3 = _mm256_xor_si256( p3, x ); - k10 = _mm256_xor_si256( mm256_ror1x32_128( + k10 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ), k03 ); x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero ); - k11 = _mm256_xor_si256( mm256_ror1x32_128( + k11 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ), k10 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero ); - k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ); + k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ); k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1, ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero ); - k13 = _mm256_xor_si256( mm256_ror1x32_128( + k13 = _mm256_xor_si256( mm256_ror128_32( mm256_aesenc_2x128( k13, 
zero ) ), k12 ); x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero ); diff --git a/algo/x11/c11-4way.c b/algo/x11/c11-4way.c index 1bd1664..fcae00c 100644 --- a/algo/x11/c11-4way.c +++ b/algo/x11/c11-4way.c @@ -51,6 +51,8 @@ void init_c11_8way_ctx() void c11_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -107,21 +109,18 @@ void c11_8way_hash( void *state, const void *input ) skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); - // Serial - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); - // 7 Luffa + 8 cube - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); // 9 Shavite sph_shavite512( &ctx.shavite, hash0, 64 ); diff --git a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c index ad3168d..a30cbc0 100644 --- a/algo/x11/x11-4way.c +++ b/algo/x11/x11-4way.c @@ -51,6 +51,8 @@ void init_x11_8way_ctx() void x11_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -108,20 +110,18 @@ void x11_8way_hash( void *state, const void *input ) keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); - // Luffa + Cube - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + cube_4way_update_close( &ctx.cube, 
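+                       /* rintrlv_8x64_4x128 above re-interleaves the 8x64
+                          Keccak output straight into two 4x128 buffers
+                          (lanes 0-3 in vhash0, 4-7 in vhash1), so Luffa and
+                          Cube run 4-way without first deinterleaving to the
+                          eight scalar hash buffers and back. */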
vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); diff --git a/algo/x12/x12-4way.c b/algo/x12/x12-4way.c index 90ed730..ed4d131 100644 --- a/algo/x12/x12-4way.c +++ b/algo/x12/x12-4way.c @@ -1,7 +1,4 @@ #include "x12-gate.h" - -#if defined(X12_4WAY) - #include #include #include @@ -14,11 +11,223 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" -//#include "algo/fugue/sph_fugue.h" + +#if defined(X12_8WAY) + + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; +} x12_8way_ctx_holder; + +x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64))); + +void init_x12_8way_ctx() +{ + blake512_8way_init( &x12_8way_ctx.blake ); + bmw512_8way_init( &x12_8way_ctx.bmw ); + init_groestl( &x12_8way_ctx.groestl, 64 ); + skein512_8way_init( &x12_8way_ctx.skein ); + jh512_8way_init( &x12_8way_ctx.jh ); + keccak512_8way_init( &x12_8way_ctx.keccak ); + luffa_4way_init( &x12_8way_ctx.luffa, 512 ); + cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x12_8way_ctx.shavite ); + simd_4way_init( &x12_8way_ctx.simd, 512 ); + init_echo( &x12_8way_ctx.echo, 512 ); + hamsi512_8way_init( &x12_8way_ctx.hamsi ); +}; + +void x12_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + + x12_8way_ctx_holder ctx; + memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( 
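+                        /* Shavite has no n-way vector implementation used
+                           here (see the note in shavite-hash-2way.c), so the
+                           eight lanes go through the serial sph code, with
+                           the context recopied between lanes. */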
&ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + memcpy( &ctx.shavite, &x12_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + memcpy( &ctx.groestl, &x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + memcpy( &ctx.groestl, 
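+           /* The one-way algos (Shavite, Echo, Groestl) keep per-lane state,
+              so after each lane the context is restored by memcpy from the
+              saved initial x12_8way_ctx instead of re-running init. */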
&x12_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, state ); +} + +int scanhash_x12_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[49]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + do { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); + + x12_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane<<1 ] < Htarg ) + { + extr_lane_8x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(X12_4WAY) typedef struct { blake512_4way_context blake; @@ -63,45 +272,13 @@ void x12_4way_hash( void *state, const void *input ) x12_4way_ctx_holder ctx; memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) ); - // 1 Blake blake512_4way( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); - // 2 Bmw bmw512_4way( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); - - // Serial dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - // 3 Groestl - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - - // Parallel 4way 64 bit - intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - - // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); - skein512_4way_close( &ctx.skein, vhash ); - - // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); - - // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); - - // Serial - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - - // 7 Luffa intrlv_2x128( vhash, hash0, hash1, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); dintrlv_2x128( hash0, hash1, vhash, 512 ); @@ -110,7 +287,6 @@ void x12_4way_hash( void *state, const void *input ) luffa_2way_update_close( &ctx.luffa, 
vhash, vhash, 64 ); dintrlv_2x128( hash2, hash3, vhash, 512 ); - // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashInit( &ctx.cube, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); @@ -119,7 +295,6 @@ void x12_4way_hash( void *state, const void *input ) cubehashInit( &ctx.cube, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 ); - // 9 Shavite sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); memcpy( &ctx.shavite, &x12_4way_ctx.shavite, @@ -135,7 +310,6 @@ void x12_4way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash3, 64 ); sph_shavite512_close( &ctx.shavite, hash3 ); - // 10 Simd intrlv_2x128( vhash, hash0, hash1, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); dintrlv_2x128( hash0, hash1, vhash, 512 ); @@ -144,21 +318,25 @@ void x12_4way_hash( void *state, const void *input ) simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); dintrlv_2x128( hash2, hash3, vhash, 512 ); - // 11 Echo - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - // 12 Hamsi parallel 4way 32 bit + // Parallel 4way 64 bit intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); diff --git a/algo/x12/x12-gate.c b/algo/x12/x12-gate.c index 05f7173..f495747 100644 --- a/algo/x12/x12-gate.c +++ b/algo/x12/x12-gate.c @@ -2,7 +2,11 @@ bool register_x12_algo( algo_gate_t* gate ) { -#if defined (X12_4WAY) +#if defined (X12_8WAY) + init_x12_8way_ctx(); + gate->scanhash = (void*)&scanhash_x12_8way; + gate->hash = (void*)&x12_8way_hash; +#elif defined (X12_4WAY) init_x12_4way_ctx(); gate->scanhash = (void*)&scanhash_x12_4way; gate->hash = (void*)&x12_4way_hash; @@ -11,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x12; gate->hash = (void*)&x12hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x12/x12-gate.h b/algo/x12/x12-gate.h index e26956e..998f09b 100644 --- a/algo/x12/x12-gate.h +++ 
b/algo/x12/x12-gate.h @@ -4,29 +4,36 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define X12_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X12_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X12_4WAY 1 #endif bool register_x12_algo( algo_gate_t* gate ); -#if defined(X12_4WAY) +#if defined(X12_8WAY) + +void x12_8way_hash( void *state, const void *input ); +int scanhash_x12_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_x12_8way_ctx(); + +#elif defined(X12_4WAY) void x12_4way_hash( void *state, const void *input ); - int scanhash_x12_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_x12_4way_ctx(); -#endif +#else void x12hash( void *state, const void *input ); - int scanhash_x12( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_x12_ctx(); #endif +#endif + diff --git a/algo/x12/x12.c b/algo/x12/x12.c index 87a4fa6..1545ca4 100644 --- a/algo/x12/x12.c +++ b/algo/x12/x12.c @@ -20,35 +20,40 @@ #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" #if defined(__AES__) #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif typedef struct { + sph_blake512_context blake; + sph_bmw512_context bmw; + sph_skein512_context skein; + sph_jh512_context jh; + sph_keccak512_context keccak; #if defined(__AES__) - hashState_groestl groestl; - hashState_echo echo; + hashState_groestl groestl; + hashState_echo echo; #else - sph_groestl512_context groestl; - sph_echo512_context echo; + sph_groestl512_context groestl; + sph_echo512_context echo; #endif - hashState_luffa luffa; - cubehashParam cubehash; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; + hashState_luffa luffa; + cubehashParam cubehash; + sph_shavite512_context shavite; + hashState_sd simd; + sph_hamsi512_context hamsi; } x12_ctx_holder; x12_ctx_holder x12_ctx; void init_x12_ctx() { + sph_blake512_init( &x12_ctx.blake ); + sph_bmw512_init( &x12_ctx.bmw ); + sph_skein512_init( &x12_ctx.skein); + sph_jh512_init( &x12_ctx.jh); + sph_keccak512_init( &x12_ctx.keccak); #if defined(__AES__) init_echo( &x12_ctx.echo, 512 ); init_groestl (&x12_ctx.groestl, 64 ); @@ -65,102 +70,59 @@ void init_x12_ctx() void x12hash(void *output, const void *input) { + unsigned char hash[128] __attribute__ ((aligned (32))); #define hashB hash+64 - x12_ctx_holder ctx; - memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) ); + x12_ctx_holder ctx; + memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) ); - // X11 algos + sph_blake512(&ctx.blake, input, 80); + sph_blake512_close(&ctx.blake, hash); - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; + sph_bmw512(&ctx.bmw, hash, 64); + sph_bmw512_close(&ctx.bmw, hash); - //---blake1--- + update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, + (const BitSequence*)hash, 64 ); - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; + cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, + (const byte*)hashB, 64 ); - //---bmw2--- + sph_shavite512( &ctx.shavite, hash, 64); + sph_shavite512_close( &ctx.shavite, hashB); - DECL_BMW; - BMW_I; - BMW_U; - - 
#define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groetl---- + update_final_sd( &ctx.simd, (BitSequence *)hash, + (const BitSequence *)hashB, 512 ); #if defined(__AES__) - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); -#else - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - //---skein4--- - - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - //---jh5------ - - DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - //--- luffa7 - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence*)hash, 64 ); - - // 8 Cube - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hashB, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hashB); - - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hashB, 512 ); - - //11---echo--- - -#if defined(__AES__) - update_final_echo ( &ctx.echo, (BitSequence *)hashB, + update_final_echo ( &ctx.echo, (BitSequence *)hashB, (const BitSequence *)hash, 512 ); #else - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hashB); + sph_echo512(&ctx.echo, hash, 64); + sph_echo512_close(&ctx.echo, hashB); #endif - // 12 Hamsi +#if defined(__AES__) + update_and_final_groestl( &ctx.groestl, (char*)hash, + (const char*)hash, 512 ); +#else + sph_groestl512 (&ctx.groestl, hash, 64); + sph_groestl512_close(&ctx.groestl, hash); +#endif + + sph_skein512(&ctx.skein, hash, 64); + sph_skein512_close(&ctx.skein, hash); + + sph_jh512(&ctx.jh, hash, 64); + sph_jh512_close(&ctx.jh, hash); + + sph_keccak512(&ctx.keccak, hash, 64); + sph_keccak512_close(&ctx.keccak, hash); + sph_hamsi512(&ctx.hamsi, hashB, 64); sph_hamsi512_close(&ctx.hamsi, hash); - asm volatile ("emms"); memcpy(output, hashB, 32); } diff --git a/algo/x13/x13-4way.c b/algo/x13/x13-4way.c index 970f8e3..40b4b5b 100644 --- a/algo/x13/x13-4way.c +++ b/algo/x13/x13-4way.c @@ -1,7 +1,4 @@ #include "x13-gate.h" - -#if defined(X13_4WAY) - #include #include #include @@ -14,12 +11,270 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" +#if defined(X13_8WAY) + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; +} x13_8way_ctx_holder; + +x13_8way_ctx_holder x13_8way_ctx; + +void init_x13_8way_ctx() +{ + blake512_8way_init( &x13_8way_ctx.blake ); + bmw512_8way_init( &x13_8way_ctx.bmw ); + init_groestl( &x13_8way_ctx.groestl, 64 ); + skein512_8way_init( &x13_8way_ctx.skein ); + jh512_8way_init( &x13_8way_ctx.jh ); + keccak512_8way_init( &x13_8way_ctx.keccak ); + luffa_4way_init( &x13_8way_ctx.luffa, 512 ); + cube_4way_init( &x13_8way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x13_8way_ctx.shavite ); + simd_4way_init( &x13_8way_ctx.simd, 512 ); + init_echo( 
&x13_8way_ctx.echo, 512 ); + hamsi512_8way_init( &x13_8way_ctx.hamsi ); + sph_fugue512_init( &x13_8way_ctx.fugue ); +} + +void x13_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + + x13_8way_ctx_holder ctx; + memcpy( &ctx, &x13_8way_ctx, sizeof(x13_8way_ctx) ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + memcpy( &ctx.groestl, &x13_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + 
sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + memcpy( &ctx.shavite, &x13_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 13 Fugue serial + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash4, 
64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + memcpy( state, hash0, 32 ); + memcpy( state+ 32, hash1, 32 ); + memcpy( state+ 64, hash2, 32 ); + memcpy( state+ 96, hash3, 32 ); + memcpy( state+128, hash4, 32 ); + memcpy( state+160, hash5, 32 ); + memcpy( state+192, hash6, 32 ); + memcpy( state+224, hash7, 32 ); +} + + +int scanhash_x13_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + int thr_id = mythr->id; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + const uint32_t Htarg = ptarget[7]; + const uint32_t last_nonce = max_nonce -8; + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x13_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( ( hash+(i<<3) )[7] < Htarg + && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} + + +#elif defined(X13_4WAY) + + typedef struct { blake512_4way_context blake; bmw512_4way_context bmw; diff --git a/algo/x13/x13-gate.c b/algo/x13/x13-gate.c index 60973d3..366185c 100644 --- a/algo/x13/x13-gate.c +++ b/algo/x13/x13-gate.c @@ -2,7 +2,11 @@ bool register_x13_algo( algo_gate_t* gate ) { -#if defined (X13_4WAY) +#if defined (X13_8WAY) + init_x13_8way_ctx(); + gate->scanhash = (void*)&scanhash_x13_8way; + gate->hash = (void*)&x13_8way_hash; +#elif defined (X13_4WAY) init_x13_4way_ctx(); gate->scanhash = (void*)&scanhash_x13_4way; gate->hash = (void*)&x13_4way_hash; @@ -11,7 +15,7 @@ bool register_x13_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x13; gate->hash = (void*)&x13hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x13/x13-gate.h b/algo/x13/x13-gate.h index c61d7d6..6718eb3 100644 --- a/algo/x13/x13-gate.h +++ b/algo/x13/x13-gate.h @@ -4,29 +4,35 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define X13_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X13_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X13_4WAY 1 #endif bool register_x13_algo( algo_gate_t* gate ); -#if defined(X13_4WAY) +#if defined(X13_8WAY) + +void x13_8way_hash( void *state, const void *input ); +int scanhash_x13_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_x13_8way_ctx(); + +#elif defined(X13_4WAY) void 
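+/* Exactly one variant is compiled in: X13_8WAY needs the full AVX512
+   F/VL/DQ/BW feature set, X13_4WAY needs AVX2 plus AES-NI, and plain x13hash
+   is the fallback; register_x13_algo() wires the matching scanhash/hash pair
+   into the algo gate. */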
x13_4way_hash( void *state, const void *input ); - int scanhash_x13_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_x13_4way_ctx(); -#endif +#else void x13hash( void *state, const void *input ); - int scanhash_x13( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_x13_ctx(); #endif +#endif diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c index 5267d78..9de05d3 100644 --- a/algo/x14/x14-4way.c +++ b/algo/x14/x14-4way.c @@ -1,7 +1,4 @@ #include "x14-gate.h" - -#if defined(X14_4WAY) - #include #include #include @@ -13,6 +10,7 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" @@ -22,6 +20,263 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" +#if defined(X14_8WAY) + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; +} x14_8way_ctx_holder; + +x14_8way_ctx_holder x14_8way_ctx __attribute__ ((aligned (64))); + +void init_x14_8way_ctx() +{ + blake512_8way_init( &x14_8way_ctx.blake ); + bmw512_8way_init( &x14_8way_ctx.bmw ); + init_groestl( &x14_8way_ctx.groestl, 64 ); + skein512_8way_init( &x14_8way_ctx.skein ); + jh512_8way_init( &x14_8way_ctx.jh ); + keccak512_8way_init( &x14_8way_ctx.keccak ); + luffa_4way_init( &x14_8way_ctx.luffa, 512 ); + cube_4way_init( &x14_8way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x14_8way_ctx.shavite ); + simd_4way_init( &x14_8way_ctx.simd, 512 ); + init_echo( &x14_8way_ctx.echo, 512 ); + hamsi512_8way_init( &x14_8way_ctx.hamsi ); + sph_fugue512_init( &x14_8way_ctx.fugue ); + shabal512_8way_init( &x14_8way_ctx.shabal ); +}; + +void x14_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + + x14_8way_ctx_holder ctx; + memcpy( &ctx, &x14_8way_ctx, sizeof(x14_8way_ctx) ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + 
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + memcpy( &ctx.groestl, &x14_8way_ctx.groestl, + sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + memcpy( &ctx.shavite, &x14_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, 
sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 13 Fugue serial + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + // 14 Shabal, parallel 32 bit + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, state ); +} + + +int scanhash_x14_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (64))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + const uint32_t Htarg = ptarget[7]; + int thr_id = mythr->id; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x14_8way_hash( hash, vdata ); + 
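+      // The Shabal output is left in 8x32 interleaved form: hash7 below points
+      // at word 7 of every lane, and only lanes below the 32-bit target are
+      // deinterleaved and run through the full 256-bit test.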
pdata[19] = n; + + uint32_t *hash7 = &(hash[7<<3]); + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane ] < Htarg ) + { + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(X14_4WAY) + typedef struct { blake512_4way_context blake; bmw512_4way_context bmw; @@ -61,11 +316,11 @@ void init_x14_4way_ctx() void x14_4way_hash( void *state, const void *input ) { + uint64_t vhash[8*4] __attribute__ ((aligned (128))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); x14_4way_ctx_holder ctx; memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) ); @@ -184,61 +439,49 @@ void x14_4way_hash( void *state, const void *input ) // 14 Shabal, parallel 32 bit intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, state ); } int scanhash_x14_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*16] __attribute__ ((aligned (64))); + uint32_t hash[4*16] __attribute__ ((aligned (128))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 4; __m256i *noncev = (__m256i*)vdata + 9; // aligned const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - + int thr_id = mythr->id; mm256_bswap32_intrlv80_4x64( vdata, pdata ); - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) + do + { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x14_4way_hash( hash, vdata ); + pdata[19] = n; + + uint32_t *hash7 = &(hash[7<<2]); + for ( int lane = 0; lane < 4; lane++ ) + if ( hash7[ lane ] < Htarg ) { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + uint32_t lane_hash[8]; + extr_lane_4x32( lane_hash, hash, lane, 256 ); - x14_4way_hash( hash, vdata ); - pdata[19] = n; - - uint32_t *hash7 = &(hash[7<<2]); - - for ( int lane = 0; lane < 4; lane++ ) - if ( ( hash7[ lane ] & mask ) == 0 ) - { - // deinterleave hash for lane - uint32_t lane_hash[8]; - extr_lane_4x32( lane_hash, hash, lane, 256 ); - - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, lane_hash, mythr, lane ); - } - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 4; + } while ( ( n < 
last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x14/x14-gate.c b/algo/x14/x14-gate.c index 013aa10..851b7c3 100644 --- a/algo/x14/x14-gate.c +++ b/algo/x14/x14-gate.c @@ -2,7 +2,11 @@ bool register_x14_algo( algo_gate_t* gate ) { -#if defined (X14_4WAY) +#if defined (X14_8WAY) + init_x14_8way_ctx(); + gate->scanhash = (void*)&scanhash_x14_8way; + gate->hash = (void*)&x14_8way_hash; +#elif defined (X14_4WAY) init_x14_4way_ctx(); gate->scanhash = (void*)&scanhash_x14_4way; gate->hash = (void*)&x14_4way_hash; @@ -11,7 +15,7 @@ bool register_x14_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x14; gate->hash = (void*)&x14hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x14/x14-gate.h b/algo/x14/x14-gate.h index 9df974f..97f4800 100644 --- a/algo/x14/x14-gate.h +++ b/algo/x14/x14-gate.h @@ -4,20 +4,29 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define X14_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X14_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X14_4WAY 1 #endif bool register_x14_algo( algo_gate_t* gate ); -#if defined(X14_4WAY) +#if defined(X14_8WAY) + +void x14_8way_hash( void *state, const void *input ); +int scanhash_x14_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_x14_8way_ctx(); + +#elif defined(X14_4WAY) void x14_4way_hash( void *state, const void *input ); int scanhash_x14_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void init_x14_4way_ctx(); -#endif +#else void x14hash( void *state, const void *input ); int scanhash_x14( struct work *work, uint32_t max_nonce, @@ -26,3 +35,4 @@ void init_x14_ctx(); #endif +#endif diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c index 87fe361..a761af0 100644 --- a/algo/x15/x15-4way.c +++ b/algo/x15/x15-4way.c @@ -1,7 +1,4 @@ #include "x15-gate.h" - -#if defined(X15_4WAY) - #include #include #include @@ -14,6 +11,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -23,6 +21,309 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" +#if defined(X15_8WAY) + + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; +} x15_8way_ctx_holder; + +x15_8way_ctx_holder x15_8way_ctx __attribute__ ((aligned (64))); + +void init_x15_8way_ctx() +{ + blake512_8way_init( &x15_8way_ctx.blake ); + bmw512_8way_init( &x15_8way_ctx.bmw ); + init_groestl( &x15_8way_ctx.groestl, 64 ); + skein512_8way_init( &x15_8way_ctx.skein ); + jh512_8way_init( &x15_8way_ctx.jh ); + keccak512_8way_init( &x15_8way_ctx.keccak ); + luffa_4way_init( &x15_8way_ctx.luffa, 512 ); + cube_4way_init( 
&x15_8way_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x15_8way_ctx.shavite ); + simd_4way_init( &x15_8way_ctx.simd, 512 ); + init_echo( &x15_8way_ctx.echo, 512 ); + hamsi512_8way_init( &x15_8way_ctx.hamsi ); + sph_fugue512_init( &x15_8way_ctx.fugue ); + shabal512_8way_init( &x15_8way_ctx.shabal ); + sph_whirlpool_init( &x15_8way_ctx.whirlpool ); +}; + +void x15_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[4*8] __attribute__ ((aligned (64))); + uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + x15_8way_ctx_holder ctx; + memcpy( &ctx, &x15_8way_ctx, sizeof(x15_8way_ctx) ); + + // 1 Blake + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + // 2 Bmw + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 3 Groestl + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + // 5 JH + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + // 6 Keccak + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + + // 9 Shavite + sph_shavite512( 
&ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + memcpy( &ctx.shavite, &x15_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + // 10 Simd + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + + // 11 Echo + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + + // 12 Hamsi parallel 4way 64 bit + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 13 Fugue + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); 
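+   // Fugue is not vectorised, so each lane goes through the sph reference
+   // code with the context restored from x15_8way_ctx between lanes.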
+ sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + + // 14 Shabal, parallel 32 bit + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 15 Whirlpool + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool, + sizeof(sph_whirlpool_context) ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + memcpy( state, hash0, 32 ); + memcpy( state+ 32, hash1, 32 ); + memcpy( state+ 64, hash2, 32 ); + memcpy( state+ 96, hash3, 32 ); + memcpy( state+128, hash4, 32 ); + memcpy( state+160, hash5, 32 ); + memcpy( state+192, hash6, 32 ); + memcpy( state+224, hash7, 32 ); +} + +int scanhash_x15_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; + const uint32_t Htarg = ptarget[7]; + int thr_id = mythr->id; + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); 
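+      // Eight consecutive nonces are byte-swapped and blended into the nonce
+      // word of the 8x64 interleaved block headers before each hash pass.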
+ + x15_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( ( hash+(i<<3) )[7] < Htarg ) + if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash, mythr, i ); + } + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(X15_4WAY) + typedef struct { blake512_4way_context blake; bmw512_4way_context bmw; @@ -64,11 +365,11 @@ void init_x15_4way_ctx() void x15_4way_hash( void *state, const void *input ) { + uint64_t vhash[8*4] __attribute__ ((aligned (128))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); x15_4way_ctx_holder ctx; memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) ); @@ -187,7 +488,7 @@ void x15_4way_hash( void *state, const void *input ) // 14 Shabal, parallel 32 bit intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -216,48 +517,37 @@ void x15_4way_hash( void *state, const void *input ) int scanhash_x15_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t hash[4*8] __attribute__ ((aligned (128))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - __m256i *noncev = (__m256i*)vdata + 9; // aligned + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 4; + __m256i *noncev = (__m256i*)vdata + 9; const uint32_t Htarg = ptarget[7]; - int thr_id = mythr->id; // thr_id arg is deprecated - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; - - + int thr_id = mythr->id; mm256_bswap32_intrlv80_4x64( vdata, pdata ); - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do + do + { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x15_4way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 4; i++ ) + if ( ( hash+(i<<3) )[7] < Htarg ) + if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + pdata[19] = n+i; + submit_lane_solution( work, hash, mythr, i ); + } + n += 4; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); - x15_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( (hash+(i<<3))[7] & mask ) == 0 ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash, mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x15/x15-gate.c b/algo/x15/x15-gate.c index da33192..c148618 100644 --- a/algo/x15/x15-gate.c +++ b/algo/x15/x15-gate.c @@ -2,7 +2,11 @@ bool 
register_x15_algo( algo_gate_t* gate ) { -#if defined (X15_4WAY) +#if defined (X15_8WAY) + init_x15_8way_ctx(); + gate->scanhash = (void*)&scanhash_x15_8way; + gate->hash = (void*)&x15_8way_hash; +#elif defined (X15_4WAY) init_x15_4way_ctx(); gate->scanhash = (void*)&scanhash_x15_4way; gate->hash = (void*)&x15_4way_hash; @@ -11,7 +15,7 @@ bool register_x15_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x15; gate->hash = (void*)&x15hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x15/x15-gate.h b/algo/x15/x15-gate.h index 8224fe2..44568c2 100644 --- a/algo/x15/x15-gate.h +++ b/algo/x15/x15-gate.h @@ -4,20 +4,30 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define X15_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X15_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X15_4WAY 1 #endif + bool register_x15_algo( algo_gate_t* gate ); -#if defined(X15_4WAY) +#if defined(X15_8WAY) + +void x15_8way_hash( void *state, const void *input ); +int scanhash_x15_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_x15_8way_ctx(); + +#elif defined(X15_4WAY) void x15_4way_hash( void *state, const void *input ); int scanhash_x15_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void init_x15_4way_ctx(); -#endif +#else void x15hash( void *state, const void *input ); int scanhash_x15( struct work *work, uint32_t max_nonce, @@ -26,3 +36,5 @@ void init_x15_ctx(); #endif +#endif + diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 6fbd93f..d724c78 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -5,9 +5,6 @@ * Optimized by JayDDee@github Jan 2018 */ #include "x16r-gate.h" - -#if defined (X16R_4WAY) - #include #include #include @@ -20,6 +17,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -32,6 +30,392 @@ static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; +#if defined (X16R_8WAY) + +union _x16r_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; +} __attribute__ ((aligned (64))); + +typedef union _x16r_8way_context_overlay x16r_8way_context_overlay; + +void x16r_8way_hash( void* output, const void* input ) +{ + uint32_t vhash[24*8] __attribute__ ((aligned (128))); + uint32_t hash0[24] __attribute__ ((aligned (64))); + uint32_t hash1[24] __attribute__ ((aligned (64))); + uint32_t hash2[24] __attribute__ ((aligned (64))); + uint32_t hash3[24] __attribute__ ((aligned (64))); + uint32_t hash4[24] __attribute__ ((aligned (64))); + uint32_t hash5[24] __attribute__ ((aligned (64))); + uint32_t hash6[24] __attribute__ 
((aligned (64))); + uint32_t hash7[24] __attribute__ ((aligned (64))); + x16r_8way_context_overlay ctx; + void *in0 = (void*) hash0; + void *in1 = (void*) hash1; + void *in2 = (void*) hash2; + void *in3 = (void*) hash3; + void *in4 = (void*) hash4; + void *in5 = (void*) hash5; + void *in6 = (void*) hash6; + void *in7 = (void*) hash7; + int size = 80; + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + input, 640 ); + + for ( int i = 0; i < 16; i++ ) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch ( algo ) + { + case BLAKE: + blake512_8way_init( &ctx.blake ); + if ( i == 0 ) + blake512_8way_update( &ctx.blake, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + blake512_8way_update( &ctx.blake, vhash, size ); + } + blake512_8way_close( &ctx.blake, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case BMW: + bmw512_8way_init( &ctx.bmw ); + if ( i == 0 ) + bmw512_8way_update( &ctx.bmw, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + bmw512_8way_update( &ctx.bmw, vhash, size ); + } + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case GROESTL: + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (const char*)in0, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (const char*)in1, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (const char*)in2, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (const char*)in3, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (const char*)in4, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (const char*)in5, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (const char*)in6, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (const char*)in7, size<<3 ); + break; + case SKEIN: + skein512_8way_init( &ctx.skein ); + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case JH: + jh512_8way_init( &ctx.jh ); + if ( i == 0 ) + jh512_8way_update( &ctx.jh, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + jh512_8way_update( &ctx.jh, vhash, size ); + } + jh512_8way_close( &ctx.jh, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case KECCAK: + keccak512_8way_init( &ctx.keccak ); + if ( i == 0 ) + keccak512_8way_update( &ctx.keccak, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + keccak512_8way_update( &ctx.keccak, vhash, size ); + } + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, 
vhash ); + break; + case LUFFA: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, size); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case CUBEHASH: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case SHAVITE: + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in0, size ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in1, size ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in2, size ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in3, size ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in4, size ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in5, size ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in6, size ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in7, size ); + sph_shavite512_close( &ctx.shavite, hash7 ); + break; + case SIMD: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case ECHO: + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash4, + (const BitSequence*)in4, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash5, + (const BitSequence*)in5, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash6, + (const BitSequence*)in6, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash7, + (const BitSequence*)in7, size<<3 ); + break; + case HAMSI: + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + + 
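+         // Hamsi is hashed 8-way over 64-bit lanes, so the lane buffers are
+         // re-interleaved to 8x64 form for the parallel implementation.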
hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case FUGUE: + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in0, size ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in1, size ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in2, size ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in3, size ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in4, size ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in5, size ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in6, size ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in7, size ); + sph_fugue512_close( &ctx.fugue, hash7 ); + break; + case SHABAL: + intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case WHIRLPOOL: + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + break; + case SHA_512: + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, size ); + sha512_8way_close( &ctx.sha512, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + } + size = 64; + } + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); + memcpy( output+64, hash2, 32 ); + memcpy( output+96, hash3, 32 ); + memcpy( output+128, hash4, 32 ); + memcpy( output+160, hash5, 32 ); + memcpy( output+192, hash6, 32 ); + memcpy( output+224, hash7, 32 ); +} + +int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t bedata1[2] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + 
uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + + if ( opt_benchmark ) + ptarget[7] = 0x0cff; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + bedata1[0] = bswap_32( pdata[1] ); + bedata1[1] = bswap_32( pdata[2] ); + const uint32_t ntime = bswap_32( pdata[17] ); + if ( s_ntime != ntime ) + { + x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder ); + s_ntime = ntime; + if ( opt_debug && !thr_id ) + applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); + } + + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x16r_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) + if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 8; + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); + + *hashes_done = n - first_nonce; + return 0; +} + + +#elif defined (X16R_4WAY) + union _x16r_4way_context_overlay { blake512_4way_context blake; @@ -50,16 +434,16 @@ union _x16r_4way_context_overlay shabal512_4way_context shabal; sph_whirlpool_context whirlpool; sha512_4way_context sha512; -}; +} __attribute__ ((aligned (64))); typedef union _x16r_4way_context_overlay x16r_4way_context_overlay; void x16r_4way_hash( void* output, const void* input ) { + uint32_t vhash[24*4] __attribute__ ((aligned (128))); uint32_t hash0[24] __attribute__ ((aligned (64))); uint32_t hash1[24] __attribute__ ((aligned (64))); uint32_t hash2[24] __attribute__ ((aligned (64))); uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t vhash[24*4] __attribute__ ((aligned (64))); x16r_4way_context_overlay ctx; void *in0 = (void*) hash0; void *in1 = (void*) hash1; @@ -86,7 +470,7 @@ void x16r_4way_hash( void* output, const void* input ) blake512_4way( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case BMW: bmw512_4way_init( &ctx.bmw ); @@ -98,7 +482,7 @@ void x16r_4way_hash( void* output, const void* input ) bmw512_4way( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case GROESTL: init_groestl( &ctx.groestl, 64 ); @@ -124,7 +508,7 @@ void x16r_4way_hash( void* output, const void* input ) skein512_4way( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case JH: jh512_4way_init( &ctx.jh ); @@ -136,7 +520,7 @@ void x16r_4way_hash( void* output, const void* input ) jh512_4way( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case KECCAK: keccak512_4way_init( &ctx.keccak ); @@ -148,17 +532,17 @@ void x16r_4way_hash( void* output, const void* input ) keccak512_4way( &ctx.keccak, vhash, size ); } 
keccak512_4way_close( &ctx.keccak, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case LUFFA: intrlv_2x128( vhash, in0, in1, size<<3 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); + dintrlv_2x128_512( hash0, hash1, vhash ); intrlv_2x128( vhash, in2, in3, size<<3 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_2x128( hash2, hash3, vhash, 512 ); + dintrlv_2x128_512( hash2, hash3, vhash ); break; case CUBEHASH: cubehashInit( &ctx.cube, 512, 16, 32 ); @@ -192,11 +576,11 @@ void x16r_4way_hash( void* output, const void* input ) intrlv_2x128( vhash, in0, in1, size<<3 ); simd_2way_init( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); + dintrlv_2x128_512( hash0, hash1, vhash ); intrlv_2x128( vhash, in2, in3, size<<3 ); simd_2way_init( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); + dintrlv_2x128_512( hash2, hash3, vhash ); break; case ECHO: init_echo( &ctx.echo, 512 ); @@ -217,7 +601,7 @@ void x16r_4way_hash( void* output, const void* input ) hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -238,7 +622,7 @@ void x16r_4way_hash( void* output, const void* input ) shabal512_4way_init( &ctx.shabal ); shabal512_4way( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); break; case WHIRLPOOL: sph_whirlpool_init( &ctx.whirlpool ); @@ -259,7 +643,7 @@ void x16r_4way_hash( void* output, const void* input ) sha512_4way_init( &ctx.sha512 ); sha512_4way( &ctx.sha512, vhash, size ); sha512_4way_close( &ctx.sha512, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; } size = 64; @@ -280,6 +664,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, uint32_t *ptarget = work->target; const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; __m256i *noncev = (__m256i*)vdata + 9; // aligned int thr_id = mythr->id; @@ -317,9 +702,9 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, submit_lane_solution( work, hash+(i<<3), mythr, i ); } n += 4; - } while ( likely( ( n < max_nonce ) && !(*restart) ) ); + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - *hashes_done = n - first_nonce + 1; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 1480813..7d76c0d 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -34,14 +34,17 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output ) bool register_x16r_algo( algo_gate_t* gate ) { -#if defined (X16R_4WAY) +#if defined (X16R_8WAY) + gate->scanhash = (void*)&scanhash_x16r_8way; + gate->hash = (void*)&x16r_8way_hash; +#elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16r_4way; gate->hash = (void*)&x16r_4way_hash; #else gate->scanhash = 
(void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -49,14 +52,17 @@ bool register_x16r_algo( algo_gate_t* gate ) bool register_x16rv2_algo( algo_gate_t* gate ) { -#if defined (X16R_4WAY) +#if defined (X16R_8WAY) + gate->scanhash = (void*)&scanhash_x16rv2_8way; + gate->hash = (void*)&x16rv2_8way_hash; +#elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16rv2_4way; gate->hash = (void*)&x16rv2_4way_hash; #else gate->scanhash = (void*)&scanhash_x16rv2; gate->hash = (void*)&x16rv2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -64,14 +70,17 @@ bool register_x16rv2_algo( algo_gate_t* gate ) bool register_x16s_algo( algo_gate_t* gate ) { -#if defined (X16R_4WAY) +#if defined (X16R_8WAY) + gate->scanhash = (void*)&scanhash_x16r_8way; + gate->hash = (void*)&x16r_8way_hash; +#elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16r_4way; gate->hash = (void*)&x16r_4way_hash; #else gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; @@ -196,28 +205,34 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) bool register_x16rt_algo( algo_gate_t* gate ) { -#if defined (X16R_4WAY) +#if defined (X16R_8WAY) + gate->scanhash = (void*)&scanhash_x16rt_8way; + gate->hash = (void*)&x16rt_8way_hash; +#elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16rt_4way; gate->hash = (void*)&x16rt_4way_hash; #else gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16rt_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; opt_target_factor = 256.0; return true; }; bool register_x16rt_veil_algo( algo_gate_t* gate ) { -#if defined (X16R_4WAY) +#if defined (X16R_8WAY) + gate->scanhash = (void*)&scanhash_x16rt_8way; + gate->hash = (void*)&x16rt_8way_hash; +#elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x16rt_4way; gate->hash = (void*)&x16rt_4way_hash; #else gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16rt_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; gate->build_extraheader = (void*)&veil_build_extraheader; opt_target_factor = 256.0; return true; @@ -231,7 +246,7 @@ bool register_hex_algo( algo_gate_t* gate ) { gate->scanhash = (void*)&scanhash_hex; gate->hash = (void*)&hex_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; opt_target_factor = 128.0; return true; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index dd4c216..ff6d44d 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -6,8 +6,10 @@ #include #include -#if defined(__AVX2__) && defined(__AES__) - #define X16R_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && 
defined(__AVX512BW__) + #define X16R_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X16R_4WAY 1 #endif enum x16r_Algo { @@ -44,7 +46,20 @@ bool register_x16rt_algo( algo_gate_t* gate ); bool register_hex__algo( algo_gate_t* gate ); bool register_x21s__algo( algo_gate_t* gate ); -#if defined(X16R_4WAY) +#if defined(X16R_8WAY) + +void x16r_8way_hash( void *state, const void *input ); +int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +void x16rv2_8way_hash( void *state, const void *input ); +int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void x16rt_8way_hash( void *state, const void *input ); +int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(X16R_4WAY) void x16r_4way_hash( void *state, const void *input ); int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, @@ -58,12 +73,7 @@ void x16rt_4way_hash( void *state, const void *input ); int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void x21s_4way_hash( void *state, const void *input ); -int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool x21s_4way_thread_init(); - -#endif +#else void x16r_hash( void *state, const void *input ); int scanhash_x16r( struct work *work, uint32_t max_nonce, @@ -77,9 +87,16 @@ void x16rt_hash( void *state, const void *input ); int scanhash_x16rt( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void hex_hash( void *state, const void *input ); -int scanhash_hex( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); +#endif + +#if defined(X16R_4WAY) + +void x21s_4way_hash( void *state, const void *input ); +int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +bool x21s_4way_thread_init(); + +#else void x21s_hash( void *state, const void *input ); int scanhash_x21s( struct work *work, uint32_t max_nonce, @@ -88,3 +105,9 @@ bool x21s_thread_init(); #endif +void hex_hash( void *state, const void *input ); +int scanhash_hex( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c index 4e28d40..8118bc8 100644 --- a/algo/x16/x16rt-4way.c +++ b/algo/x16/x16rt-4way.c @@ -1,7 +1,4 @@ #include "x16r-gate.h" - -#if defined (X16R_4WAY) - #include #include #include @@ -15,6 +12,7 @@ #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -26,6 +24,391 @@ static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; +#if defined (X16R_8WAY) + +union _x16rt_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + 
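+   // All contexts overlay the same storage in this union; only one member is
+   // live at a time as the hash order is processed, which keeps the
+   // per-call footprint small.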
shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; +} __attribute__ ((aligned (64))); + +typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay; + +void x16rt_8way_hash( void* output, const void* input ) +{ + uint32_t vhash[24*8] __attribute__ ((aligned (128))); + uint32_t hash0[24] __attribute__ ((aligned (64))); + uint32_t hash1[24] __attribute__ ((aligned (64))); + uint32_t hash2[24] __attribute__ ((aligned (64))); + uint32_t hash3[24] __attribute__ ((aligned (64))); + uint32_t hash4[24] __attribute__ ((aligned (64))); + uint32_t hash5[24] __attribute__ ((aligned (64))); + uint32_t hash6[24] __attribute__ ((aligned (64))); + uint32_t hash7[24] __attribute__ ((aligned (64))); + x16rt_8way_context_overlay ctx; + void *in0 = (void*) hash0; + void *in1 = (void*) hash1; + void *in2 = (void*) hash2; + void *in3 = (void*) hash3; + void *in4 = (void*) hash4; + void *in5 = (void*) hash5; + void *in6 = (void*) hash6; + void *in7 = (void*) hash7; + int size = 80; + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + input, 640 ); + + for ( int i = 0; i < 16; i++ ) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch ( algo ) + { + case BLAKE: + blake512_8way_init( &ctx.blake ); + if ( i == 0 ) + blake512_8way_update( &ctx.blake, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + blake512_8way_update( &ctx.blake, vhash, size ); + } + blake512_8way_close( &ctx.blake, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case BMW: + bmw512_8way_init( &ctx.bmw ); + if ( i == 0 ) + bmw512_8way_update( &ctx.bmw, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + bmw512_8way_update( &ctx.bmw, vhash, size ); + } + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case GROESTL: + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (const char*)in0, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (const char*)in1, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (const char*)in2, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (const char*)in3, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (const char*)in4, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (const char*)in5, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (const char*)in6, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (const char*)in7, size<<3 ); + break; + case SKEIN: + skein512_8way_init( &ctx.skein ); + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case JH: + jh512_8way_init( &ctx.jh ); + if ( i == 0 ) + jh512_8way_update( &ctx.jh, 
input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + jh512_8way_update( &ctx.jh, vhash, size ); + } + jh512_8way_close( &ctx.jh, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case KECCAK: + keccak512_8way_init( &ctx.keccak ); + if ( i == 0 ) + keccak512_8way_update( &ctx.keccak, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + keccak512_8way_update( &ctx.keccak, vhash, size ); + } + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case LUFFA: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, size); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case CUBEHASH: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case SHAVITE: + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in0, size ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in1, size ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in2, size ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in3, size ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in4, size ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in5, size ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in6, size ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in7, size ); + sph_shavite512_close( &ctx.shavite, hash7 ); + break; + case SIMD: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case ECHO: + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, 
(BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash4, + (const BitSequence*)in4, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash5, + (const BitSequence*)in5, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash6, + (const BitSequence*)in6, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash7, + (const BitSequence*)in7, size<<3 ); + break; + case HAMSI: + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case FUGUE: + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in0, size ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in1, size ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in2, size ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in3, size ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in4, size ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in5, size ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in6, size ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in7, size ); + sph_fugue512_close( &ctx.fugue, hash7 ); + break; + case SHABAL: + intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case WHIRLPOOL: + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + break; + case SHA_512: + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, size ); + sha512_8way_close( &ctx.sha512, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, 
hash5, hash6, + hash7, vhash ); + break; + } + size = 64; + } + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); + memcpy( output+64, hash2, 32 ); + memcpy( output+96, hash3, 32 ); + memcpy( output+128, hash4, 32 ); + memcpy( output+160, hash5, 32 ); + memcpy( output+192, hash6, 32 ); + memcpy( output+224, hash7, 32 ); +} + +int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t _ALIGN(64) timeHash[8*8]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + + if ( opt_benchmark ) + ptarget[7] = 0x0cff; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + uint32_t ntime = bswap_32( pdata[17] ); + if ( s_ntime != ntime ) + { + x16rt_getTimeHash( ntime, &timeHash ); + x16rt_getAlgoString( &timeHash[0], hashOrder ); + s_ntime = ntime; + if ( opt_debug && !thr_id ) + applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", + hashOrder, ntime, timeHash ); + } + + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x16rt_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) + if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 8; + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined (X16R_4WAY) + union _x16rt_4way_context_overlay { blake512_4way_context blake; diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index 6cbb0f2..7dd6306 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -5,9 +5,6 @@ * Optimized by JayDDee@github Jan 2018 */ #include "x16r-gate.h" - -#if defined (X16R_4WAY) - #include #include #include @@ -21,6 +18,7 @@ #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -33,6 +31,477 @@ static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; +#if defined (X16R_8WAY) + +union _x16rv2_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + sph_tiger_context tiger; +} __attribute__ ((aligned (64))); + +typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay; + +void x16rv2_8way_hash( void* output, const void* input ) +{ + uint32_t vhash[24*8] __attribute__ ((aligned (128))); 
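+   // Working buffers: vhash holds the interleaved lanes consumed by the
+   // parallel (*_8way / 4-way) algorithms, while hash0..hash7 keep each lane
+   // in linear form for the serial sph_* functions and for the Tiger
+   // pre-hash that x16rv2 applies before Keccak, Luffa and SHA-512.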
+ uint32_t hash0[24] __attribute__ ((aligned (64))); + uint32_t hash1[24] __attribute__ ((aligned (64))); + uint32_t hash2[24] __attribute__ ((aligned (64))); + uint32_t hash3[24] __attribute__ ((aligned (64))); + uint32_t hash4[24] __attribute__ ((aligned (64))); + uint32_t hash5[24] __attribute__ ((aligned (64))); + uint32_t hash6[24] __attribute__ ((aligned (64))); + uint32_t hash7[24] __attribute__ ((aligned (64))); + x16rv2_8way_context_overlay ctx; + void *in0 = (void*) hash0; + void *in1 = (void*) hash1; + void *in2 = (void*) hash2; + void *in3 = (void*) hash3; + void *in4 = (void*) hash4; + void *in5 = (void*) hash5; + void *in6 = (void*) hash6; + void *in7 = (void*) hash7; + int size = 80; + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + input, 640 ); + + for ( int i = 0; i < 16; i++ ) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch ( algo ) + { + case BLAKE: + blake512_8way_init( &ctx.blake ); + if ( i == 0 ) + blake512_8way_update( &ctx.blake, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + blake512_8way_update( &ctx.blake, vhash, size ); + } + blake512_8way_close( &ctx.blake, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case BMW: + bmw512_8way_init( &ctx.bmw ); + if ( i == 0 ) + bmw512_8way_update( &ctx.bmw, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + bmw512_8way_update( &ctx.bmw, vhash, size ); + } + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case GROESTL: + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (const char*)in0, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (const char*)in1, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (const char*)in2, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (const char*)in3, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (const char*)in4, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (const char*)in5, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (const char*)in6, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (const char*)in7, size<<3 ); + break; + case SKEIN: + skein512_8way_init( &ctx.skein ); + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case JH: + jh512_8way_init( &ctx.jh ); + if ( i == 0 ) + jh512_8way_update( &ctx.jh, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + jh512_8way_update( &ctx.jh, vhash, size ); + } + jh512_8way_close( &ctx.jh, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case KECCAK: + sph_tiger_init( 
&ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in4, size ); + sph_tiger_close( &ctx.tiger, hash4 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in5, size ); + sph_tiger_close( &ctx.tiger, hash5 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in6, size ); + sph_tiger_close( &ctx.tiger, hash6 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in7, size ); + sph_tiger_close( &ctx.tiger, hash7 ); + + for ( int i = (24/4); i < (64/4); i++ ) + hash0[i] = hash1[i] = hash2[i] = hash3[i] = + hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0; + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7 ); + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case LUFFA: + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in4, size ); + sph_tiger_close( &ctx.tiger, hash4 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in5, size ); + sph_tiger_close( &ctx.tiger, hash5 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in6, size ); + sph_tiger_close( &ctx.tiger, hash6 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in7, size ); + sph_tiger_close( &ctx.tiger, hash7 ); + + for ( int i = (24/4); i < (64/4); i++ ) + hash0[i] = hash1[i] = hash2[i] = hash3[i] = + hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0; + + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case CUBEHASH: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case SHAVITE: + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in0, size ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in1, size ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); 
+ sph_shavite512( &ctx.shavite, in2, size ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in3, size ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in4, size ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in5, size ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in6, size ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in7, size ); + sph_shavite512_close( &ctx.shavite, hash7 ); + break; + case SIMD: + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + break; + case ECHO: + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash4, + (const BitSequence*)in4, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash5, + (const BitSequence*)in5, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash6, + (const BitSequence*)in6, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash7, + (const BitSequence*)in7, size<<3 ); + break; + case HAMSI: + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case FUGUE: + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in0, size ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in1, size ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in2, size ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in3, size ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in4, size ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in5, size ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in6, size ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in7, size ); + sph_fugue512_close( &ctx.fugue, hash7 ); + break; + case SHABAL: + intrlv_8x32( vhash, in0, in1, 
in2, in3, in4, in5, in6, in7, + size<<3 ); + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + case WHIRLPOOL: + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + break; + case SHA_512: + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in4, size ); + sph_tiger_close( &ctx.tiger, hash4 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in5, size ); + sph_tiger_close( &ctx.tiger, hash5 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in6, size ); + sph_tiger_close( &ctx.tiger, hash6 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in7, size ); + sph_tiger_close( &ctx.tiger, hash7 ); + + for ( int i = (24/4); i < (64/4); i++ ) + hash0[i] = hash1[i] = hash2[i] = hash3[i] = + hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0; + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7 ); + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; + } + size = 64; + } + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); + memcpy( output+64, hash2, 32 ); + memcpy( output+96, hash3, 32 ); + memcpy( output+128, hash4, 32 ); + memcpy( output+160, hash5, 32 ); + memcpy( output+192, hash6, 32 ); + memcpy( output+224, hash7, 32 ); +} + +int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t bedata1[2] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + __m512i *noncev = 
(__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + + if ( opt_benchmark ) + ptarget[7] = 0x0cff; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + bedata1[0] = bswap_32( pdata[1] ); + bedata1[1] = bswap_32( pdata[2] ); + const uint32_t ntime = bswap_32( pdata[17] ); + if ( s_ntime != ntime ) + { + x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder ); + s_ntime = ntime; + if ( opt_debug && !thr_id ) + applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); + } + + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x16rv2_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) + if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 8; + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); + + *hashes_done = n - first_nonce; + return 0; +} + + +#elif defined (X16R_4WAY) + + + union _x16rv2_4way_context_overlay { blake512_4way_context blake; diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index 4406529..3a0b248 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -1,7 +1,4 @@ #include "sonoa-gate.h" - -#if defined(SONOA_4WAY) - #include #include #include @@ -25,6 +22,1338 @@ #include "algo/haval/haval-hash-4way.h" #include "algo/sha/sha-hash-4way.h" +#if defined(SONOA_8WAY) + +union _sonoa_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + haval256_5_8way_context haval; +} __attribute__ ((aligned (64))); + +typedef union _sonoa_8way_context_overlay sonoa_8way_context_overlay; + +void sonoa_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhashA[8*8] __attribute__ ((aligned (64))); + uint64_t vhashB[8*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + sonoa_8way_context_overlay ctx; + +// 1 + + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( 
&ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + 
update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + +// 2 + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + 
sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + +// 3 + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + 
keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue 
); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + +// 4 + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + 
sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + rintrlv_8x32_8x64( vhashA, vhash, 512 ); + + hamsi512_8way_init( &ctx.hamsi 
); + hamsi512_8way_update( &ctx.hamsi, vhashA, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + +// 5 + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + rintrlv_8x64_8x32( vhashA, vhash, 512 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhashA, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + 
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence 
*)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + +// 6 + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, 
(char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + 
update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, 
hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + +// 7 + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + 
sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + 
vhash ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + + rintrlv_8x64_8x32( vhashA, vhash, 512 ); + + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhashA, 64 ); + haval256_5_8way_close( &ctx.haval, state ); +} + +int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + sonoa_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if unlikely( ( hash7[ lane ] <= Htarg ) ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + + *hashes_done = n - first_nonce; + return 0; +} + + + +#elif defined(SONOA_4WAY) + union _sonoa_4way_context_overlay { blake512_4way_context blake; diff --git a/algo/x17/sonoa-gate.c b/algo/x17/sonoa-gate.c index fea4d39..3687733 100644 --- a/algo/x17/sonoa-gate.c +++ b/algo/x17/sonoa-gate.c @@ -2,8 +2,10 @@ bool register_sonoa_algo( algo_gate_t* gate ) { -#if defined (SONOA_4WAY) -// init_sonoa_4way_ctx(); +#if defined (SONOA_8WAY) + gate->scanhash = (void*)&scanhash_sonoa_8way; + gate->hash = (void*)&sonoa_8way_hash; +#elif defined (SONOA_4WAY) gate->scanhash = (void*)&scanhash_sonoa_4way; gate->hash = (void*)&sonoa_4way_hash; #else @@ -11,7 +13,7 @@ bool register_sonoa_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_sonoa; gate->hash = (void*)&sonoa_hash; #endif - 
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x17/sonoa-gate.h b/algo/x17/sonoa-gate.h index c97a375..aaad2a4 100644 --- a/algo/x17/sonoa-gate.h +++ b/algo/x17/sonoa-gate.h @@ -4,29 +4,33 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define SONOA_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define SONOA_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define SONOA_4WAY 1 #endif bool register_sonoa_algo( algo_gate_t* gate ); -#if defined(SONOA_4WAY) +#if defined(SONOA_8WAY) + +void sonoa_8way_hash( void *state, const void *input ); +int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#elif defined(SONOA_4WAY) void sonoa_4way_hash( void *state, const void *input ); - int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -//void init_sonoa_4way_ctx(); - -#endif +#else void sonoa_hash( void *state, const void *input ); - int scanhash_sonoa( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_sonoa_ctx(); #endif +#endif diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index f913644..18eed41 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -1,7 +1,4 @@ #include "x17-gate.h" - -#if defined(X17_4WAY) - #include #include #include @@ -14,6 +11,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cube-hash-2way.h" +#include "algo/shavite/sph_shavite.h" #include "algo/shavite/shavite-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -24,6 +22,309 @@ #include "algo/haval/haval-hash-4way.h" #include "algo/sha/sha-hash-4way.h" +#if defined(X17_8WAY) + +union _x17_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + haval256_5_8way_context haval; +} __attribute__ ((aligned (64))); +typedef union _x17_8way_context_overlay x17_8way_context_overlay; + +void x17_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhash0[8*8] __attribute__ ((aligned (64))); + uint64_t vhash1[8*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + x17_8way_context_overlay ctx; + + // 1 Blake parallel 4 way 64 bit + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + + // 2 Bmw + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + 
bmw512_8way_close( &ctx.bmw, vhash ); + + // Serialize + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 3 Groestl + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + // Parallellize + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + // 4 Skein parallel 4 way 64 bit + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + // 5 JH + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + // 6 Keccak + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + + // 7 Luffa + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + + // 8 Cubehash + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + + // 9 Shavite + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + // 10 Simd + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, 
hash6, hash7 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + + + // 11 Echo serial + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + // 12 Hamsi parallel 4 way 64 bit + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 13 Fugue serial + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + // 14 Shabal, parallel 4 way 32 bit + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + + // 15 Whirlpool serial + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + 
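+   // Whirlpool, like the Groestl, Shavite, Echo and Fugue steps above, is
+   // hashed one lane at a time with the same init/update/close sequence.
+   // A small helper could factor that repetition out; the one below is only
+   // a sketch (whirlpool512_full is not an existing API in this patch):
+   //
+   //   static inline void whirlpool512_full( sph_whirlpool_context *c,
+   //                                         void *hash, const void *data,
+   //                                         size_t len )
+   //   {
+   //      sph_whirlpool_init( c );
+   //      sph_whirlpool( c, data, len );
+   //      sph_whirlpool_close( c, hash );
+   //   }
+   //
+   //   e.g. whirlpool512_full( &ctx.whirlpool, hash5, hash5, 64 );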
sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + // 16 SHA512 parallel 64 bit + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + + // 17 Haval parallel 32 bit + rintrlv_8x64_8x32( vhash0, vhash, 512 ); + + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhash0, 64 ); + haval256_5_8way_close( &ctx.haval, state ); +} + +int scanhash_x17_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + x17_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if unlikely( ( hash7[ lane ] <= Htarg ) ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(X17_4WAY) + union _x17_4way_context_overlay { blake512_4way_context blake; diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c index 69d28f6..73ce607 100644 --- a/algo/x17/x17-gate.c +++ b/algo/x17/x17-gate.c @@ -2,14 +2,17 @@ bool register_x17_algo( algo_gate_t* gate ) { -#if defined (X17_4WAY) +#if defined (X17_8WAY) + gate->scanhash = (void*)&scanhash_x17_8way; + gate->hash = (void*)&x17_8way_hash; +#elif defined (X17_4WAY) gate->scanhash = (void*)&scanhash_x17_4way; gate->hash = (void*)&x17_4way_hash; #else gate->scanhash = (void*)&scanhash_x17; gate->hash = (void*)&x17_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; return true; }; diff --git a/algo/x17/x17-gate.h b/algo/x17/x17-gate.h index 9a40b34..014caef 100644 --- a/algo/x17/x17-gate.h +++ b/algo/x17/x17-gate.h @@ -4,13 +4,20 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define X17_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X17_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X17_4WAY 1 #endif bool register_x17_algo( algo_gate_t* gate ); -#if defined(X17_4WAY) +#if defined(X17_8WAY) + +void x17_8way_hash( void *state, const void *input ); +int scanhash_x17_8way( struct work 
*work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#elif defined(X17_4WAY) void x17_4way_hash( void *state, const void *input ); int scanhash_x17_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 91a2a9f..28bc1c2 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -1,7 +1,4 @@ #include "xevan-gate.h" - -#if defined(XEVAN_4WAY) - #include #include #include @@ -15,6 +12,7 @@ #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/shavite-hash-2way.h" +#include "algo/shavite/sph_shavite.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" @@ -25,6 +23,515 @@ #include "algo/sha/sha-hash-4way.h" #include "algo/haval/haval-hash-4way.h" +#if defined(XEVAN_8WAY) + +union _xevan_8way_context_overlay +{ + blake512_8way_context blake; + bmw512_8way_context bmw; + hashState_groestl groestl; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + sph_shavite512_context shavite; + simd_4way_context simd; + hashState_echo echo; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + haval256_5_8way_context haval; +} __attribute__ ((aligned (64))); +typedef union _xevan_8way_context_overlay xevan_8way_context_overlay; + +void xevan_8way_hash( void *output, const void *input ) +{ + uint64_t vhash[16<<3] __attribute__ ((aligned (128))); + uint64_t vhashA[16<<3] __attribute__ ((aligned (64))); + uint64_t vhashB[16<<3] __attribute__ ((aligned (64))); + uint64_t hash0[16] __attribute__ ((aligned (64))); + uint64_t hash1[16] __attribute__ ((aligned (64))); + uint64_t hash2[16] __attribute__ ((aligned (64))); + uint64_t hash3[16] __attribute__ ((aligned (64))); + uint64_t hash4[16] __attribute__ ((aligned (64))); + uint64_t hash5[16] __attribute__ ((aligned (64))); + uint64_t hash6[16] __attribute__ ((aligned (64))); + uint64_t hash7[16] __attribute__ ((aligned (64))); + const int dataLen = 128; + xevan_8way_context_overlay ctx __attribute__ ((aligned (64))); + + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + memset( &vhash[8<<3], 0, 64<<3 ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, dataLen ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, + dataLen<<3 ); + init_groestl( 
&ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, + dataLen<<3 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, dataLen ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, dataLen ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, dataLen ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, dataLen ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, dataLen ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, dataLen ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, dataLen ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, dataLen ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, dataLen ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, dataLen ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, dataLen ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); + intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) 
hash5, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, dataLen<<3 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, dataLen ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, dataLen ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, dataLen ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, dataLen ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, dataLen ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, dataLen ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, dataLen ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, dataLen ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, dataLen ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash2, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, dataLen ); + sha512_8way_close( &ctx.sha512, vhash ); + + rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 ); + + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhashA, dataLen ); + haval256_5_8way_close( &ctx.haval, vhashA ); + + rintrlv_8x32_8x64( vhash, vhashA, dataLen<<3 ); + + memset( &vhash[ 4<<3 
], 0, (dataLen-32) << 3 ); + + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, vhash, dataLen ); + blake512_8way_close(&ctx.blake, vhash); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, dataLen ); + bmw512_8way_close( &ctx.bmw, vhash ); + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, + dataLen<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, + dataLen<<3 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, dataLen ); + skein512_8way_close( &ctx.skein, vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, dataLen ); + jh512_8way_close( &ctx.jh, vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, dataLen ); + keccak512_8way_close( &ctx.keccak, vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); + + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); + + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash0, dataLen ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash1, dataLen ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash2, dataLen ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash3, dataLen ); + sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash4, dataLen ); + sph_shavite512_close( &ctx.shavite, hash4 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash5, dataLen ); + sph_shavite512_close( &ctx.shavite, hash5 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash6, dataLen ); + sph_shavite512_close( &ctx.shavite, hash6 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash7, dataLen ); + sph_shavite512_close( 
&ctx.shavite, hash7 ); + + intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); + intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); + + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); + dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); + + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, dataLen<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, dataLen<<3 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash0, dataLen ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash1, dataLen ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash2, dataLen ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash3, dataLen ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash4, dataLen ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash5, dataLen ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash6, dataLen ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, hash7, dataLen ); + sph_fugue512_close( &ctx.fugue, hash7 ); + + intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, dataLen ); + shabal512_8way_close( &ctx.shabal, vhash ); + + dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash, dataLen<<3 ); + + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash0, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash1, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, 
hash2, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash4, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash5, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash6, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash7, dataLen ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, dataLen<<3 ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, dataLen ); + sha512_8way_close( &ctx.sha512, vhash ); + + rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 ); + + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhashA, dataLen ); + haval256_5_8way_close( &ctx.haval, output ); +} + +int scanhash_xevan_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + xevan_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) + if unlikely( ( hash7[ lane ] <= Htarg ) ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(XEVAN_4WAY) + union _xevan_4way_context_overlay { blake512_4way_context blake; diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c index 96b811c..8cb86a4 100644 --- a/algo/x17/xevan-gate.c +++ b/algo/x17/xevan-gate.c @@ -2,8 +2,10 @@ bool register_xevan_algo( algo_gate_t* gate ) { -#if defined (XEVAN_4WAY) -// init_xevan_4way_ctx(); +#if defined (XEVAN_8WAY) + gate->scanhash = (void*)&scanhash_xevan_8way; + gate->hash = (void*)&xevan_8way_hash; +#elif defined (XEVAN_4WAY) gate->scanhash = (void*)&scanhash_xevan_4way; gate->hash = (void*)&xevan_4way_hash; #else diff --git a/algo/x17/xevan-gate.h b/algo/x17/xevan-gate.h index c614c0b..be0dfbc 100644 --- a/algo/x17/xevan-gate.h +++ b/algo/x17/xevan-gate.h @@ -4,13 +4,21 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) - #define XEVAN_4WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define XEVAN_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define XEVAN_4WAY 1 #endif 
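+// Each gate header selects at most one vector width at compile time: 8-way
+// when full AVX-512 (F/VL/DQ/BW) is available, otherwise 4-way with AVX2+AES,
+// otherwise the scalar reference code. The matching register function then
+// wires up the widest variant that was built, roughly as follows (a sketch
+// using a hypothetical algo "foo"):
+//
+//   #if defined(FOO_8WAY)
+//      gate->hash = (void*)&foo_8way_hash;
+//   #elif defined(FOO_4WAY)
+//      gate->hash = (void*)&foo_4way_hash;
+//   #else
+//      gate->hash = (void*)&foo_hash;
+//   #endif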
bool register_xevan_algo( algo_gate_t* gate ); -#if defined(XEVAN_4WAY) +#if defined(XEVAN_8WAY) + +void xevan_8way_hash( void *state, const void *input ); + +int scanhash_xevan_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#elif defined(XEVAN_4WAY) void xevan_4way_hash( void *state, const void *input ); @@ -19,7 +27,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce, //void init_xevan_4way_ctx(); -#endif +#else void xevan_hash( void *state, const void *input ); @@ -30,3 +38,4 @@ void init_xevan_ctx(); #endif +#endif diff --git a/build-allarch.sh b/build-allarch.sh index e6ab8d5..6e8fd89 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,6 +4,8 @@ # during develpment. However the information contained may provide compilation # tips to users. +rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen + make distclean || echo clean rm -f config.status ./autogen.sh || echo done diff --git a/configure b/configure index 94be3e8..2c54f15 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.2. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.5. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.10.2' -PACKAGE_STRING='cpuminer-opt 3.10.2' +PACKAGE_VERSION='3.10.5' +PACKAGE_STRING='cpuminer-opt 3.10.5' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.10.2 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.10.5 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.10.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.10.5:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.10.2 +cpuminer-opt configure 3.10.5 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.10.2, which was +It was created by cpuminer-opt $as_me 3.10.5, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.10.2' + VERSION='3.10.5' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.10.2, which was +This file was extended by cpuminer-opt $as_me 3.10.5, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.10.2 +cpuminer-opt config.status 3.10.5 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index eae85ca..467397e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.10.2]) +AC_INIT([cpuminer-opt], [3.10.5]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 85cdb40..572cdef 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -3410,39 +3410,39 @@ bool check_cpu_capability () printf(".\n"); #endif - printf("CPU features:"); - if ( cpu_has_vaes ) printf( " VAES" ); - else if ( cpu_has_aes ) printf( " AES" ); - if ( cpu_has_sha ) printf( " SHA" ); + printf("CPU features: "); if ( cpu_has_avx512 ) printf( " AVX512" ); - else if ( cpu_has_avx2 ) printf( " AVX2" ); - else if ( cpu_has_avx ) printf( " AVX" ); + else if ( cpu_has_avx2 ) printf( " AVX2 " ); + else if ( cpu_has_avx ) printf( " AVX " ); else if ( cpu_has_sse42 ) printf( " SSE4.2" ); - else if ( cpu_has_sse2 ) printf( " SSE2" ); + else if ( cpu_has_sse2 ) printf( " SSE2 " ); + if ( cpu_has_vaes ) printf( " VAES" ); + else if ( cpu_has_aes ) printf( " AES" ); + if ( cpu_has_sha ) printf( " SHA" ); - printf(".\nSW features:"); - if ( sw_has_vaes ) printf( " VAES" ); - else if ( sw_has_aes ) printf( " AES" ); - if ( sw_has_sha ) printf( " SHA" ); + printf("\nSW features: "); if ( sw_has_avx512 ) printf( " AVX512" ); - else if ( sw_has_avx2 ) printf( " AVX2" ); - else if ( sw_has_avx ) printf( " AVX" ); + else if ( sw_has_avx2 ) printf( " AVX2 " ); + else if ( sw_has_avx ) printf( " AVX " ); else if ( sw_has_sse42 ) printf( " SSE4.2" ); - else if ( sw_has_sse2 ) printf( " SSE2" ); + else if ( sw_has_sse2 ) printf( " SSE2 " ); + if ( sw_has_vaes ) printf( " VAES" ); + else if ( sw_has_aes ) printf( " AES " ); + if ( sw_has_sha ) printf( " SHA" ); - printf(".\nAlgo features:"); + printf("\nAlgo features:"); if ( algo_features == EMPTY_SET ) printf( " None" ); else { - if ( algo_has_vaes ) printf( " VAES" ); - else if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_sha ) printf( " SHA" ); if ( algo_has_avx512 ) printf( " AVX512" ); - else if ( algo_has_avx2 ) printf( " AVX2" ); + else if ( algo_has_avx2 ) printf( " AVX2 " ); else if ( algo_has_sse42 ) printf( " SSE4.2" ); - else if ( algo_has_sse2 ) printf( " SSE2" ); + else if ( algo_has_sse2 ) printf( " SSE2 " ); + if ( algo_has_vaes ) printf( " VAES" ); + else if ( algo_has_aes ) printf( " AES " ); + if ( algo_has_sha ) printf( " SHA" ); } - printf(".\n"); + printf("\n"); // Check for CPU and build incompatibilities if ( !cpu_has_sse2 ) @@ -3483,19 +3483,19 @@ bool check_cpu_capability () use_sha || use_vaes ); // Display best options - printf( "Start mining with" ); + printf( "\nStarting miner with" ); if ( use_none ) printf( " no optimizations" ); else { - if ( use_vaes ) printf( " VAES" ); - else if ( use_aes ) printf( " AES" ); if ( use_avx512 ) printf( " AVX512" ); else if ( use_avx2 ) printf( " AVX2" ); else if ( use_sse42 ) printf( " SSE4.2" ); else if ( use_sse2 ) printf( " SSE2" ); + if ( use_vaes ) printf( " VAES" ); + else if ( use_aes ) printf( " AES" ); if ( use_sha ) printf( " SHA" ); } - printf( ".\n\n" ); + printf( "...\n\n" ); return true; } diff --git 
a/simd-utils/intrlv.h b/simd-utils/intrlv.h index db9c0e9..961c57d 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -897,7 +897,7 @@ static inline void intrlv_16x32_512( void *dst, const void *s00, *( (uint32_t*)(d06) +(i) ) = s[ 6]; \ *( (uint32_t*)(d07) +(i) ) = s[ 7]; \ *( (uint32_t*)(d08) +(i) ) = s[ 8]; \ - *( (uint32_t*)(d09) +(i) ) = s[ 0]; \ + *( (uint32_t*)(d09) +(i) ) = s[ 9]; \ *( (uint32_t*)(d10) +(i) ) = s[10]; \ *( (uint32_t*)(d11) +(i) ) = s[11]; \ *( (uint32_t*)(d12) +(i) ) = s[12]; \ @@ -2055,7 +2055,7 @@ static inline void intrlv_2x256( void *dst, const void *src0, if ( bit_len <= 512 ) return; d[4] = s0[2]; if ( bit_len <= 640 ) return; - d[5] = s1[2]; + d[5] = s1[2]; d[6] = s0[3]; d[7] = s1[3]; } @@ -2075,9 +2075,6 @@ static inline void dintrlv_2x256( void *dst0, void *dst1, d0[3] = s[6]; d1[3] = s[7]; } - - - #endif // AVX /////////////////////////// @@ -2165,7 +2162,9 @@ static inline void rintrlv_4x32_4x64( void *dst, d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 5] ); d[ 6] = _mm_unpacklo_epi32( s[ 6], s[ 7] ); d[ 7] = _mm_unpackhi_epi32( s[ 6], s[ 7] ); + if ( bit_len <= 256 ) return; + d[ 8] = _mm_unpacklo_epi32( s[ 8], s[ 9] ); d[ 9] = _mm_unpackhi_epi32( s[ 8], s[ 9] ); d[10] = _mm_unpacklo_epi32( s[10], s[11] ); @@ -2174,16 +2173,21 @@ static inline void rintrlv_4x32_4x64( void *dst, d[13] = _mm_unpackhi_epi32( s[12], s[13] ); d[14] = _mm_unpacklo_epi32( s[14], s[15] ); d[15] = _mm_unpackhi_epi32( s[14], s[15] ); + if ( bit_len <= 512 ) return; + d[16] = _mm_unpacklo_epi32( s[16], s[17] ); d[17] = _mm_unpackhi_epi32( s[16], s[17] ); d[18] = _mm_unpacklo_epi32( s[18], s[19] ); d[19] = _mm_unpackhi_epi32( s[18], s[19] ); + if ( bit_len <= 640 ) return; + d[20] = _mm_unpacklo_epi32( s[20], s[21] ); d[21] = _mm_unpackhi_epi32( s[20], s[21] ); d[22] = _mm_unpacklo_epi32( s[22], s[23] ); d[23] = _mm_unpackhi_epi32( s[22], s[23] ); + d[24] = _mm_unpacklo_epi32( s[24], s[25] ); d[25] = _mm_unpackhi_epi32( s[24], s[25] ); d[26] = _mm_unpacklo_epi32( s[26], s[27] ); @@ -2194,6 +2198,93 @@ static inline void rintrlv_4x32_4x64( void *dst, d[31] = _mm_unpackhi_epi32( s[30], s[31] ); } +// 8x32 -> 8x64 + +static inline void rintrlv_8x32_8x64( void *dst, + const void *src, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s = (const __m128i*)src; + + d[ 0] = _mm_unpacklo_epi32( s[ 0], s[ 2] ); + d[ 1] = _mm_unpackhi_epi32( s[ 0], s[ 2] ); + d[ 2] = _mm_unpacklo_epi32( s[ 1], s[ 3] ); + d[ 3] = _mm_unpackhi_epi32( s[ 1], s[ 3] ); + d[ 4] = _mm_unpacklo_epi32( s[ 4], s[ 6] ); + d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 6] ); + d[ 6] = _mm_unpacklo_epi32( s[ 5], s[ 7] ); + d[ 7] = _mm_unpackhi_epi32( s[ 5], s[ 7] ); + + d[ 8] = _mm_unpacklo_epi32( s[ 8], s[10] ); + d[ 9] = _mm_unpackhi_epi32( s[ 8], s[10] ); + d[10] = _mm_unpacklo_epi32( s[ 9], s[11] ); + d[11] = _mm_unpackhi_epi32( s[ 9], s[11] ); + d[12] = _mm_unpacklo_epi32( s[12], s[14] ); + d[13] = _mm_unpackhi_epi32( s[12], s[14] ); + d[14] = _mm_unpacklo_epi32( s[13], s[15] ); + d[15] = _mm_unpackhi_epi32( s[13], s[15] ); + + if ( bit_len <= 256 ) return; + + d[16] = _mm_unpacklo_epi32( s[16], s[18] ); + d[17] = _mm_unpackhi_epi32( s[16], s[18] ); + d[18] = _mm_unpacklo_epi32( s[17], s[19] ); + d[19] = _mm_unpackhi_epi32( s[17], s[19] ); + d[20] = _mm_unpacklo_epi32( s[20], s[22] ); + d[21] = _mm_unpackhi_epi32( s[20], s[22] ); + d[22] = _mm_unpacklo_epi32( s[21], s[23] ); + d[23] = _mm_unpackhi_epi32( s[21], s[23] ); + + d[24] = _mm_unpacklo_epi32( s[24], s[26] ); + d[25] = _mm_unpackhi_epi32( s[24], 
s[26] ); + d[26] = _mm_unpacklo_epi32( s[25], s[27] ); + d[27] = _mm_unpackhi_epi32( s[25], s[27] ); + d[28] = _mm_unpacklo_epi32( s[28], s[30] ); + d[29] = _mm_unpackhi_epi32( s[28], s[30] ); + d[30] = _mm_unpacklo_epi32( s[29], s[31] ); + d[31] = _mm_unpackhi_epi32( s[29], s[31] ); + + if ( bit_len <= 512 ) return; + + d[32] = _mm_unpacklo_epi32( s[32], s[34] ); + d[33] = _mm_unpackhi_epi32( s[32], s[34] ); + d[34] = _mm_unpacklo_epi32( s[33], s[35] ); + d[35] = _mm_unpackhi_epi32( s[33], s[35] ); + d[36] = _mm_unpacklo_epi32( s[36], s[38] ); + d[37] = _mm_unpackhi_epi32( s[36], s[38] ); + d[38] = _mm_unpacklo_epi32( s[37], s[39] ); + d[39] = _mm_unpackhi_epi32( s[37], s[39] ); + + d[40] = _mm_unpacklo_epi32( s[40], s[42] ); + d[41] = _mm_unpackhi_epi32( s[40], s[42] ); + d[42] = _mm_unpacklo_epi32( s[41], s[43] ); + d[43] = _mm_unpackhi_epi32( s[41], s[43] ); + d[44] = _mm_unpacklo_epi32( s[44], s[46] ); + d[45] = _mm_unpackhi_epi32( s[44], s[46] ); + d[46] = _mm_unpacklo_epi32( s[45], s[47] ); + d[47] = _mm_unpackhi_epi32( s[45], s[47] ); + + d[48] = _mm_unpacklo_epi32( s[48], s[50] ); + d[49] = _mm_unpackhi_epi32( s[48], s[50] ); + d[50] = _mm_unpacklo_epi32( s[49], s[51] ); + d[51] = _mm_unpackhi_epi32( s[49], s[51] ); + d[52] = _mm_unpacklo_epi32( s[52], s[54] ); + d[53] = _mm_unpackhi_epi32( s[52], s[54] ); + d[54] = _mm_unpacklo_epi32( s[53], s[55] ); + d[55] = _mm_unpackhi_epi32( s[53], s[55] ); + + d[56] = _mm_unpacklo_epi32( s[56], s[58] ); + d[57] = _mm_unpackhi_epi32( s[56], s[58] ); + d[58] = _mm_unpacklo_epi32( s[57], s[59] ); + d[59] = _mm_unpackhi_epi32( s[57], s[59] ); + d[60] = _mm_unpacklo_epi32( s[60], s[62] ); + d[61] = _mm_unpackhi_epi32( s[60], s[62] ); + d[62] = _mm_unpacklo_epi32( s[61], s[63] ); + d[63] = _mm_unpackhi_epi32( s[61], s[63] ); +} + + + /* #define RLEAVE_4x32_4x64(i) do \ { \ @@ -2225,7 +2316,6 @@ static inline void rintrlv_4x32_4x64( void *dst, // 2x128 -> 4x64 - static inline void rintrlv_2x128_4x64( void *dst, const void *src0, const void *src1, const int bit_len ) { @@ -2268,7 +2358,6 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0, d[31] = _mm_unpackhi_epi64( s1[14], s1[15] ); } - /* #define RLEAVE_2x128_4x64( i ) do \ { \ @@ -2339,7 +2428,6 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, d1[15] = _mm_unpackhi_epi64( s[29], s[31] ); } - /* #define RLEAVE_4x64_2x128( i ) do \ { \ @@ -2364,6 +2452,354 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, } */ +// 2x128 -> 8x64 + +static inline void rintrlv_4x128_8x64( void *dst, const void *src0, + const void *src1, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s0 = (const __m128i*)src0; + const __m128i *s1 = (const __m128i*)src1; + + d[ 0] = _mm_unpacklo_epi64( s0[ 0], s0[ 1] ); + d[ 1] = _mm_unpacklo_epi64( s0[ 2], s0[ 3] ); + d[ 2] = _mm_unpacklo_epi64( s1[ 0], s1[ 1] ); + d[ 3] = _mm_unpacklo_epi64( s1[ 2], s1[ 3] ); + d[ 4] = _mm_unpackhi_epi64( s0[ 0], s0[ 1] ); + d[ 5] = _mm_unpackhi_epi64( s0[ 2], s0[ 3] ); + d[ 6] = _mm_unpackhi_epi64( s1[ 0], s1[ 1] ); + d[ 7] = _mm_unpackhi_epi64( s1[ 2], s1[ 3] ); + + d[ 8] = _mm_unpacklo_epi64( s0[ 4], s0[ 5] ); + d[ 9] = _mm_unpacklo_epi64( s0[ 6], s0[ 7] ); + d[10] = _mm_unpacklo_epi64( s1[ 4], s1[ 5] ); + d[11] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] ); + d[12] = _mm_unpackhi_epi64( s0[ 4], s0[ 5] ); + d[13] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] ); + d[14] = _mm_unpackhi_epi64( s1[ 4], s1[ 5] ); + d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] ); + + if ( bit_len <= 256 ) return; 
+ + d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] ); + d[17] = _mm_unpacklo_epi64( s0[10], s0[11] ); + d[18] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] ); + d[19] = _mm_unpacklo_epi64( s1[10], s1[11] ); + d[20] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] ); + d[21] = _mm_unpackhi_epi64( s0[10], s0[11] ); + d[22] = _mm_unpackhi_epi64( s1[ 8], s1[ 9] ); + d[23] = _mm_unpackhi_epi64( s1[10], s1[11] ); + + d[24] = _mm_unpacklo_epi64( s0[12], s0[13] ); + d[25] = _mm_unpacklo_epi64( s0[14], s0[15] ); + d[26] = _mm_unpacklo_epi64( s1[12], s1[13] ); + d[27] = _mm_unpacklo_epi64( s1[14], s1[15] ); + d[28] = _mm_unpackhi_epi64( s0[12], s0[13] ); + d[29] = _mm_unpackhi_epi64( s0[14], s0[15] ); + d[30] = _mm_unpackhi_epi64( s1[12], s1[13] ); + d[31] = _mm_unpackhi_epi64( s1[14], s1[15] ); + + if ( bit_len <= 512 ) return; + + d[32] = _mm_unpacklo_epi64( s0[16], s0[17] ); + d[33] = _mm_unpacklo_epi64( s0[18], s0[19] ); + d[34] = _mm_unpacklo_epi64( s1[16], s1[17] ); + d[35] = _mm_unpacklo_epi64( s1[18], s1[19] ); + d[36] = _mm_unpackhi_epi64( s0[16], s0[17] ); + d[37] = _mm_unpackhi_epi64( s0[18], s0[19] ); + d[38] = _mm_unpackhi_epi64( s1[16], s1[17] ); + d[39] = _mm_unpackhi_epi64( s1[18], s1[19] ); + + d[40] = _mm_unpacklo_epi64( s0[20], s0[21] ); + d[41] = _mm_unpacklo_epi64( s0[22], s0[23] ); + d[42] = _mm_unpacklo_epi64( s1[20], s1[21] ); + d[43] = _mm_unpacklo_epi64( s1[22], s1[23] ); + d[44] = _mm_unpackhi_epi64( s0[20], s0[21] ); + d[45] = _mm_unpackhi_epi64( s0[22], s0[23] ); + d[46] = _mm_unpackhi_epi64( s1[20], s1[21] ); + d[47] = _mm_unpackhi_epi64( s1[22], s1[23] ); + + d[48] = _mm_unpacklo_epi64( s0[24], s0[25] ); + d[49] = _mm_unpacklo_epi64( s0[26], s0[27] ); + d[50] = _mm_unpacklo_epi64( s1[24], s1[25] ); + d[51] = _mm_unpacklo_epi64( s1[26], s1[27] ); + d[52] = _mm_unpackhi_epi64( s0[24], s0[25] ); + d[53] = _mm_unpackhi_epi64( s0[26], s0[27] ); + d[54] = _mm_unpackhi_epi64( s1[24], s1[25] ); + d[55] = _mm_unpackhi_epi64( s1[26], s1[27] ); + + d[56] = _mm_unpacklo_epi64( s0[28], s0[29] ); + d[57] = _mm_unpacklo_epi64( s0[30], s0[31] ); + d[58] = _mm_unpacklo_epi64( s1[28], s1[29] ); + d[59] = _mm_unpacklo_epi64( s1[30], s1[31] ); + d[60] = _mm_unpackhi_epi64( s0[28], s0[29] ); + d[61] = _mm_unpackhi_epi64( s0[30], s0[31] ); + d[62] = _mm_unpackhi_epi64( s1[28], s1[29] ); + d[63] = _mm_unpackhi_epi64( s1[30], s1[31] ); +} + +// 8x64 -> 4x128 + +static inline void rintrlv_8x64_4x128( void *dst0, void *dst1, + const void *src, const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + const __m128i* s = (const __m128i*)src; + + d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] ); + d0[ 1] = _mm_unpackhi_epi64( s[ 0], s[ 4] ); + d1[ 0] = _mm_unpacklo_epi64( s[ 2], s[ 6] ); + d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] ); + d0[ 2] = _mm_unpacklo_epi64( s[ 1], s[ 5] ); + d0[ 3] = _mm_unpackhi_epi64( s[ 1], s[ 5] ); + d1[ 2] = _mm_unpacklo_epi64( s[ 3], s[ 7] ); + d1[ 3] = _mm_unpackhi_epi64( s[ 3], s[ 7] ); + + d0[ 4] = _mm_unpacklo_epi64( s[ 8], s[12] ); + d0[ 5] = _mm_unpackhi_epi64( s[ 8], s[12] ); + d1[ 4] = _mm_unpacklo_epi64( s[10], s[14] ); + d1[ 5] = _mm_unpackhi_epi64( s[10], s[14] ); + d0[ 6] = _mm_unpacklo_epi64( s[ 9], s[13] ); + d0[ 7] = _mm_unpackhi_epi64( s[ 9], s[13] ); + d1[ 6] = _mm_unpacklo_epi64( s[11], s[15] ); + d1[ 7] = _mm_unpackhi_epi64( s[11], s[15] ); + + if ( bit_len <= 256 ) return; + + d0[ 8] = _mm_unpacklo_epi64( s[16], s[20] ); + d0[ 9] = _mm_unpackhi_epi64( s[16], s[20] ); + d1[ 8] = _mm_unpacklo_epi64( s[18], s[22] ); + d1[ 9] = _mm_unpackhi_epi64( 
s[18], s[22] ); + d0[10] = _mm_unpacklo_epi64( s[17], s[21] ); + d0[11] = _mm_unpackhi_epi64( s[17], s[21] ); + d1[10] = _mm_unpacklo_epi64( s[19], s[23] ); + d1[11] = _mm_unpackhi_epi64( s[19], s[23] ); + + d0[12] = _mm_unpacklo_epi64( s[24], s[28] ); + d0[13] = _mm_unpackhi_epi64( s[24], s[28] ); + d1[12] = _mm_unpacklo_epi64( s[26], s[30] ); + d1[13] = _mm_unpackhi_epi64( s[26], s[30] ); + d0[14] = _mm_unpacklo_epi64( s[25], s[29] ); + d0[15] = _mm_unpackhi_epi64( s[25], s[29] ); + d1[14] = _mm_unpacklo_epi64( s[27], s[31] ); + d1[15] = _mm_unpackhi_epi64( s[27], s[31] ); + + if ( bit_len <= 512 ) return; + + d0[16] = _mm_unpacklo_epi64( s[32], s[36] ); + d0[17] = _mm_unpackhi_epi64( s[32], s[36] ); + d1[16] = _mm_unpacklo_epi64( s[34], s[38] ); + d1[17] = _mm_unpackhi_epi64( s[34], s[38] ); + d0[18] = _mm_unpacklo_epi64( s[33], s[37] ); + d0[19] = _mm_unpackhi_epi64( s[33], s[37] ); + d1[18] = _mm_unpacklo_epi64( s[35], s[39] ); + d1[19] = _mm_unpackhi_epi64( s[35], s[39] ); + + d0[20] = _mm_unpacklo_epi64( s[40], s[44] ); + d0[21] = _mm_unpackhi_epi64( s[40], s[44] ); + d1[20] = _mm_unpacklo_epi64( s[42], s[46] ); + d1[21] = _mm_unpackhi_epi64( s[42], s[46] ); + d0[22] = _mm_unpacklo_epi64( s[41], s[45] ); + d0[23] = _mm_unpackhi_epi64( s[41], s[45] ); + d1[22] = _mm_unpacklo_epi64( s[43], s[47] ); + d1[23] = _mm_unpackhi_epi64( s[43], s[47] ); + + d0[24] = _mm_unpacklo_epi64( s[48], s[52] ); + d0[25] = _mm_unpackhi_epi64( s[48], s[52] ); + d1[24] = _mm_unpacklo_epi64( s[50], s[54] ); + d1[25] = _mm_unpackhi_epi64( s[50], s[54] ); + d0[26] = _mm_unpacklo_epi64( s[49], s[53] ); + d0[27] = _mm_unpackhi_epi64( s[49], s[53] ); + d1[26] = _mm_unpacklo_epi64( s[51], s[55] ); + d1[27] = _mm_unpackhi_epi64( s[51], s[55] ); + + d0[28] = _mm_unpacklo_epi64( s[56], s[60] ); + d0[29] = _mm_unpackhi_epi64( s[56], s[60] ); + d1[28] = _mm_unpacklo_epi64( s[58], s[62] ); + d1[29] = _mm_unpackhi_epi64( s[58], s[62] ); + d0[30] = _mm_unpacklo_epi64( s[57], s[61] ); + d0[31] = _mm_unpackhi_epi64( s[57], s[61] ); + d1[30] = _mm_unpacklo_epi64( s[59], s[63] ); + d1[31] = _mm_unpackhi_epi64( s[59], s[63] ); +} + +// 8x64 -> 2x256 + +static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src, const int bit_len ) +{ + __m128i *d0 = (__m128i*)dst0; + __m128i *d1 = (__m128i*)dst1; + __m128i *d2 = (__m128i*)dst2; + __m128i *d3 = (__m128i*)dst3; + const __m128i* s = (const __m128i*)src; + + d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] ); + d1[ 0] = _mm_unpackhi_epi64( s[ 0], s[ 4] ); + d2[ 0] = _mm_unpacklo_epi64( s[ 1], s[ 5] ); + d3[ 0] = _mm_unpackhi_epi64( s[ 1], s[ 5] ); + d0[ 1] = _mm_unpacklo_epi64( s[ 2], s[ 6] ); + d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] ); + d2[ 1] = _mm_unpacklo_epi64( s[ 3], s[ 7] ); + d3[ 1] = _mm_unpackhi_epi64( s[ 3], s[ 7] ); + + d0[ 2] = _mm_unpacklo_epi64( s[ 8], s[12] ); + d1[ 2] = _mm_unpackhi_epi64( s[ 8], s[12] ); + d2[ 2] = _mm_unpacklo_epi64( s[ 9], s[13] ); + d3[ 2] = _mm_unpackhi_epi64( s[ 9], s[13] ); + d0[ 3] = _mm_unpacklo_epi64( s[10], s[14] ); + d1[ 3] = _mm_unpackhi_epi64( s[10], s[14] ); + d2[ 3] = _mm_unpacklo_epi64( s[11], s[15] ); + d3[ 3] = _mm_unpackhi_epi64( s[11], s[15] ); + + if ( bit_len <= 256 ) return; + + d0[ 4] = _mm_unpacklo_epi64( s[16], s[20] ); + d1[ 4] = _mm_unpackhi_epi64( s[16], s[20] ); + d2[ 4] = _mm_unpacklo_epi64( s[17], s[21] ); + d3[ 4] = _mm_unpackhi_epi64( s[17], s[21] ); + d0[ 5] = _mm_unpacklo_epi64( s[18], s[22] ); + d1[ 5] = _mm_unpackhi_epi64( s[18], s[22] ); + d2[ 5] = 
_mm_unpacklo_epi64( s[19], s[23] ); + d3[ 5] = _mm_unpackhi_epi64( s[19], s[23] ); + + d0[ 6] = _mm_unpacklo_epi64( s[24], s[28] ); + d1[ 6] = _mm_unpackhi_epi64( s[24], s[28] ); + d2[ 6] = _mm_unpacklo_epi64( s[25], s[29] ); + d3[ 6] = _mm_unpackhi_epi64( s[25], s[29] ); + d0[ 7] = _mm_unpacklo_epi64( s[26], s[30] ); + d1[ 7] = _mm_unpackhi_epi64( s[26], s[30] ); + d2[ 7] = _mm_unpacklo_epi64( s[27], s[31] ); + d3[ 7] = _mm_unpackhi_epi64( s[27], s[31] ); + + if ( bit_len <= 512 ) return; + + d0[ 8] = _mm_unpacklo_epi64( s[32], s[36] ); + d1[ 8] = _mm_unpackhi_epi64( s[32], s[36] ); + d2[ 8] = _mm_unpacklo_epi64( s[33], s[37] ); + d3[ 8] = _mm_unpackhi_epi64( s[33], s[37] ); + d0[ 9] = _mm_unpacklo_epi64( s[34], s[38] ); + d1[ 9] = _mm_unpackhi_epi64( s[34], s[38] ); + d2[ 9] = _mm_unpacklo_epi64( s[35], s[39] ); + d3[ 9] = _mm_unpackhi_epi64( s[35], s[39] ); + + d0[10] = _mm_unpacklo_epi64( s[40], s[44] ); + d1[10] = _mm_unpackhi_epi64( s[40], s[44] ); + d2[10] = _mm_unpacklo_epi64( s[41], s[45] ); + d3[10] = _mm_unpackhi_epi64( s[41], s[45] ); + d0[11] = _mm_unpacklo_epi64( s[42], s[46] ); + d1[11] = _mm_unpackhi_epi64( s[42], s[46] ); + d2[11] = _mm_unpacklo_epi64( s[43], s[47] ); + d3[11] = _mm_unpackhi_epi64( s[43], s[47] ); + + d0[12] = _mm_unpacklo_epi64( s[48], s[52] ); + d1[12] = _mm_unpackhi_epi64( s[48], s[52] ); + d2[12] = _mm_unpacklo_epi64( s[49], s[53] ); + d3[12] = _mm_unpackhi_epi64( s[49], s[53] ); + d0[13] = _mm_unpacklo_epi64( s[50], s[54] ); + d1[13] = _mm_unpackhi_epi64( s[50], s[54] ); + d2[13] = _mm_unpacklo_epi64( s[51], s[55] ); + d3[13] = _mm_unpackhi_epi64( s[51], s[55] ); + + d0[14] = _mm_unpacklo_epi64( s[56], s[60] ); + d1[14] = _mm_unpackhi_epi64( s[56], s[60] ); + d2[14] = _mm_unpacklo_epi64( s[57], s[61] ); + d3[14] = _mm_unpackhi_epi64( s[57], s[61] ); + d0[15] = _mm_unpacklo_epi64( s[58], s[62] ); + d1[15] = _mm_unpackhi_epi64( s[58], s[62] ); + d2[15] = _mm_unpacklo_epi64( s[59], s[63] ); + d3[15] = _mm_unpackhi_epi64( s[59], s[63] ); +} + +// 4x128 -> 8x64 + +static inline void rintrlv_2x256_8x64( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, const int bit_len ) +{ + __m128i *d = (__m128i*)dst; + __m128i *s0 = (__m128i*)src0; + __m128i *s1 = (__m128i*)src1; + __m128i *s2 = (__m128i*)src2; + __m128i *s3 = (__m128i*)src3; + + d[ 0] = _mm_unpacklo_epi64( s0[0], s0[2] ); + d[ 1] = _mm_unpacklo_epi64( s1[0], s1[2] ); + d[ 2] = _mm_unpacklo_epi64( s2[0], s2[2] ); + d[ 3] = _mm_unpacklo_epi64( s3[0], s3[2] ); + d[ 4] = _mm_unpackhi_epi64( s0[0], s0[2] ); + d[ 5] = _mm_unpackhi_epi64( s1[0], s1[2] ); + d[ 6] = _mm_unpackhi_epi64( s2[0], s2[2] ); + d[ 7] = _mm_unpackhi_epi64( s3[0], s3[2] ); + + d[ 8] = _mm_unpacklo_epi64( s0[1], s0[3] ); + d[ 9] = _mm_unpacklo_epi64( s1[1], s1[3] ); + d[10] = _mm_unpacklo_epi64( s2[1], s2[3] ); + d[11] = _mm_unpacklo_epi64( s3[1], s3[3] ); + d[12] = _mm_unpackhi_epi64( s0[1], s0[3] ); + d[13] = _mm_unpackhi_epi64( s1[1], s1[3] ); + d[14] = _mm_unpackhi_epi64( s2[1], s2[3] ); + d[15] = _mm_unpackhi_epi64( s3[1], s3[3] ); + + if ( bit_len <= 256 ) return; + + d[16] = _mm_unpacklo_epi64( s0[4], s0[6] ); + d[17] = _mm_unpacklo_epi64( s1[4], s1[6] ); + d[18] = _mm_unpacklo_epi64( s2[4], s2[6] ); + d[19] = _mm_unpacklo_epi64( s3[4], s3[6] ); + d[20] = _mm_unpackhi_epi64( s0[4], s0[6] ); + d[21] = _mm_unpackhi_epi64( s1[4], s1[6] ); + d[22] = _mm_unpackhi_epi64( s2[4], s2[6] ); + d[23] = _mm_unpackhi_epi64( s3[4], s3[6] ); + + d[24] = _mm_unpacklo_epi64( s0[5], s0[7] ); + d[25] = 
_mm_unpacklo_epi64( s1[5], s1[7] ); + d[26] = _mm_unpacklo_epi64( s2[5], s2[7] ); + d[27] = _mm_unpacklo_epi64( s3[5], s3[7] ); + d[28] = _mm_unpackhi_epi64( s0[5], s0[7] ); + d[29] = _mm_unpackhi_epi64( s1[5], s1[7] ); + d[30] = _mm_unpackhi_epi64( s2[5], s2[7] ); + d[31] = _mm_unpackhi_epi64( s3[5], s3[7] ); + + if ( bit_len <= 512 ) return; + + d[32] = _mm_unpacklo_epi64( s0[8], s0[10] ); + d[33] = _mm_unpacklo_epi64( s1[8], s1[10] ); + d[34] = _mm_unpacklo_epi64( s2[8], s2[10] ); + d[35] = _mm_unpacklo_epi64( s3[8], s3[10] ); + d[36] = _mm_unpackhi_epi64( s0[8], s0[10] ); + d[37] = _mm_unpackhi_epi64( s1[8], s1[10] ); + d[38] = _mm_unpackhi_epi64( s2[8], s2[10] ); + d[39] = _mm_unpackhi_epi64( s3[8], s3[10] ); + + d[40] = _mm_unpacklo_epi64( s0[9], s0[11] ); + d[41] = _mm_unpacklo_epi64( s1[9], s1[11] ); + d[42] = _mm_unpacklo_epi64( s2[9], s2[11] ); + d[43] = _mm_unpacklo_epi64( s3[9], s3[11] ); + d[44] = _mm_unpackhi_epi64( s0[9], s0[11] ); + d[45] = _mm_unpackhi_epi64( s1[9], s1[11] ); + d[46] = _mm_unpackhi_epi64( s2[9], s2[11] ); + d[47] = _mm_unpackhi_epi64( s3[9], s3[11] ); + + d[48] = _mm_unpacklo_epi64( s0[12], s0[14] ); + d[49] = _mm_unpacklo_epi64( s1[12], s1[14] ); + d[50] = _mm_unpacklo_epi64( s2[12], s2[14] ); + d[51] = _mm_unpacklo_epi64( s3[12], s3[14] ); + d[52] = _mm_unpackhi_epi64( s0[12], s0[14] ); + d[53] = _mm_unpackhi_epi64( s1[12], s1[14] ); + d[54] = _mm_unpackhi_epi64( s2[12], s2[14] ); + d[55] = _mm_unpackhi_epi64( s3[12], s3[14] ); + + d[56] = _mm_unpacklo_epi64( s0[13], s0[15] ); + d[57] = _mm_unpacklo_epi64( s1[13], s1[15] ); + d[58] = _mm_unpacklo_epi64( s2[13], s2[15] ); + d[59] = _mm_unpacklo_epi64( s3[13], s3[15] ); + d[60] = _mm_unpackhi_epi64( s0[13], s0[15] ); + d[61] = _mm_unpackhi_epi64( s1[13], s1[15] ); + d[62] = _mm_unpackhi_epi64( s2[13], s2[15] ); + d[63] = _mm_unpackhi_epi64( s3[13], s3[15] ); +} + // // Some functions customized for mining. diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index fae6203..6e32965 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -252,7 +252,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #else - #define mm128_ror_64 mm128_ror_var_64 #define mm128_rol_64 mm128_rol_var_64 #define mm128_ror_32 mm128_ror_var_32 @@ -274,6 +273,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) #define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) +// Rotate 16 byte (128 bit) vector by c bytes. +// Less efficient using shift but more versatile. Use only for odd number +// byte rotations. Use shuffle above whenever possible. +#define mm128_ror_x8( v, c ) \ + _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) ) + +#define mm128_rol_x8( v, c ) \ + _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) ) + #if defined (__SSE3__) // no SSE2 implementation, no current users @@ -289,17 +297,21 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_rol_1x8( v ) \ _mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \ 0x060504030201000f ) ) -#endif // SSE3 +#else // SSE2 -// Rotate 16 byte (128 bit) vector by c bytes. -// Less efficient using shift but more versatile. Use only for odd number -// byte rotations. Use shuffle above whenever possible. 
-#define mm128_bror( v, c ) \ - _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) ) +#define mm128_ror_1x16( v ) \ + _mm_or_si128( _mm_srli_si128( v, 2 ), _mm_slli_si128( v, 14 ) ) -#define mm128_brol( v, c ) \ - _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) ) +#define mm128_rol_1x16( v ) \ + _mm_or_si128( _mm_slli_si128( v, 2 ), _mm_srli_si128( v, 14 ) ) +#define mm128_ror_1x8( v ) \ + _mm_or_si128( _mm_srli_si128( v, 1 ), _mm_slli_si128( v, 15 ) ) + +#define mm128_rol_1x8( v ) \ + _mm_or_si128( _mm_slli_si128( v, 1 ), _mm_srli_si128( v, 15 ) ) + +#endif // SSE3 else SSE2 // Invert vector: {3,2,1,0} -> {0,1,2,3} #define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b ) @@ -319,19 +331,24 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) // // Rotate elements within lanes. -#define mm128_swap32_64( v ) _mm_shuffle_epi32( v, 0xb1 ) +#define mm128_swap_64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) -#define mm128_ror16_64( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x09080f0e0d0c0b0a, \ - 0x0100070605040302 ) +#define mm128_rol64_8( v, c ) \ + _mm_or_si128( _mm_slli_epi64( v, ( ( (c)<<3 ) ), \ + _mm_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) -#define mm128_rol16_64( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080f0e, \ - 0x0504030201000706 ) +#define mm128_ror64_8( v, c ) \ + _mm_or_si128( _mm_srli_epi64( v, ( ( (c)<<3 ) ), \ + _mm_slli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) -#define mm128_swap16_32( v ) \ - _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0f0e09080b0a, \ - 0x0504070601000302 ) +#define mm128_rol32_8( v, c ) \ + _mm_or_si128( _mm_slli_epi32( v, ( ( (c)<<3 ) ), \ + _mm_srli_epi32( v, ( ( 32 - ( (c)<<3 ) ) ) ) + +#define mm128_ror32_8( v, c ) \ + _mm_or_si128( _mm_srli_epi32( v, ( ( (c)<<3 ) ), \ + _mm_slli_epi32( v, ( ( 32 - ( (c)<<3 ) ) ) ) + // // Endian byte swap. @@ -431,64 +448,65 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) // Swap 128 bit vectorse. -#define mm128_swap128_256( v1, v2 ) \ +#define mm128_swap256_128( v1, v2 ) \ v1 = _mm_xor_si128( v1, v2 ); \ v2 = _mm_xor_si128( v1, v2 ); \ v1 = _mm_xor_si128( v1, v2 ); + // Concatenate v1 & v2 and rotate as one 256 bit vector. 
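/* Editorial sketch, not part of the patch: the mm128_ror256_* and
   mm128_rol256_* macros defined below treat the v1/v2 register pair as a
   single logical 256 bit value and rewrite both arguments in place.  The
   helper name here is hypothetical and only illustrates how the macro is
   invoked; which argument holds the low half follows the macro's own
   convention and is not asserted here. */

#include <emmintrin.h>

static inline void mm128_ror256_64_demo( __m128i *v1p, __m128i *v2p )
{
   __m128i v1 = *v1p, v2 = *v2p;
   mm128_ror256_64( v1, v2 );   /* both v1 and v2 are updated */
   *v1p = v1;
   *v2p = v2;
}
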
#if defined(__SSE4_1__) -#define mm128_ror1x64_256( v1, v2 ) \ +#define mm128_ror256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v1 = _mm_alignr_epi8( v2, v1, 8 ); \ v2 = t; \ } while(0) -#define mm128_rol1x64_256( v1, v2 ) \ +#define mm128_rol256_64( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \ v2 = _mm_alignr_epi8( v2, v1, 8 ); \ v1 = t; \ } while(0) -#define mm128_ror1x32_256( v1, v2 ) \ +#define mm128_ror256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \ v1 = _mm_alignr_epi8( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm128_rol1x32_256( v1, v2 ) \ +#define mm128_rol256_32( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 12 ); \ v2 = _mm_alignr_epi8( v2, v1, 12 ); \ v1 = t; \ } while(0) -#define mm128_ror1x16_256( v1, v2 ) \ +#define mm128_ror256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \ v1 = _mm_alignr_epi8( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm128_rol1x16_256( v1, v2 ) \ +#define mm128_rol256_16( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 14 ); \ v2 = _mm_alignr_epi8( v2, v1, 14 ); \ v1 = t; \ } while(0) -#define mm128_ror1x8_256( v1, v2 ) \ +#define mm128_ror256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 1 ); \ v1 = _mm_alignr_epi8( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm128_rol1x8_256( v1, v2 ) \ +#define mm128_rol256_8( v1, v2 ) \ do { \ __m128i t = _mm_alignr_epi8( v1, v2, 15 ); \ v2 = _mm_alignr_epi8( v2, v1, 15 ); \ @@ -497,7 +515,7 @@ do { \ #else // SSE2 -#define mm128_ror1x64_256( v1, v2 ) \ +#define mm128_ror256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \ _mm_slli_si128( v2, 8 ) ); \ @@ -506,7 +524,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol1x64_256( v1, v2 ) \ +#define mm128_rol256_64( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \ _mm_srli_si128( v2, 8 ) ); \ @@ -515,7 +533,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror1x32_256( v1, v2 ) \ +#define mm128_ror256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \ _mm_slli_si128( v2, 12 ) ); \ @@ -524,7 +542,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol1x32_256( v1, v2 ) \ +#define mm128_rol256_32( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \ _mm_srli_si128( v2, 12 ) ); \ @@ -533,7 +551,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror1x16_256( v1, v2 ) \ +#define mm128_ror256_16( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \ _mm_slli_si128( v2, 14 ) ); \ @@ -542,7 +560,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol1x16_256( v1, v2 ) \ +#define mm128_rol256_16( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \ _mm_srli_si128( v2, 14 ) ); \ @@ -551,7 +569,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_ror1x8_256( v1, v2 ) \ +#define mm128_ror256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \ _mm_slli_si128( v2, 15 ) ); \ @@ -560,7 +578,7 @@ do { \ v1 = t; \ } while(0) -#define mm128_rol1x8_256( v1, v2 ) \ +#define mm128_rol256_8( v1, v2 ) \ do { \ __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \ _mm_srli_si128( v2, 15 ) ); \ diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index ac7bef2..3bdde9b 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -414,99 +414,71 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // -// Rotate elements within lanes of 256 bit vector. +// Rotate elements within each 128 bit lane of 256 bit vector. 
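/* Editorial sketch, not part of the patch: the renamed macros in this
   section follow a vector-size / lane-size / element-size pattern, e.g.
   mm256_ror128_32 (defined just below) applies
   _mm256_shuffle_epi32( v, 0x39 ), rotating each 128 bit lane of a 256 bit
   vector by one 32 bit element in the file's "ror" direction.  Names and
   constants in this sketch are illustrative only. */

#include <immintrin.h>
#include <stdint.h>

#if defined(__AVX2__)
static inline void mm256_ror128_32_demo( void )
{
   /* elements listed high to low: lanes are {7,6,5,4} and {3,2,1,0} */
   __m256i  v = _mm256_set_epi32( 7, 6, 5, 4, 3, 2, 1, 0 );
   __m256i  r = mm256_ror128_32( v );   /* lanes become {4,7,6,5} and {0,3,2,1} */
   uint32_t out[8];
   _mm256_storeu_si256( (__m256i*)out, r );
   (void)out;                           /* out[] = { 1,2,3,0, 5,6,7,4 } */
}
#endif
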
-// Swap 64 bit elements in each 128 bit lane. -#define mm256_swap64_128( v ) _mm256_shuffle_epi32( v, 0x4e ) +#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) -// Rotate each 128 bit lane by one 32 bit element. -#define mm256_ror1x32_128( v ) _mm256_shuffle_epi32( v, 0x39 ) -#define mm256_rol1x32_128( v ) _mm256_shuffle_epi32( v, 0x93 ) +#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 ) -#define mm256_ror1x16_128( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x11101f1e1d1c1b1a, 0x1918171615141312, \ - 0x01000f0e0d0c0b0a, 0x0908070605040302 ) ) +#define mm256_rol128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 ) -#define mm256_rol1x16_128( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1d1c1b1a19181716, 0x1514131211101f1e, \ - 0x0d0c0b0a09080706, 0x0504030201000f0e ) ) - -#define mm256_ror1x8_128( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x101f1e1d1c1b1a19, 0x1817161514131211, \ - 0x000f0e0d0c0b0a09, 0x0807060504030201 ) ) - -#define mm256_rol1x8_128( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \ - 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) - -// Rotate each 128 bit lane by c bytes. -#define mm256_bror_128( v, c ) \ +// Rotave each 128 bit lane by c elements. +#define mm256_ror128_8( v, c ) \ _mm256_or_si256( _mm256_bsrli_epi128( v, c ), \ _mm256_bslli_epi128( v, 16-(c) ) ) -#define mm256_brol_128( v, c ) \ +#define mm256_rol128_8( v, c ) \ _mm256_or_si256( _mm256_bslli_epi128( v, c ), \ _mm256_bsrli_epi128( v, 16-(c) ) ) -// Swap 32 bit elements in each 64 bit lane -#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 ) + +// Rotate elements in each 64 bit lane + +#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -#define mm256_rol1x16_64( v ) _mm256_rol_epi64( v, 16 ) -#define mm256_ror1x16_64( v ) _mm256_ror_epi64( v, 16 ) +#define mm256_rol64_8( v, c ) _mm256_rol_epi64( v, ((c)<<3) ) +#define mm256_ror64_8( v, c ) _mm256_ror_epi64( v, ((c)<<3) ) #else -#define mm256_ror1x16_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \ - 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) +#define mm256_rol64_8( v, c ) \ + _mm256_or_si256( _mm256_slli_epi64( v, ( ( (c)<<3 ) ), \ + _mm256_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) + +#define mm256_ror64_8( v, c ) \ + _mm256_or_si256( _mm256_srli_epi64( v, ( ( (c)<<3 ) ), \ + _mm256_slli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) ) -#define mm256_rol1x16_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \ - 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) #endif -#define mm256_ror1x8_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x181f1e1d1c1b1a19, 0x1017161514131211, \ - 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) -#define mm256_rol1x8_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1e1d1c1b1a19181f, 0x1615141312111017, \ - 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) - -#define mm256_ror3x8_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1a19181f1e1d1c1b, 0x1211101716151413, \ - 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) - -#define mm256_rol3x8_64( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1c1b1a19181f1e1d, 0x1413121110171615, \ - 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) - - -// Swap 16 bit elements in each 32 bit lane +// Rotate elements in each 32 bit lane #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && 
defined(__AVX512BW__) -#define mm256_swap16_32( v ) _mm256_rol_epi32( v, 16 ) +#define mm256_swap32_16( v ) _mm256_rol_epi32( v, 16 ) + +#define mm256_rol32_8( v ) _mm256_rol_epi32( v, 8 ) +#define mm256_ror32_8( v ) _mm256_ror_epi32( v, 8 ) #else -#define mm256_swap16_32( v ) \ - _mm256_shuffle_epi8( v, \ - m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \ - 0x0b0a09080f0e0d0c, 0x0302010007060504 ) ) +#define mm256_swap32_16( v ) \ + _mm256_or_si256( _mm256_slli_epi32( v, 16 ), \ + _mm256_srli_epi32( v, 16 ) ) + +#define mm256_rol32_8( v ) \ + _mm256_or_si256( _mm256_slli_epi32( v, 8 ), \ + _mm256_srli_epi32( v, 8 ) ) + +#define mm256_ror32_8( v, c ) \ + _mm256_or_si256( _mm256_srli_epi32( v, 8 ), \ + _mm256_slli_epi32( v, 8 ) ) + #endif + // // Swap bytes in vector elements, endian bswap. #define mm256_bswap_64( v ) \ @@ -565,19 +537,19 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also // makes these macros unnecessary. -#define mm256_swap256_512 (v1, v2) \ - v1 = _mm256_xor_si256(v1, v2); \ - v2 = _mm256_xor_si256(v1, v2); \ - v1 = _mm256_xor_si256(v1, v2); +#define mm256_swap512_256( v1, v2 ) \ + v1 = _mm256_xor_si256( v1, v2 ); \ + v2 = _mm256_xor_si256( v1, v2 ); \ + v1 = _mm256_xor_si256( v1, v2 ); -#define mm256_ror1x128_512( v1, v2 ) \ +#define mm256_ror512_128( v1, v2 ) \ do { \ __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ v1 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \ v2 = t; \ } while(0) -#define mm256_rol1x128_512( v1, v2 ) \ +#define mm256_rol512_128( v1, v2 ) \ do { \ __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \ v2 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 58ae8cb..a4e0807 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -15,13 +15,13 @@ // AVX512 intrinsics have a few changes from previous conventions. // -// Some instructions like cmp and blend use the mask regsiters now instead -// a vector mask. +// cmp instruction now returns a bitmask isnstead of a vector mask. +// This eliminates the need for the blendv instruction. // -// The new rotate instructions require the count to be only an 8 bit -// immediate value. The documentation is the same as for shift and -// it allows variables. Suspect a compiler issue but it still happens -// in GCC9. +// The new rotate instructions require the count to be an 8 bit +// immediate value only. Compilation fails if a variable is used. +// The documentation is the same as for shift and it works with +// variables. // // _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute // usually shuffles accross all lanes. 
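/* Editorial sketch, not part of the patch: it illustrates the two AVX512
   conventions described in the comments above.  The compare intrinsic
   returns a bit mask (__mmask16) rather than a vector mask, which feeds
   _mm512_mask_blend_epi32 directly, and the rotate intrinsic takes its
   count as an immediate constant.  The helper name is hypothetical. */

#include <immintrin.h>

#if defined(__AVX512F__)
static inline __m512i mask_select_rotate_demo( __m512i a, __m512i b )
{
   __mmask16 m = _mm512_cmpgt_epi32_mask( a, b );     /* per lane: a > b   */
   __m512i   v = _mm512_mask_blend_epi32( m, b, a );  /* per lane max(a,b) */
   return _mm512_rol_epi32( v, 8 );                   /* count is a literal */
}
#endif
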
@@ -109,6 +109,11 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6, #define m512_const2_64( i1, i0 ) \ m512_const1_128( m128_const_64( i1, i0 ) ) +#define m512_const2_32( i1, i0 ) \ + m512_const1_64( ( ( ( (uint64_t)(i1) << 32 ) ) \ + | ( (uint64_t)(i0) & 0xffffffff ) ) ) + + static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2, const uint64_t i1, const uint64_t i0 ) { @@ -265,7 +270,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \ 0x28292a2b2c2d2e2f, 0x2021222324252627, \ 0x18191a1b1c1d1e1f, 0x1011121314151617, \ - 0x08090a0b0c0d0e0f, 0x0001020304050607 )) + 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) #define mm512_bswap_32( v ) \ _mm512_shuffle_epi8( v, \ @@ -304,8 +309,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) { \ __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \ 0x2c2d2e2f28292a2b, 0x2425262720212223, \ - 0x0c0d0e0f08090a0b, 0x0405060700010203, \ - 0x1c1d1e1f18191a1b, 0x1415161710111213 ); \ + 0x1c1d1e1f18191a1b, 0x1415161710111213, \ + 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \ casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \ casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \ @@ -320,8 +325,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Rotate elements in 512 bit vector. + #define mm512_swap_256( v ) _mm512_alignr_epi64( v, v, 4 ) +// 1x64 notation used to disinguish from bit rotation. #define mm512_ror_1x128( v ) _mm512_alignr_epi64( v, v, 2 ) #define mm512_rol_1x128( v ) _mm512_alignr_epi64( v, v, 6 ) @@ -401,51 +408,58 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Rotate elements within 256 bit lanes of 512 bit vector. +// Rename these for consistency. Element size is always last. 
+// mm__ + + // Swap hi & lo 128 bits in each 256 bit lane -#define mm512_swap128_256( v ) _mm512_permutex_epi64( v, 0x4e ) + +#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e ) // Rotate 256 bit lanes by one 64 bit element -#define mm512_ror1x64_256( v ) _mm512_permutex_epi64( v, 0x39 ) -#define mm512_rol1x64_256( v ) _mm512_permutex_epi64( v, 0x93 ) + +#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 ) +#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 ) // Rotate 256 bit lanes by one 32 bit element -#define mm512_ror1x32_256( v ) \ + +#define mm512_ror256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x000000080000000f, 0x0000000e0000000d, \ 0x0000000c0000000b, 0x0000000a00000009, \ 0x0000000000000007, 0x0000000600000005, \ 0x0000000400000003, 0x0000000200000001 ), v ) -#define mm512_rol1x32_256( v ) \ +#define mm512_rol256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ 0x0000000e0000000d, 0x0000000c0000000b, \ 0x0000000a00000009, 0x000000080000000f, \ 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ), v ) -#define mm512_ror1x16_256( v ) \ +#define mm512_ror256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x00100001001e001d, 0x001c001b001a0019, \ 0x0018001700160015, 0x0014001300120011, \ 0x0000000f000e000d, 0x000c000b000a0009, \ 0x0008000700060005, 0x0004000300020001 ), v ) -#define mm512_rol1x16_256( v ) \ +#define mm512_rol256_16( v ) \ _mm512_permutexvar_epi16( m512_const_64( \ 0x001e001d001c001b, 0x001a001900180017, \ 0x0016001500140013, 0x001200110010001f, \ 0x000e000d000c000b, 0x000a000900080007, \ 0x0006000500040003, 0x000200010000000f ), v ) -#define mm512_ror1x8_256( v ) \ +#define mm512_ror256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x203f3e3d3c3b3a39, 0x3837363534333231, \ 0x302f2e2d2c2b2a29, 0x2827262524232221, \ 0x001f1e1d1c1b1a19, 0x1817161514131211, \ 0x100f0e0d0c0b0a09, 0x0807060504030201 ), v ) -#define mm512_rol1x8_256( v ) \ +#define mm512_rol256_8( v ) \ _mm512_shuffle_epi8( v, m512_const_64( \ 0x3e3d3c3b3a393837, 0x363534333231302f, \ 0x2e2d2c2b2a292827, 0x262524232221203f, \ @@ -456,45 +470,19 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // Rotate elements within 128 bit lanes of 512 bit vector. 
// Swap hi & lo 64 bits in each 128 bit lane -#define mm512_swap64_128( v ) _mm512_shuffle_epi32( v, 0x4e ) +#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) // Rotate 128 bit lanes by one 32 bit element -#define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 ) -#define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 ) +#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) +#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) -#define mm512_ror1x16_128( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x0018001f001e001d, 0x001c001b001a0019, \ - 0x0010001700160015, 0x0014001300120011, \ - 0x0008000f000e000d, 0x000c000b000a0009, \ - 0x0000000700060005, 0x0004000300020001 ), v ) -#define mm512_rol1x16_128( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x001e001d001c001b, 0x001a00190018001f, \ - 0x0016001500140013, 0x0012001100100017, \ - 0x000e000d000c000b, 0x000a00090008000f, \ - 0x0006000500040003, 0x0002000100000007 ), v ) - -#define mm512_ror1x8_128( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x303f3e3d3c3b3a39, 0x3837363534333231, \ - 0x202f2e2d2c2b2a29, 0x2827262524232221, \ - 0x101f1e1d1c1b1a19, 0x1817161514131211, \ - 0x000f0e0d0c0b0a09, 0x0807060504030201 ) ) - -#define mm512_rol1x8_128( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x3e3d3c3b3a393837, 0x363534333231303f, \ - 0x2e2d2c2b2a292827, 0x262524232221202f, \ - 0x1e1d1c1b1a191817, 0x161514131211101f, \ - 0x0e0d0c0b0a090807, 0x060504030201000f ) ) - -// Rotate 128 bit lanes by c bytes. -#define mm512_bror_128( v, c ) \ +// Rotate 128 bit lanes by c bytes, faster than building that monstrous +// constant above. +#define mm512_ror128_8( v, c ) \ _mm512_or_si512( _mm512_bsrli_epi128( v, c ), \ _mm512_bslli_epi128( v, 16-(c) ) ) -#define mm512_brol_128( v, c ) \ +#define mm512_rol128_8( v, c ) \ _mm512_or_si512( _mm512_bslli_epi128( v, c ), \ _mm512_bsrli_epi128( v, 16-(c) ) ) @@ -502,75 +490,23 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Rotate elements within 64 bit lanes. +#define mm512_rol64_x8( v, c ) _mm512_rol_epi64( v, ((c)<<3) ) +#define mm512_ror64_x8( v, c ) _mm512_ror_epi64( v, ((c)<<3) ) + // Swap 32 bit elements in each 64 bit lane -#define mm512_swap32_64( v ) _mm512_shuffle_epi32( v, 0xb1 ) +#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) // Rotate each 64 bit lane by one 16 bit element. -#define mm512_ror1x16_64( v ) _mm512_ror_epi64( v, 16 ) -#define mm512_rol1x16_64( v ) _mm512_rol_epi64( v, 16 ) -#define mm512_ror1x8_64( v ) _mm512_ror_epi64( v, 8 ) -#define mm512_rol1x8_64( v ) _mm512_rol_epi64( v, 8 ) - -/* -#define mm512_ror1x16_64( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x001c001f001e001d, 0x0018001b001a0019, \ - 0x0014001700160015, 0x0010001300120011, \ - 0x000c000f000e000d, 0x0008000b000a0009, \ - 0x0004000700060005, 0x0000000300020001, v ) - -#define mm512_rol1x16_64( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x001e001d001c001f, 0x001a00190018001b, \ - 0x0016001500140017, 0x0012001100100013, \ - 0x000e000d000c000f, 0x000a00090008000b, \ - 0x0006000500040007, 0x0002000100000003, v ) - -// Rotate each 64 bit lane by one byte. 
-#define mm512_ror1x8_64( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x383F3E3D3C3B3A39, 0x3037363534333231, \ - 0x282F2E2D2C2B2A29, 0x2027262524232221, \ - 0x181F1E1D1C1B1A19, 0x1017161514131211, \ - 0x080F0E0D0C0B0A09, 0x0007060504030201 ) ) -#define mm512_rol1x8_64( v ) \ - _mm512_shuffle( v, m512_const_64( \ - 0x3E3D3C3B3A39383F, 0x3635343332313037, \ - 0x2E2D2C2B2A29282F, 0x2625242322212027, \ - 0x1E1D1C1B1A19181F, 0x1615141312111017, \ - 0x0E0D0C0B0A09080F, 0x0605040302010007 ) ) -*/ +#define mm512_ror64_16( v ) _mm512_ror_epi64( v, 16 ) +#define mm512_rol64_16( v ) _mm512_rol_epi64( v, 16 ) +#define mm512_ror64_8( v ) _mm512_ror_epi64( v, 8 ) +#define mm512_rol64_8( v ) _mm512_rol_epi64( v, 8 ) // // Rotate elements within 32 bit lanes. -#define mm512_swap16_32( v ) _mm512_ror_epi32( v, 16 ) -#define mm512_ror1x8_32( v ) _mm512_ror_epi32( v, 8 ) -#define mm512_rol1x8_32( v ) _mm512_rol_epi32( v, 8 ) - -/* -#define mm512_swap16_32( v ) \ - _mm512_permutexvar_epi16( m512_const_64( \ - 0x001e001f001c001d, 0x001a001b00180019, \ - 0x0016001700140015, 0x0012001300100011, \ - 0x000e000f000c000d, 0x000a000b00080009, \ - 0x0006000700040005, 0x0002000300000001 ), v ) - -#define mm512_ror1x8_32( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x3C3F3E3D383B3A39, 0x3437363530333231, \ - 0x2C2F2E2D282B2A29, 0x2427262520232221, \ - 0x1C1F1E1D181B1A19, 0x1417161510131211, \ - 0x0C0F0E0D080B0A09, 0x0407060500030201 )) - -#define mm512_rol1x8_32( v ) \ - _mm512_shuffle_epi8( v, m512_const_64( \ - 0x3E3D3C3F3A39383B, 0x3635343732313033, \ - 0x2E2D2C2F2A29282B, 0x2625242722212023, \ - 0x1E1D1C1F1A19181B, 0x1615141712111013, \ - 0x0E0D0C0F0A09080B, 0x0605040702010003 ) ) -*/ - +#define mm512_rol32_x8( v, c ) _mm512_rol_epi32( v, ((c)<<2) ) +#define mm512_ror32_x8( v, c ) _mm512_ror_epi32( v, ((c)<<2) ) // @@ -579,61 +515,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // These can all be done with 2 permutex2var instructions but they are // slower than either xor or alignr and require AVX512VBMI. 
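/* Editorial sketch, not part of the patch: the mm512_swap1024_512 macro
   renamed below exchanges two 512 bit registers with three xors rather
   than a temporary; the comment above notes that a permutex2var based
   alternative would be slower and needs AVX512VBMI.  For comparison, this
   hypothetical helper writes the same exchange with an explicit
   temporary. */

#include <immintrin.h>

#if defined(__AVX512F__)
static inline void swap512_with_temp( __m512i *a, __m512i *b )
{
   __m512i t = *a;   /* plain three-move swap */
   *a = *b;
   *b = t;
}
#endif
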
-#define mm512_swap512_1024(v1, v2) \ +#define mm512_swap1024_512(v1, v2) \ v1 = _mm512_xor_si512(v1, v2); \ v2 = _mm512_xor_si512(v1, v2); \ v1 = _mm512_xor_si512(v1, v2); -#define mm512_ror1x256_1024( v1, v2 ) \ +#define mm512_ror1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v1 = _mm512_alignr_epi64( v2, v1, 4 ); \ v2 = t; \ } while(0) -#define mm512_rol1x256_1024( v1, v2 ) \ +#define mm512_rol1024_256( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \ v2 = _mm512_alignr_epi64( v2, v1, 4 ); \ v1 = t; \ } while(0) -#define mm512_ror1x128_1024( v1, v2 ) \ +#define mm512_ror1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \ v1 = _mm512_alignr_epi64( v2, v1, 2 ); \ v2 = t; \ } while(0) -#define mm512_rol1x128_1024( v1, v2 ) \ +#define mm512_rol1024_128( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \ v2 = _mm512_alignr_epi64( v2, v1, 6 ); \ v1 = t; \ } while(0) -#define mm512_ror1x64_1024( v1, v2 ) \ +#define mm512_ror1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \ v1 = _mm512_alignr_epi64( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1x64_1024( v1, v2 ) \ +#define mm512_rol1024_64( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \ v2 = _mm512_alignr_epi64( v2, v1, 7 ); \ v1 = t; \ } while(0) -#define mm512_ror1x32_1024( v1, v2 ) \ +#define mm512_ror1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \ v1 = _mm512_alignr_epi32( v2, v1, 1 ); \ v2 = t; \ } while(0) -#define mm512_rol1x32_1024( v1, v2 ) \ +#define mm512_rol1024_32( v1, v2 ) \ do { \ __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \ v2 = _mm512_alignr_epi32( v2, v1, 15 ); \