From 3572cb53c49d650cdc6407fd605f2d6e20609c1b Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Thu, 2 Jan 2020 23:54:08 -0500 Subject: [PATCH] v3.11.0 --- Makefile.am | 4 + RELEASE_NOTES | 17 +- algo-gate-api.c | 1 + algo/blake/blake-hash-4way.h | 2 +- algo/blake/blake256-hash-4way.c | 5 +- algo/blake/blake2s-hash-4way.h | 3 +- algo/bmw/bmw512-4way.c | 3 +- algo/cubehash/cubehash_sse2.c | 22 +- algo/echo/aes_ni/hash.c | 18 +- algo/echo/aes_ni/hash.c.test | 620 ++++++++++++++++++++++ algo/echo/echo-hash-4way.c | 398 +++----------- algo/groestl/aes_ni/groestl-intr-aes.h | 6 +- algo/groestl/aes_ni/hash-groestl.c | 1 + algo/groestl/groestl-4way.c | 64 +++ algo/groestl/groestl-gate.c | 23 + algo/groestl/groestl-gate.h | 31 ++ algo/groestl/groestl.c | 31 +- algo/groestl/groestl256-hash-4way.c | 280 ++++++++++ algo/groestl/groestl256-hash-4way.h | 121 +++++ algo/groestl/groestl256-intr-4way.h | 492 ++++++++++++++++++ algo/groestl/groestl512-hash-4way.c | 114 ++++ algo/groestl/groestl512-hash-4way.h | 94 ++++ algo/groestl/groestl512-intr-4way.h | 654 +++++++++++++++++++++++ algo/groestl/myrgr-4way.c | 155 +++++- algo/groestl/myrgr-gate.c | 10 +- algo/groestl/myrgr-gate.h | 25 +- algo/hamsi/hamsi-hash-4way.c | 3 +- algo/hamsi/hamsi-hash-4way.h | 2 +- algo/haval/haval-4way-helper.c | 2 +- algo/haval/haval-hash-4way.c | 4 +- algo/haval/haval-hash-4way.h | 2 +- algo/jh/jh-hash-4way.h | 2 - algo/jh/jha-4way.c | 6 +- algo/keccak/keccak-hash-4way.h | 2 - algo/lyra2/allium-4way.c | 34 +- algo/lyra2/lyra2-gate.c | 2 +- algo/lyra2/lyra2h-4way.c | 2 +- algo/lyra2/lyra2rev2-4way.c | 12 +- algo/lyra2/lyra2rev3-4way.c | 13 +- algo/lyra2/lyra2z-4way.c | 8 +- algo/nist5/nist5-4way.c | 8 +- algo/quark/anime-4way.c | 20 +- algo/quark/hmq1725-4way.c | 217 ++++++-- algo/quark/hmq1725-gate.c | 2 +- algo/quark/quark-4way.c | 77 ++- algo/quark/quark-gate.c | 2 +- algo/qubit/qubit-2way.c | 60 ++- algo/qubit/qubit-gate.c | 2 +- algo/ripemd/lbry-4way.c | 54 +- algo/ripemd/lbry-gate.c | 2 +- algo/ripemd/lbry-gate.h | 13 +- algo/ripemd/ripemd-hash-4way.c | 8 +- algo/ripemd/ripemd-hash-4way.h | 10 +- algo/sha/sha-hash-4way.h | 38 +- algo/sha/sha256-hash-4way.c | 298 +++++++++-- algo/sha/sha256q-4way.c | 20 +- algo/sha/sha256t-4way.c | 16 +- algo/sha/sha512-hash-4way.c | 94 ++-- algo/shabal/shabal-hash-4way.h | 2 +- algo/shavite/shavite-hash-4way.c | 399 ++++++++++++++ algo/shavite/shavite-hash-4way.h | 25 + algo/skein/skein-4way.c | 62 +-- algo/skein/skein-hash-4way.h | 4 +- algo/skein/skein2-4way.c | 4 +- algo/sm3/sm3-hash-4way.c | 258 +++++++-- algo/sm3/sm3-hash-4way.h | 27 +- algo/x11/c11-4way.c | 112 ++-- algo/x11/c11-gate.c | 2 +- algo/x11/timetravel-4way.c | 10 +- algo/x11/timetravel10-4way.c | 10 +- algo/x11/tribus-4way.c | 26 +- algo/x11/tribus-gate.c | 2 +- algo/x11/x11-4way.c | 107 +++- algo/x11/x11-gate.c | 2 +- algo/x11/x11evo-4way.c | 10 +- algo/x11/x11gost-4way.c | 114 ++-- algo/x12/x12-4way.c | 122 +++-- algo/x12/x12-gate.c | 2 +- algo/x13/phi1612-4way.c | 30 +- algo/x13/phi1612-gate.c | 2 +- algo/x13/skunk-4way.c | 2 +- algo/x13/x13-4way.c | 114 ++-- algo/x13/x13-gate.c | 2 +- algo/x13/x13bcd-4way.c | 389 ++++++++++++-- algo/x13/x13sm3-4way.c | 16 +- algo/x13/x13sm3-gate.c | 8 +- algo/x13/x13sm3-gate.h | 38 +- algo/x14/polytimos-4way.c | 4 +- algo/x14/veltor-4way.c | 4 +- algo/x14/x14-4way.c | 116 +++-- algo/x14/x14-gate.c | 2 +- algo/x15/x15-4way.c | 122 +++-- algo/x15/x15-gate.c | 2 +- algo/x16/x16r-4way.c | 178 ++++--- algo/x16/x16r-gate.c | 19 +- algo/x16/x16rt-4way.c | 82 ++- algo/x16/x16rv2-4way.c | 83 ++- 
algo/x16/x21s-4way.c | 183 ++--- algo/x17/sonoa-4way.c | 417 +++++++++++++-- algo/x17/sonoa-gate.c | 2 +- algo/x17/x17-4way.c | 142 +++-- algo/x17/x17-gate.c | 2 +- algo/x17/xevan-4way.c | 135 ++++- algo/x17/xevan-gate.c | 2 +- algo/x22/x22i-4way.c | 89 +++- algo/x22/x22i-gate.c | 24 +- algo/x22/x22i-gate.h | 10 +- algo/x22/x25x-4way.c | 691 ++++++++++++++++++++++--- build-allarch.sh | 2 +- build-avx2.sh | 27 + clean-all.sh | 10 + configure | 20 +- configure.ac | 2 +- cpu-miner.c | 2 +- simd-utils/intrlv.h | 39 ++ simd-utils/simd-256.h | 21 +- simd-utils/simd-512.h | 36 +- winbuild-cross.sh | 7 +- 118 files changed, 7030 insertions(+), 1575 deletions(-) create mode 100644 algo/echo/aes_ni/hash.c.test create mode 100644 algo/groestl/groestl-4way.c create mode 100644 algo/groestl/groestl-gate.c create mode 100644 algo/groestl/groestl-gate.h create mode 100644 algo/groestl/groestl256-hash-4way.c create mode 100644 algo/groestl/groestl256-hash-4way.h create mode 100644 algo/groestl/groestl256-intr-4way.h create mode 100644 algo/groestl/groestl512-hash-4way.c create mode 100644 algo/groestl/groestl512-hash-4way.h create mode 100644 algo/groestl/groestl512-intr-4way.h create mode 100644 algo/shavite/shavite-hash-4way.c create mode 100644 algo/shavite/shavite-hash-4way.h create mode 100755 build-avx2.sh create mode 100755 clean-all.sh diff --git a/Makefile.am b/Makefile.am index 916bccb..fe5bcf7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -87,8 +87,11 @@ cpuminer_SOURCES = \ algo/echo/echo-hash-4way.c \ algo/echo/aes_ni/hash.c\ algo/gost/sph_gost.c \ + algo/groestl/groestl-gate.c \ + algo/groestl/groestl512-hash-4way.c \ algo/groestl/sph_groestl.c \ algo/groestl/groestl.c \ + algo/groestl/groestl-4way.c \ algo/groestl/myrgr-gate.c \ algo/groestl/myrgr-4way.c \ algo/groestl/myr-groestl.c \ @@ -188,6 +191,7 @@ cpuminer_SOURCES = \ algo/shavite/sph_shavite.c \ algo/shavite/sph-shavite-aesni.c \ algo/shavite/shavite-hash-2way.c \ + algo/shavite/shavite-hash-4way.c \ algo/shavite/shavite.c \ algo/simd/sph_simd.c \ algo/simd/nist.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 51ded93..0228c77 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -21,7 +21,7 @@ required. Compile Instructions -------------------- -See INSTALL_LINUX or INSTALL_WINDOWS fror compile instruuctions +See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions Requirements ------------ @@ -35,13 +35,26 @@ not supported. FreeBSD YMMV. Change Log ---------- +v3.11.0 + +Fixed x25x AVX512 lane 4 invalid shares. + +AVX512 for hex, phi2. + +VAES optimization for Intel Icelake CPUs for most algos recently optimized +with AVX512, source code only. + +v3.10.7 + +AVX512 for x25x, lbry, x13bcd (bcd). + +v3.10.6 Added support for SSL stratum: stratum+tcps:// Added job id reporting again, but leaner, suppressed with --quiet. -AVX512 for x21s, x22i, lyra2z, allium +AVX512 for x21s, x22i, lyra2z, allium. Fixed share overflow warnings mining lbry with Ryzen (SHA). 
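The "source code only" note on the VAES work above means these paths are selected at compile time, not by runtime CPU dispatch. The guard pattern is the one used by the new gate headers in this patch (shown here as it appears in algo/groestl/groestl-gate.h further below):

#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define GROESTL_4WAY_VAES 1
#endif

so a build has to target a CPU with both AVX-512 and vector AES for the 4-way VAES code to be compiled in at all; with gcc that would be something like -march=icelake-client, an illustrative flag, not one quoted from this patch's build scripts.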
diff --git a/algo-gate-api.c b/algo-gate-api.c index cebfc8b..f77ee29 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -317,6 +317,7 @@ const char* const algo_alias_map[][2] = { "argon2d-crds", "argon2d250" }, { "argon2d-dyn", "argon2d500" }, { "argon2d-uis", "argon2d4096" }, + { "bcd", "x13bcd" }, { "bitcore", "timetravel10" }, { "bitzeny", "yescryptr8" }, { "blake256r8", "blakecoin" }, diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index 9f389f6..091a537 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -104,7 +104,7 @@ typedef struct { typedef blake_8way_small_context blake256_8way_context; void blake256_8way_init(void *cc); void blake256_8way_update(void *cc, const void *data, size_t len); -#define blake256_8way blake256_8way_update +//#define blake256_8way blake256_8way_update void blake256_8way_close(void *cc, void *dst); // 14 rounds, blake, decred diff --git a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c index f958659..3de0363 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -842,7 +842,8 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv, } static void -blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len ) +blake32_4way( blake_4way_small_context *ctx, const void *data, + size_t len ) { __m128i *buf = (__m128i*)ctx->buf; size_t bptr = ctx->ptr<<2; @@ -1237,7 +1238,7 @@ blake256_4way_init(void *ctx) } void -blake256_4way(void *ctx, const void *data, size_t len) +blake256_4way_update(void *ctx, const void *data, size_t len) { blake32_4way(ctx, data, len); } diff --git a/algo/blake/blake2s-hash-4way.h b/algo/blake/blake2s-hash-4way.h index 953841f..baf2865 100644 --- a/algo/blake/blake2s-hash-4way.h +++ b/algo/blake/blake2s-hash-4way.h @@ -14,7 +14,6 @@ #ifndef __BLAKE2S_HASH_4WAY_H__ #define __BLAKE2S_HASH_4WAY_H__ 1 -//#if defined(__SSE4_2__) #if defined(__SSE2__) #include "simd-utils.h" @@ -132,6 +131,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen ); } #endif -#endif // __SSE4_2__ +#endif // __SSE2__ #endif diff --git a/algo/bmw/bmw512-4way.c b/algo/bmw/bmw512-4way.c index 2757fdd..795be11 100644 --- a/algo/bmw/bmw512-4way.c +++ b/algo/bmw/bmw512-4way.c @@ -41,7 +41,6 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce, for ( int lane = 0; lane < 8; lane++ ) if ( unlikely( hash7[ lane<<1 ] < Htarg ) ) -// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) ) { extr_lane_8x64( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) ) @@ -66,7 +65,7 @@ void bmw512hash_4way(void *state, const void *input) { bmw512_4way_context ctx; bmw512_4way_init( &ctx ); - bmw512_4way( &ctx, input, 80 ); + bmw512_4way_update( &ctx, input, 80 ); bmw512_4way_close( &ctx, state ); } diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index 8b9d010..c508248 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -21,7 +21,27 @@ static void transform( cubehashParam *sp ) int r; const int rounds = sp->rounds; -#ifdef __AVX2__ +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + register __m512i x0, x1; + + x0 = _mm512_load_si512( (__m512i*)sp->x ); + x1 = _mm512_load_si512( (__m512i*)sp->x + 1 ); + + for ( r = 0; r < rounds; ++r ) + { + x1 = _mm512_add_epi32( x0, x1 ); + x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 ); + x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) ); 
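/* A reading aid for this AVX-512 round: the 32 CubeHash state words live
   in two ZMM registers, x0 = x[0..15] and x1 = x[16..31], and the
   mm512_swap* calls are CubeHash's word exchanges done as lane permutes:
   mm512_swap_256 exchanges the two 256-bit halves of the register,
   mm512_swap128_64 the 64-bit halves within each 128-bit lane,
   mm512_swap256_128 the 128-bit halves within each 256-bit lane, and
   mm512_swap64_32 the 32-bit halves within each 64-bit lane. They come
   from simd-utils; a plausible definition of the last one, assuming the
   usual shuffle immediate encoding (an illustration, not copied from
   simd-utils/simd-512.h):

   #define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
*/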
+ x0 = _mm512_xor_si512( mm512_rol_32( + mm512_swap256_128( x0 ), 11 ), x1 ); + x1 = mm512_swap64_32( x1 ); + } + + _mm512_store_si512( (__m512i*)sp->x, x0 ); + _mm512_store_si512( (__m512i*)sp->x + 1, x1 ); + +#elif defined(__AVX2__) register __m256i x0, x1, x2, x3, y0, y1; diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index 7dd48e4..f736697 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -186,7 +186,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc { for(i = 0; i < 4; i++) { - _state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); + _state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); } } @@ -390,13 +390,13 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval) } // Store the hash value - _mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]); - _mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]); + _mm_store_si128((__m128i*)hashval + 0, state->state[0][0]); + _mm_store_si128((__m128i*)hashval + 1, state->state[1][0]); if(state->uHashSize == 512) { - _mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]); - _mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]); + _mm_store_si128((__m128i*)hashval + 2, state->state[2][0]); + _mm_store_si128((__m128i*)hashval + 3, state->state[3][0]); } return SUCCESS; @@ -513,13 +513,13 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, } // Store the hash value - _mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] ); - _mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] ); + _mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] ); + _mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] ); if( state->uHashSize == 512 ) { - _mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] ); - _mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] ); + _mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] ); + _mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] ); } return SUCCESS; diff --git a/algo/echo/aes_ni/hash.c.test b/algo/echo/aes_ni/hash.c.test new file mode 100644 index 0000000..41d5a57 --- /dev/null +++ b/algo/echo/aes_ni/hash.c.test @@ -0,0 +1,620 @@ +/* + * file : echo_vperm.c + * version : 1.0.208 + * date : 14.12.2010 + * + * - vperm and aes_ni implementations of hash function ECHO + * - implements NIST hash api + * - assumes that message length is a multiple of 8 bits + * - _ECHO_VPERM_ must be defined if compiling with ../main.c + * - define NO_AES_NI for aes_ni version + * + * Cagdas Calik + * ccalik@metu.edu.tr + * Institute of Applied Mathematics, Middle East Technical University, Turkey. 
+ * + */ +#if defined(__AES__) + +#include +#include "miner.h" +#include "hash_api.h" +//#include "vperm.h" +#include +/* +#ifndef NO_AES_NI +#include +#else +#include +#endif +*/ + +MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F}; +MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC}; +MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1}; +MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C}; +MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1}; +MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8}; +MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09}; +MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79}; +MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8}; +MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170}; +MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1}; +MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363}; +MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6}; +MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b}; +MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e}; +MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e}; +MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515}; +MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c}; +MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601}; +MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06}; +MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b}; + + +MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000}; +MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000}; +MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101}; +MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c}; +MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000}; +MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234}; + + +#define ECHO_SUBBYTES(state, i, j) \ + state[i][j] = _mm_aesenc_si128(state[i][j], k1);\ + state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\ + k1 = _mm_add_epi32(k1, M128(const1)) + +#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \ + s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\ + t1 = _mm_srli_epi16(state1[0][j], 7);\ + t1 = _mm_and_si128(t1, M128(lsbmask));\ + t2 = 
_mm_shuffle_epi8(M128(mul2mask), t1);\ + s2 = _mm_xor_si128(s2, t2);\ + state2[0][j] = s2;\ + state2[1][j] = state1[0][j];\ + state2[2][j] = state1[0][j];\ + state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\ + s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\ + t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\ + t1 = _mm_and_si128(t1, M128(lsbmask));\ + t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ + s2 = _mm_xor_si128(s2, t2);\ + state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\ + state2[1][j] = _mm_xor_si128(state2[1][j], s2);\ + state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\ + state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\ + s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\ + t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\ + t1 = _mm_and_si128(t1, M128(lsbmask));\ + t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ + s2 = _mm_xor_si128(s2, t2);\ + state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\ + state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\ + state2[2][j] = _mm_xor_si128(state2[2][j], s2);\ + state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\ + s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\ + t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\ + t1 = _mm_and_si128(t1, M128(lsbmask));\ + t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ + s2 = _mm_xor_si128(s2, t2);\ + state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\ + state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\ + state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\ + state2[3][j] = _mm_xor_si128(state2[3][j], s2) + + +#define ECHO_ROUND_UNROLL2 \ + ECHO_SUBBYTES(_state, 0, 0);\ + ECHO_SUBBYTES(_state, 1, 0);\ + ECHO_SUBBYTES(_state, 2, 0);\ + ECHO_SUBBYTES(_state, 3, 0);\ + ECHO_SUBBYTES(_state, 0, 1);\ + ECHO_SUBBYTES(_state, 1, 1);\ + ECHO_SUBBYTES(_state, 2, 1);\ + ECHO_SUBBYTES(_state, 3, 1);\ + ECHO_SUBBYTES(_state, 0, 2);\ + ECHO_SUBBYTES(_state, 1, 2);\ + ECHO_SUBBYTES(_state, 2, 2);\ + ECHO_SUBBYTES(_state, 3, 2);\ + ECHO_SUBBYTES(_state, 0, 3);\ + ECHO_SUBBYTES(_state, 1, 3);\ + ECHO_SUBBYTES(_state, 2, 3);\ + ECHO_SUBBYTES(_state, 3, 3);\ + ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES(_state2, 0, 0);\ + ECHO_SUBBYTES(_state2, 1, 0);\ + ECHO_SUBBYTES(_state2, 2, 0);\ + ECHO_SUBBYTES(_state2, 3, 0);\ + ECHO_SUBBYTES(_state2, 0, 1);\ + ECHO_SUBBYTES(_state2, 1, 1);\ + ECHO_SUBBYTES(_state2, 2, 1);\ + ECHO_SUBBYTES(_state2, 3, 1);\ + ECHO_SUBBYTES(_state2, 0, 2);\ + ECHO_SUBBYTES(_state2, 1, 2);\ + ECHO_SUBBYTES(_state2, 2, 2);\ + ECHO_SUBBYTES(_state2, 3, 2);\ + ECHO_SUBBYTES(_state2, 0, 3);\ + ECHO_SUBBYTES(_state2, 1, 3);\ + ECHO_SUBBYTES(_state2, 2, 3);\ + ECHO_SUBBYTES(_state2, 3, 3);\ + ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) + + + +#define SAVESTATE(dst, src)\ + dst[0][0] = src[0][0];\ + dst[0][1] = src[0][1];\ + dst[0][2] = src[0][2];\ + dst[0][3] = src[0][3];\ + dst[1][0] = src[1][0];\ + dst[1][1] = src[1][1];\ + dst[1][2] = src[1][2];\ + dst[1][3] = src[1][3];\ + dst[2][0] = src[2][0];\ + dst[2][1] = src[2][1];\ + dst[2][2] 
= src[2][2];\ + dst[2][3] = src[2][3];\ + dst[3][0] = src[3][0];\ + dst[3][1] = src[3][1];\ + dst[3][2] = src[3][2];\ + dst[3][3] = src[3][3] + + +void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount) +{ + unsigned int r, b, i, j; + __m128i t1, t2, s2, k1; + __m128i _state[4][4], _state2[4][4], _statebackup[4][4]; + + for(i = 0; i < 4; i++) + for(j = 0; j < ctx->uHashSize / 256; j++) + _state[i][j] = ctx->state[i][j]; + + for(b = 0; b < uBlockCount; b++) + { + ctx->k = _mm_add_epi64(ctx->k, ctx->const1536); + + // load message + for(j = ctx->uHashSize / 256; j < 4; j++) + { + for(i = 0; i < 4; i++) + { + _state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); + } + } + +uint64_t *b = (uint64_t*)_state; +//printf("Ss3: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); + + // save state + SAVESTATE(_statebackup, _state); + + k1 = ctx->k; + + for(r = 0; r < ctx->uRounds / 2; r++) + { + ECHO_ROUND_UNROLL2; + } + +//printf("Ss4: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); + + + if(ctx->uHashSize == 256) + { + for(i = 0; i < 4; i++) + { + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]); + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]); + } + } + else + { + for(i = 0; i < 4; i++) + { + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); + _state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); + _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]); + _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]); + } + } + pmsg += ctx->uBlockLength; + } + SAVESTATE(ctx->state, _state); + +} + + + +HashReturn init_echo(hashState_echo *ctx, int nHashSize) +{ + int i, j; + + ctx->k = _mm_setzero_si128(); + ctx->processed_bits = 0; + ctx->uBufferBytes = 0; + + switch(nHashSize) + { + case 256: + ctx->uHashSize = 256; + ctx->uBlockLength = 192; + ctx->uRounds = 8; + ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100); + ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600); + break; + + case 512: + ctx->uHashSize = 512; + ctx->uBlockLength = 128; + ctx->uRounds = 10; + ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200); + ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400); + break; + + default: + return BAD_HASHBITLEN; + } + + + for(i = 0; i < 4; i++) + for(j = 0; j < nHashSize / 256; j++) + ctx->state[i][j] = ctx->hashsize; + + for(i = 0; i < 4; i++) + for(j = nHashSize / 256; j < 4; j++) + ctx->state[i][j] = _mm_set_epi32(0, 0, 0, 0); + + return SUCCESS; +} + +HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen) +{ + unsigned int uByteLength, uBlockCount, uRemainingBytes; + + uByteLength = (unsigned int)(databitlen / 8); + + if((state->uBufferBytes + uByteLength) >= state->uBlockLength) + { + if(state->uBufferBytes != 0) + { + // Fill the buffer + memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes); + + // Process buffer + Compress(state, state->buffer, 1); + state->processed_bits += 
state->uBlockLength * 8; + + data += state->uBlockLength - state->uBufferBytes; + uByteLength -= state->uBlockLength - state->uBufferBytes; + } + + // buffer now does not contain any unprocessed bytes + + uBlockCount = uByteLength / state->uBlockLength; + uRemainingBytes = uByteLength % state->uBlockLength; + + if(uBlockCount > 0) + { + Compress(state, data, uBlockCount); + + state->processed_bits += uBlockCount * state->uBlockLength * 8; + data += uBlockCount * state->uBlockLength; + } + + if(uRemainingBytes > 0) + { + memcpy(state->buffer, (void*)data, uRemainingBytes); + } + + state->uBufferBytes = uRemainingBytes; + } + else + { + memcpy(state->buffer + state->uBufferBytes, (void*)data, uByteLength); + state->uBufferBytes += uByteLength; + } + + return SUCCESS; +} + +HashReturn final_echo(hashState_echo *state, BitSequence *hashval) +{ + __m128i remainingbits; + + // Add remaining bytes in the buffer + state->processed_bits += state->uBufferBytes * 8; + + remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8); + + // Pad with 0x80 + state->buffer[state->uBufferBytes++] = 0x80; + + // Enough buffer space for padding in this block? + if((state->uBlockLength - state->uBufferBytes) >= 18) + { + // Pad with zeros + memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18)); + + // Hash size + *((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize; + + // Processed bits + *((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits; + *((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0; + + // Last block contains message bits? + if(state->uBufferBytes == 1) + { + state->k = _mm_xor_si128(state->k, state->k); + state->k = _mm_sub_epi64(state->k, state->const1536); + } + else + { + state->k = _mm_add_epi64(state->k, remainingbits); + state->k = _mm_sub_epi64(state->k, state->const1536); + } + + // Compress + Compress(state, state->buffer, 1); + } + else + { + // Fill with zero and compress + memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes); + state->k = _mm_add_epi64(state->k, remainingbits); + state->k = _mm_sub_epi64(state->k, state->const1536); + Compress(state, state->buffer, 1); + + // Last block + memset(state->buffer, 0, state->uBlockLength - 18); + + // Hash size + *((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize; + + // Processed bits + *((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits; + *((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0; + + // Compress the last block + state->k = _mm_xor_si128(state->k, state->k); + state->k = _mm_sub_epi64(state->k, state->const1536); + Compress(state, state->buffer, 1); + } + + // Store the hash value + _mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]); + _mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]); + + if(state->uHashSize == 512) + { + _mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]); + _mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]); + } + + return SUCCESS; +} + +HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, + const BitSequence *data, DataLength databitlen ) +{ + unsigned int uByteLength, uBlockCount, uRemainingBytes; + + uByteLength = (unsigned int)(databitlen / 8); + +/* + if( (state->uBufferBytes + uByteLength) >= state->uBlockLength ) + { +printf("full block\n"); + if( state->uBufferBytes != 0 ) + { + // Fill the buffer + memcpy( state->buffer 
+ state->uBufferBytes, + (void*)data, state->uBlockLength - state->uBufferBytes ); + + // Process buffer + Compress( state, state->buffer, 1 ); + state->processed_bits += state->uBlockLength * 8; + + data += state->uBlockLength - state->uBufferBytes; + uByteLength -= state->uBlockLength - state->uBufferBytes; + } + + // buffer now does not contain any unprocessed bytes + + uBlockCount = uByteLength / state->uBlockLength; + uRemainingBytes = uByteLength % state->uBlockLength; + + if( uBlockCount > 0 ) + { + Compress( state, data, uBlockCount ); + state->processed_bits += uBlockCount * state->uBlockLength * 8; + data += uBlockCount * state->uBlockLength; + } + + if( uRemainingBytes > 0 ) + memcpy(state->buffer, (void*)data, uRemainingBytes); + + state->uBufferBytes = uRemainingBytes; + } + else + { +*/ + memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength ); + state->uBufferBytes += uByteLength; +// } + + __m128i remainingbits; + + // Add remaining bytes in the buffer + state->processed_bits += state->uBufferBytes * 8; + + remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 ); + + // Pad with 0x80 + state->buffer[state->uBufferBytes++] = 0x80; + + // Enough buffer space for padding in this block? + +// if( (state->uBlockLength - state->uBufferBytes) >= 18 ) +// { + // Pad with zeros + + memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) ); + + // Hash size + *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize; + + // Processed bits + *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = + state->processed_bits; + *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; + + + // Last block contains message bits? + if( state->uBufferBytes == 1 ) + { + state->k = _mm_xor_si128( state->k, state->k ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + } + else + { + state->k = _mm_add_epi64( state->k, remainingbits ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + } + +uint64_t *b = (uint64_t*)&state->k; +/* +printf("Sk: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); +b = (uint64_t*)state->buffer; +printf("Sb: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); +printf("Sb: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]); +printf("Sb: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]); +printf("Sb: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]); + +b = (uint64_t*)state->state; +printf("Ss1: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); +printf("Ss1: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]); +printf("Ss1: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]); +printf("Ss1: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]); +*/ + // Compress + Compress( state, state->buffer, 1 ); + +//printf("Ss2: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); + + +/* + } + else + { + // Fill with zero and compress + memset( state->buffer + state->uBufferBytes, 0, + state->uBlockLength - state->uBufferBytes ); + state->k = _mm_add_epi64( state->k, remainingbits ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + Compress( state, state->buffer, 1 ); + + // Last block + memset( state->buffer, 0, state->uBlockLength - 18 ); + + // Hash size + *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = + state->uHashSize; + + // Processed bits + *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = + state->processed_bits; + *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; + // 
Compress the last block + state->k = _mm_xor_si128( state->k, state->k ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + Compress( state, state->buffer, 1) ; + } +*/ + + // Store the hash value + _mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] ); + _mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] ); + + if( state->uHashSize == 512 ) + { + _mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] ); + _mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] ); + + } + return SUCCESS; +} + + +HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval) +{ + HashReturn hRet; + hashState_echo hs; + + ///// + /* + __m128i a, b, c, d, t[4], u[4], v[4]; + + a = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100); + b = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110); + c = _mm_set_epi32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120); + d = _mm_set_epi32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130); + + t[0] = _mm_unpacklo_epi8(a, b); + t[1] = _mm_unpackhi_epi8(a, b); + t[2] = _mm_unpacklo_epi8(c, d); + t[3] = _mm_unpackhi_epi8(c, d); + + u[0] = _mm_unpacklo_epi16(t[0], t[2]); + u[1] = _mm_unpackhi_epi16(t[0], t[2]); + u[2] = _mm_unpacklo_epi16(t[1], t[3]); + u[3] = _mm_unpackhi_epi16(t[1], t[3]); + + + t[0] = _mm_unpacklo_epi16(u[0], u[1]); + t[1] = _mm_unpackhi_epi16(u[0], u[1]); + t[2] = _mm_unpacklo_epi16(u[2], u[3]); + t[3] = _mm_unpackhi_epi16(u[2], u[3]); + + u[0] = _mm_unpacklo_epi8(t[0], t[1]); + u[1] = _mm_unpackhi_epi8(t[0], t[1]); + u[2] = _mm_unpacklo_epi8(t[2], t[3]); + u[3] = _mm_unpackhi_epi8(t[2], t[3]); + + a = _mm_unpacklo_epi8(u[0], u[1]); + b = _mm_unpackhi_epi8(u[0], u[1]); + c = _mm_unpacklo_epi8(u[2], u[3]); + d = _mm_unpackhi_epi8(u[2], u[3]); + */ + ///// + + hRet = init_echo(&hs, hashbitlen); + if(hRet != SUCCESS) + return hRet; + + hRet = update_echo(&hs, data, databitlen); + if(hRet != SUCCESS) + return hRet; + + hRet = final_echo(&hs, hashval); + if(hRet != SUCCESS) + return hRet; + + return SUCCESS; +} + +#endif diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index f8f408a..10a4f71 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -1,78 +1,37 @@ -#if defined(__AVX512VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +//#if 0 +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #include "simd-utils.h" #include "echo-hash-4way.h" /* -#include -#include "miner.h" -#include "hash_api.h" -//#include "vperm.h" -#include +static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = +{ + 0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, + 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234 +}; */ -/* -#ifndef NO_AES_NI -#include -#else -#include -#endif -*/ - -// not used -/* -const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F}; -const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC}; -const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1}; -const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C}; -const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1}; -const unsigned int _k_sb2[] = 
{0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8}; -const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09}; -const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79}; -const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8}; -const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170}; -const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1}; -const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363}; -const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6}; -const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b}; -const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e}; -const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e}; -const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515}; -const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c}; -const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601}; -const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06}; -const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b}; -*/ - -/* -MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000}; -MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000}; -MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101}; -MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c}; -MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000}; -*/ - -MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234}; - // do these need to be reversed? 
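/* mul2mask and lsbmask below implement byte-wise GF(2^8) doubling -- the
   AES xtime operation -- inside ECHO_MIXBYTES: _mm512_add_epi8( x, x )
   shifts every byte left one bit, the srli/and pair turns each byte's old
   msb into 0 or 1, and _mm512_shuffle_epi8( mul2mask, t1 ) uses that 0/1
   as a byte index, so byte index 1 of each 128-bit lane must hold the
   reduction constant 0x1b. The _mm512_set4_epi32 ordering chosen below
   provides exactly that: element 0 is the last argument, so every lane
   gets 0x00001b00 in its low 32-bit word. A scalar equivalent for a
   single byte, illustrative only and not part of the patch: */

static inline unsigned char echo_xtime_sketch( unsigned char b )
{
   /* double b in GF(2^8), reducing by the AES polynomial x^8+x^4+x^3+x+1 */
   return (unsigned char)( ( b << 1 ) ^ ( ( b >> 7 ) ? 0x1b : 0 ) );
}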
#define mul2mask \ - m512_const4_32( 0x00001b00, 0, 0, 0 ) + _mm512_set4_epi32( 0, 0, 0, 0x00001b00 ) +// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 ) #define lsbmask m512_const1_32( 0x01010101 ) #define ECHO_SUBBYTES( state, i, j ) \ state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \ state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \ - k1 = _mm512_add_epi32( k1, m512_one_32 ) + k1 = _mm512_add_epi32( k1, m512_one_128 ); #define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \ { \ - const int j1 = ( j+1 ) & 3; \ - const int j2 = ( j+2 ) & 3; \ - const int j3 = ( j+3 ) & 3; \ + const int j1 = ( (j)+1 ) & 3; \ + const int j2 = ( (j)+2 ) & 3; \ + const int j3 = ( (j)+3 ) & 3; \ s2 = _mm512_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \ t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \ - t1 = _mm512_and_si128( t1, lsbmask );\ + t1 = _mm512_and_si512( t1, lsbmask );\ t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ s2 = _mm512_xor_si512( s2, t2 ); \ state2[ 0 ] [j ] = s2; \ @@ -97,7 +56,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \ _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \ - state2[ 2 ][ j ] = _mm512_xor_si512128( state2[ 2 ][ j ], s2 ); \ + state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \ @@ -108,12 +67,12 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \ _mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \ - state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ) + state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \ } while(0) #define ECHO_ROUND_UNROLL2 \ ECHO_SUBBYTES(_state, 0, 0);\ - ECHO_SUBBYTES(_state, 1, 0);\ + ECHO_SUBBYTES(_state, 1, 0);\ ECHO_SUBBYTES(_state, 2, 0);\ ECHO_SUBBYTES(_state, 3, 0);\ ECHO_SUBBYTES(_state, 0, 1);\ @@ -153,8 +112,6 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) - - #define SAVESTATE(dst, src)\ dst[0][0] = src[0][0];\ dst[0][1] = src[0][1];\ @@ -173,33 +130,44 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 dst[3][2] = src[3][2];\ dst[3][3] = src[3][3] - -void echo_4way_compress( echo_4way_context *ctx, const unsigned char *pmsg, +// blockcount always 1 +void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg, unsigned int uBlockCount ) { unsigned int r, b, i, j; __m512i t1, t2, s2, k1; __m512i _state[4][4], _state2[4][4], _statebackup[4][4]; -// unroll - for ( i = 0; i < 4; i++ ) - for ( j = 0; j < ctx->uHashSize / 256; j++ ) - _state[ i ][ j ] = ctx->state[ i ][ j ]; + _state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ]; + _state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ]; + _state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ]; + _state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ]; + _state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ]; + _state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ]; + _state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ]; + _state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ]; + _state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ]; + _state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ]; + _state[ 2 ][ 2 ] = 
ctx->state[ 2 ][ 2 ]; + _state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ]; + _state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ]; + _state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ]; + _state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ]; + _state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ]; for ( b = 0; b < uBlockCount; b++ ) { ctx->k = _mm512_add_epi64( ctx->k, ctx->const1536 ); - // load message, make aligned, remove loadu for( j = ctx->uHashSize / 256; j < 4; j++ ) { for ( i = 0; i < 4; i++ ) { - _state[ i ][ j ] = _mm512_loadu_si512( - (__m512i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i ); + _state[ i ][ j ] = _mm512_load_si512( + pmsg + 4 * (j - (ctx->uHashSize / 256)) + i ); } } - + // save state SAVESTATE( _statebackup, _state ); @@ -254,8 +222,6 @@ void echo_4way_compress( echo_4way_context *ctx, const unsigned char *pmsg, } - - int echo_4way_init( echo_4way_context *ctx, int nHashSize ) { int i, j; @@ -270,23 +236,22 @@ int echo_4way_init( echo_4way_context *ctx, int nHashSize ) ctx->uHashSize = 256; ctx->uBlockLength = 192; ctx->uRounds = 8; - ctx->hashsize = _mm512_const4_32( 0, 0, 0, 0x100 ); - ctx->const1536 = _mm512_const4_32( 0, 0, 0, 0x600 ); + ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 ); + ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 ); break; case 512: ctx->uHashSize = 512; ctx->uBlockLength = 128; ctx->uRounds = 10; - ctx->hashsize = _mm512_const4_32( 0, 0, 0, 0x200 ); - ctx->const1536 = _mm512_const4_32( 0, 0, 0, 0x400); + ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 ); + ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400); break; default: - return BAD_HASHBITLEN; + return 1; } - for( i = 0; i < 4; i++ ) for( j = 0; j < nHashSize / 256; j++ ) ctx->state[ i ][ j ] = ctx->hashsize; @@ -295,263 +260,56 @@ int echo_4way_init( echo_4way_context *ctx, int nHashSize ) for( j = nHashSize / 256; j < 4; j++ ) ctx->state[ i ][ j ] = m512_zero; - return SUCCESS; -} - -int echo_4way_update( echo_4way_context *state, const BitSequence *data, DataLength databitlen ) -{ - unsigned int uByteLength, uBlockCount, uRemainingBytes; - - uByteLength = (unsigned int)(databitlen / 8); - - if ( ( state->uBufferBytes + uByteLength ) >= state->uBlockLength ) - { - if ( state->uBufferBytes != 0 ) - { - // Fill the buffer - memcpy( state->buffer + state->uBufferBytes, - (void*)data, state->uBlockLength - state->uBufferBytes ); - - // Process buffer - echo_4way_compress( state, state->buffer, 1 ); - state->processed_bits += state->uBlockLength * 8; - - data += state->uBlockLength - state->uBufferBytes; - uByteLength -= state->uBlockLength - state->uBufferBytes; - } - - // buffer now does not contain any unprocessed bytes - - uBlockCount = uByteLength / state->uBlockLength; - uRemainingBytes = uByteLength % state->uBlockLength; - - if ( uBlockCount > 0 ) - { - echo_4way_compress( state, data, uBlockCount ); - - state->processed_bits += uBlockCount * state->uBlockLength * 8; - data += uBlockCount * state->uBlockLength; - } - - if ( uRemainingBytes > 0 ) - { - memcpy( state->buffer, (void*)data, uRemainingBytes ); - } - - state->uBufferBytes = uRemainingBytes; - } - else - { - memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength ); - state->uBufferBytes += uByteLength; - } - return 0; } -echo_4way_close( echo_4way_context *state, BitSequence *hashval ) +int echo_4way_update_close( echo_4way_context *state, void *hashval, + const void *data, int databitlen ) { - __m512i remainingbits; +// bytelen is either 32 (maybe), 64 or 80 or 128! +// all are less than full block. 
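/* Length bookkeeping in the 4-way code below is in 128-bit words per lane:
   each __m512i of the interleaved buffer carries one 16-byte word from
   each of the four lanes, so vlen = databitlen/128 is the number of
   message words per lane and vblen = uBlockLength/16 the block size in
   words (8 for the 1024-bit ECHO-512 block). An 80-byte block header, for
   example, gives vlen = 640/128 = 5 of 8 words, leaving room for the 0x80
   pad byte, the hash size and the bit count written below. */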
- // Add remaining bytes in the buffer - state->processed_bits += state->uBufferBytes * 8; + int vlen = databitlen / 128; // * 4 lanes / 128 bits per lane + const int vblen = state->uBlockLength / 16; // 16 bytes per lane + __m512i remainingbits; - remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 ); - - // Pad with 0x80 - state->buffer[ state->uBufferBytes++ ] = 0x80; - - // Enough buffer space for padding in this block? - if ( ( state->uBlockLength - state->uBufferBytes ) >= 18) - { - // Pad with zeros - memset( state->buffer + state->uBufferBytes, 0, - state->uBlockLength - ( state->uBufferBytes + 18 ) ); - - // Hash size - *( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) ) - = state->uHashSize; - - // Processed bits - *( ( DataLength*)( state->buffer + state->uBlockLength - 16 ) ) - = state->processed_bits; - *( ( DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0; - - // Last block contains message bits? - if ( state->uBufferBytes == 1 ) - { - state->k = _mm512_xor_si512( state->k, state->k ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - } - else - { - state->k = _mm512_add_epi64( state->k, remainingbits ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - } - - // Compress - echo_4way_compress( state, state->buffer, 1 ); - } - else - { - // Fill with zero and compress - memset( state->buffer + state->uBufferBytes, 0, - state->uBlockLength - state->uBufferBytes ); - state->k = _mm512_add_epi64( state->k, remainingbits ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - echo_4way_compress( state, state->buffer, 1 ); - - // Last block - memset( state->buffer, 0, state->uBlockLength - 18 ); - - // Hash size - *( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) ) - = state->uHashSize; - - // Processed bits - *( (DataLength*)( state->buffer + state->uBlockLength - 16 ) ) - = state->processed_bits; - *( (DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0; - - // Compress the last block - state->k = _mm512_xor_si512(state->k, state->k); - state->k = _mm512_sub_epi64(state->k, state->const1536); - echo_4way_compress(state, state->buffer, 1); - } - - // Store the hash value - _mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0][ 0 ]); - _mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1][ 0 ]); - - if ( state->uHashSize == 512 ) - { - _mm512_storeu_si512((__m512i*)hashval + 2, state->state[ 2 ][ 0 ]); - _mm512_storeu_si512((__m512i*)hashval + 3, state->state[ 3 ][ 0 ]); - } - - return 0; -} - -int echo_4way_update_close( echo_4way_context *state, BitSequence *hashval, - const BitSequence *data, DataLength databitlen ) -{ - unsigned int uByteLength, uBlockCount, uRemainingBytes; - - uByteLength = (unsigned int)(databitlen / 8); - - if ( (state->uBufferBytes + uByteLength) >= state->uBlockLength ) - { - if ( state->uBufferBytes != 0 ) - { - // Fill the buffer - memcpy( state->buffer + state->uBufferBytes, - (void*)data, state->uBlockLength - state->uBufferBytes ); - - // Process buffer - echo_4way_compress( state, state->buffer, 1 ); - state->processed_bits += state->uBlockLength * 8; - - data += state->uBlockLength - state->uBufferBytes; - uByteLength -= state->uBlockLength - state->uBufferBytes; - } - - // buffer now does not contain any unprocessed bytes - - uBlockCount = uByteLength / state->uBlockLength; - uRemainingBytes = uByteLength % state->uBlockLength; - - if ( uBlockCount > 0 ) - { - echo_4way_compress( state, data, uBlockCount ); - state->processed_bits 
+= uBlockCount * state->uBlockLength * 8; - data += uBlockCount * state->uBlockLength; - } - - if ( uRemainingBytes > 0 ) - memcpy(state->buffer, (void*)data, uRemainingBytes); - state->uBufferBytes = uRemainingBytes; - } - else - { - memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength ); - state->uBufferBytes += uByteLength; - } - - __m512i remainingbits; - - // Add remaining bytes in the buffer - state->processed_bits += state->uBufferBytes * 8; - - remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 ); - - // Pad with 0x80 - state->buffer[ state->uBufferBytes++ ] = 0x80; - // Enough buffer space for padding in this block? - if ( (state->uBlockLength - state->uBufferBytes) >= 18 ) + if ( databitlen == 1024 ) { - // Pad with zeros - memset( state->buffer + state->uBufferBytes, 0,i - state->uBlockLength - (state->uBufferBytes + 18) ); + echo_4way_compress( state, data, 1 ); + state->processed_bits = 1024; + remainingbits = m512_zero; + vlen = 0; + } + else + { + vlen = databitlen / 128; // * 4 lanes / 128 bits per lane + memcpy_512( state->buffer, data, vlen ); + + state->processed_bits += (unsigned int)( databitlen ); + remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen ); - // Hash size - *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) - = state->uHashSize; + } - // Processed bits - *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = - state->processed_bits; - *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; + state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 ); + memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 ); + state->buffer[ vblen-2 ] = + _mm512_set4_epi32( (uint32_t)state->uHashSize << 16, 0, 0, 0 ); + state->buffer[ vblen-1 ] = + _mm512_set4_epi64( 0, state->processed_bits, + 0, state->processed_bits ); - // Last block contains message bits? 
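/* On the k arithmetic in the replacement code below: echo_4way_compress()
   begins every block with k += const1536 (the block size in bits), so
   finalization first adds the message bits actually present
   (remainingbits) and then subtracts const1536 to cancel the increment the
   final compress will apply, leaving k at the true message bit count for
   ECHO's last block. */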
- if( state->uBufferBytes == 1 ) - { - state->k = _mm512_xor_si512( state->k, state->k ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - } - else - { - state->k = _mm_add_epi64( state->k, remainingbits ); - state->k = _mm_sub_epi64( state->k, state->const1536 ); - } + state->k = _mm512_add_epi64( state->k, remainingbits ); + state->k = _mm512_sub_epi64( state->k, state->const1536 ); - // Compress - echo_4way_compress( state, state->buffer, 1 ); - } - else - { - // Fill with zero and compress - memset( state->buffer + state->uBufferBytes, 0, - state->uBlockLength - state->uBufferBytes ); - state->k = _mm512_add_epi64( state->k, remainingbits ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - echo_4way_compress( state, state->buffer, 1 ); + echo_4way_compress( state, state->buffer, 1 ); - // Last block - memset( state->buffer, 0, state->uBlockLength - 18 ); - - // Hash size - *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = - state->uHashSize; - - // Processed bits - *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = - state->processed_bits; - *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; - // Compress the last block - state->k = _mm512_xor_si512( state->k, state->k ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - echo_4way_compress( state, state->buffer, 1) ; - } - - // Store the hash value - _mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] ); - _mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] ); + _mm512_store_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] ); + _mm512_store_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] ); if ( state->uHashSize == 512 ) { - _mm512_storeu_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] ); - _mm512_storeu_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] ); - + _mm512_store_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] ); + _mm512_store_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] ); } return 0; } diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index 2a56aad..10092da 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -73,7 +73,7 @@ __m128i ALL_FF; b5 = a7;\ a6 = _mm_xor_si128(a6, a7);\ a7 = _mm_xor_si128(a7, b6);\ - \ + \ /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ b0 = _mm_xor_si128(b0, a4);\ b6 = _mm_xor_si128(b6, a4);\ @@ -195,7 +195,7 @@ __m128i ALL_FF; for(round_counter = 0; round_counter < 14; round_counter+=2) {\ /* AddRoundConstant P1024 */\ xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ @@ -209,7 +209,6 @@ __m128i ALL_FF; \ /* AddRoundConstant P1024 */\ xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ @@ -218,7 +217,6 @@ __m128i ALL_FF; xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ - /* SubBytes + MixBytes */\ SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ }\ } diff --git a/algo/groestl/aes_ni/hash-groestl.c b/algo/groestl/aes_ni/hash-groestl.c index e77aab9..2adffd9 100644 --- a/algo/groestl/aes_ni/hash-groestl.c +++ b/algo/groestl/aes_ni/hash-groestl.c @@ -230,6 +230,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output, // digest final padding block and do output transform TF1024( ctx->chaining, ctx->buffer ); + OF1024( ctx->chaining ); // store hash result in output diff --git a/algo/groestl/groestl-4way.c b/algo/groestl/groestl-4way.c new file mode 100644 index 0000000..b545146 --- /dev/null +++ b/algo/groestl/groestl-4way.c @@ -0,0 +1,64 @@ +#include "groestl-gate.h" +#include +#include +#include +#include + +#if defined(GROESTL_4WAY_VAES) + +#include "groestl512-hash-4way.h" + +void groestl_4way_hash( void *output, const void *input ) +{ + uint32_t hash[16*4] __attribute__ ((aligned (128))); + groestl512_4way_context ctx; + + groestl512_4way_init( &ctx, 64 ); + groestl512_4way_update_close( &ctx, hash, input, 640 ); + + groestl512_4way_init( &ctx, 64 ); + groestl512_4way_update_close( &ctx, hash, hash, 512 ); + + dintrlv_4x128( output, output+32, output+64, output+96, hash, 256 ); + } + +int scanhash_groestl_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*4] __attribute__ ((aligned (128))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + uint32_t *noncep = vdata + 64+3; // 4*16 + 3 + int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; + + mm512_bswap32_intrlv80_4x128( vdata, pdata ); + + do + { + be32enc( noncep, n ); + be32enc( noncep+ 4, n+1 ); + be32enc( noncep+ 8, n+2 ); + be32enc( noncep+12, n+3 ); + + groestl_4way_hash( hash, vdata ); + pdata[19] = n; + + for ( int lane = 0; lane < 4; lane++ ) + if ( ( hash+(lane<<3) )[7] < Htarg ) + if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, hash+(lane<<3), mythr, lane ); + } + n += 4; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} + +#endif diff --git a/algo/groestl/groestl-gate.c b/algo/groestl/groestl-gate.c new 
file mode 100644 index 0000000..92c79bc --- /dev/null +++ b/algo/groestl/groestl-gate.c @@ -0,0 +1,23 @@ +#include "groestl-gate.h" + +bool register_dmd_gr_algo( algo_gate_t *gate ) +{ +#if defined (GROESTL_4WAY_VAES) + gate->scanhash = (void*)&scanhash_groestl_4way; + gate->hash = (void*)&groestl_4way_hash; +#else + init_groestl_ctx(); + gate->scanhash = (void*)&scanhash_groestl; + gate->hash = (void*)&groestlhash; +#endif + gate->optimizations = AES_OPT | VAES_OPT; + return true; +}; + +bool register_groestl_algo( algo_gate_t* gate ) +{ + register_dmd_gr_algo( gate ); + gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; + return true; +}; + diff --git a/algo/groestl/groestl-gate.h b/algo/groestl/groestl-gate.h new file mode 100644 index 0000000..25551e6 --- /dev/null +++ b/algo/groestl/groestl-gate.h @@ -0,0 +1,31 @@ +#ifndef GROESTL_GATE_H__ +#define GROESTL_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define GROESTL_4WAY_VAES 1 +#endif + +bool register_dmd_gr_algo( algo_gate_t* gate ); + +bool register_groestl_algo( algo_gate_t* gate ); + +#if defined(GROESTL_4WAY_VAES) + +void groestl_4way_hash( void *state, const void *input ); +int scanhash_groestl_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#else + +void groestlhash( void *state, const void *input ); +int scanhash_groestl( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_groestl_ctx(); + +#endif + +#endif + diff --git a/algo/groestl/groestl.c b/algo/groestl/groestl.c index df1c2c3..517dfb8 100644 --- a/algo/groestl/groestl.c +++ b/algo/groestl/groestl.c @@ -1,5 +1,4 @@ -#include "algo-gate-api.h" - +#include "groestl-gate.h" #include #include #include @@ -78,15 +77,12 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce, groestlhash(hash, endiandata); if (hash[7] <= Htarg ) - if ( fulltest(hash, ptarget)) - { + if ( fulltest(hash, ptarget) && !opt_benchmark ) + { pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 1; - } - + submit_solution( work, hash, mythr ); + } nonce++; - } while (nonce < max_nonce && !work_restart[thr_id].restart); pdata[19] = nonce; @@ -94,20 +90,3 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce, return 0; } -bool register_dmd_gr_algo( algo_gate_t* gate ) -{ - init_groestl_ctx(); - gate->optimizations = SSE2_OPT | AES_OPT; - gate->scanhash = (void*)&scanhash_groestl; - gate->hash = (void*)&groestlhash; - opt_target_factor = 256.0; - return true; -}; - -bool register_groestl_algo( algo_gate_t* gate ) -{ - register_dmd_gr_algo( gate ); - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; - return true; -}; - diff --git a/algo/groestl/groestl256-hash-4way.c b/algo/groestl/groestl256-hash-4way.c new file mode 100644 index 0000000..cee3eac --- /dev/null +++ b/algo/groestl/groestl256-hash-4way.c @@ -0,0 +1,280 @@ +/* hash.c Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#include +#include "hash-groestl256.h" +#include "miner.h" +#include "simd-utils.h" + +#ifndef NO_AES_NI + +#include "groestl-version.h" + +#ifdef TASM + #ifdef VAES + #include "groestl256-asm-aes.h" + #else + #ifdef VAVX + #include "groestl256-asm-avx.h" + #else + #ifdef VVPERM + #include "groestl256-asm-vperm.h" + #else + #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) + #endif + #endif + #endif +#else + #ifdef TINTR + #ifdef VAES + #include "groestl256-intr-aes.h" + #else + #ifdef VAVX + #include "groestl256-intr-avx.h" + #else + #ifdef VVPERM + #include "groestl256-intr-vperm.h" + #else + #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) + #endif + #endif + #endif + #else + #error NO TYPE SPECIFIED (-DT[ASM/INTR]) + #endif +#endif + +/* initialise context */ +HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen ) +{ + int i; + + ctx->hashlen = hashlen; + SET_CONSTANTS(); + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + for ( i = 0; i < SIZE256; i++ ) + { + ctx->chaining[i] = _mm_setzero_si128(); + ctx->buffer[i] = _mm_setzero_si128(); + } + ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); + INIT256( ctx->chaining ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + return SUCCESS_GR; +} + + +HashReturn_gr reinit_groestl256(hashState_groestl256* ctx) + { + int i; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + for ( i = 0; i < SIZE256; i++ ) + { + ctx->chaining[i] = _mm_setzero_si128(); + ctx->buffer[i] = _mm_setzero_si128(); + } + ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); + INIT256(ctx->chaining); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + return SUCCESS_GR; +} + +// Use this only for midstate and never for cryptonight +HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input, + DataLength_gr databitlen ) +{ + __m128i* in = (__m128i*)input; + const int len = (int)databitlen / 128; // bits to __m128i + const int blocks = len / SIZE256; // __M128i to blocks + int rem = ctx->rem_ptr; + int i; + + ctx->blk_count = blocks; + ctx->databitlen = databitlen; + + // digest any full blocks + for ( i = 0; i < blocks; i++ ) + TF512( ctx->chaining, &in[ i * SIZE256 ] ); + // adjust buf_ptr to last block + ctx->buf_ptr = blocks * SIZE256; + + // Copy any remainder to buffer + for ( i = 0; i < len % SIZE256; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + // adjust rem_ptr for new data + ctx->rem_ptr += i; + + return SUCCESS_GR; +} + +// don't use this at all +HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output ) +{ + const int len = (int)ctx->databitlen / 128; // bits to __m128i + const int blocks = ctx->blk_count + 1; // adjust for final block + const int rem_ptr = ctx->rem_ptr; // end of data start of padding + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE256 - hashlen_m128i; // where in buffer + int i; + + // first pad byte = 0x80, last pad byte = block count + // everything in between is zero + + if ( rem_ptr == len - 1 ) + { + // all padding at once + ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0x80 ); + } + else + { + // add first padding + ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0x80 ); + // add zero padding + for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ ) + ctx->buffer[i] = _mm_setzero_si128(); + // add length padding + // cheat since we know the block count is 
trivial, good if block < 256 + ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0 ); + } + + // digest final padding block and do output transform + TF512( ctx->chaining, ctx->buffer ); + OF512( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m128i( output, i ) = ctx->chaining[ hash_offset + i]; + + return SUCCESS_GR; +} + +HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx, + void* output, const void* input, DataLength_gr databitlen ) +{ + const int len = (int)databitlen / 128; + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE256 - hashlen_m128i; + int rem = ctx->rem_ptr; + int blocks = len / SIZE256; + __m128i* in = (__m128i*)input; + int i; + + // --- update --- + + // digest any full blocks, process directly from input + for ( i = 0; i < blocks; i++ ) + TF512( ctx->chaining, &in[ i * SIZE256 ] ); + ctx->buf_ptr = blocks * SIZE256; + + // cryptonight has 200 byte input, an odd number of __m128i + // remainder is only 8 bytes, ie u64. + if ( databitlen % 128 !=0 ) + { + // must be cryptonight, copy 64 bits of data + *(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] ); + i = -1; // signal for odd length + } + else + { + // Copy any remaining data to buffer for final transform + for ( i = 0; i < len % SIZE256; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final + } + + //--- final --- + + // adjust for final block + blocks++; + + if ( i == len - 1 ) + { + // all padding at once + ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0, + 0, 0,0,0, 0,0,0,0x80 ); + } + else + { + if ( i == -1 ) + { + // cryptonight odd length + ((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull; + // finish the block with zero and length padding as normal + i = 0; + } + else + { + // add first padding + ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0x80 ); + } + // add zero padding + for ( i += 1; i < SIZE256 - 1; i++ ) + ctx->buffer[i] = _mm_setzero_si128(); + // add length padding + // cheat since we know the block count is trivial, good if block < 256 + ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0, + 0, 0,0,0, 0,0,0,0 ); + } + + // digest final padding block and do output transform + TF512( ctx->chaining, ctx->buffer ); + OF512( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return SUCCESS_GR; +} + +/* hash bit sequence */ +HashReturn_gr hash_groestl256(int hashbitlen, + const BitSequence_gr* data, + DataLength_gr databitlen, + BitSequence_gr* hashval) { + HashReturn_gr ret; + hashState_groestl256 context; + + /* initialise */ + if ((ret = init_groestl256(&context, hashbitlen/8)) != SUCCESS_GR) + return ret; + + /* process message */ + if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR) + return ret; + + /* finalise */ + ret = final_groestl256(&context, hashval); + + return ret; +} + +/* eBash API */ +//#ifdef crypto_hash_BYTES +//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen) +//{ +// if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0; +// return -1; +//} +//#endif + +#endif diff --git a/algo/groestl/groestl256-hash-4way.h b/algo/groestl/groestl256-hash-4way.h new file mode 100644 index 0000000..f82c1de --- /dev/null +++ b/algo/groestl/groestl256-hash-4way.h @@ -0,0 +1,121 @@ +/* 
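A short scalar model of the padding that final_groestl256 and update_and_final_groestl256 build above may help here. Groestl pads with a single 0x80 byte, a zero fill, and a 64-bit big-endian block counter in the last bytes of the final 64-byte block; the code "cheats" by writing only the low bytes of that counter, which holds while the block count stays small. The sketch below is illustrative only, under those assumptions; groestl256_pad_block is not a function in the patch.

#include <stdint.h>
#include <string.h>

/* Illustrative scalar padding for one 64-byte Groestl-256 block.
   rem = message bytes already in blk (assumed <= 61 here so the pad
   byte and the counter do not collide); blocks = total block count
   including this final block. Like update_and_final_groestl256, only
   the low 16 bits of the big-endian counter are written, which is
   valid while blocks < 65536. */
static void groestl256_pad_block( uint8_t blk[64], size_t rem, uint32_t blocks )
{
   blk[rem] = 0x80;                       /* first pad byte */
   memset( blk + rem + 1, 0, 63 - rem );  /* zero fill to end of block */
   blk[62] = (uint8_t)( blocks >> 8 );    /* big-endian block counter, */
   blk[63] = (uint8_t)blocks;             /* low bytes only */
}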
hash.h Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#ifndef __hash_h +#define __hash_h + +#include +#include +#if defined(_WIN64) || defined(__WINDOWS__) +#include +#endif +#include + +/* eBash API begin */ +/* +#include "crypto_hash.h" +#ifdef crypto_hash_BYTES + +#include +#include +#include +typedef crypto_uint8 u8; +typedef crypto_uint32 u32; +typedef crypto_uint64 u64; +#endif + */ +/* eBash API end */ + +//#define LENGTH (512) + +#include "brg_endian.h" +#define NEED_UINT_64T +#include "algo/sha/brg_types.h" + +#ifdef IACA_TRACE + #include IACA_MARKS +#endif + +#define LENGTH (256) + +/* some sizes (number of bytes) */ +#define ROWS (8) +#define LENGTHFIELDLEN (ROWS) +#define COLS512 (8) +//#define COLS1024 (16) +#define SIZE_512 ((ROWS)*(COLS512)) +//#define SIZE1024 ((ROWS)*(COLS1024)) +#define ROUNDS512 (10) +//#define ROUNDS1024 (14) + +//#if LENGTH<=256 +#define COLS (COLS512) +//#define SIZE (SIZE512) +#define ROUNDS (ROUNDS512) +//#else +//#define COLS (COLS1024) +//#define SIZE (SIZE1024) +//#define ROUNDS (ROUNDS1024) +//#endif + +#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) + +#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) +#define U64BIG(a) (a) +#endif /* IS_BIG_ENDIAN */ + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) +#define U64BIG(a) \ + ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ + (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ + (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ + (ROTL64(a,56) & li_64(FF000000FF000000))) +#endif /* IS_LITTLE_ENDIAN */ + +typedef unsigned char BitSequence_gr; +typedef unsigned long long DataLength_gr; +typedef enum +{ + SUCCESS_GR = 0, + FAIL_GR = 1, + BAD_HASHBITLEN_GR = 2 +} HashReturn_gr; + +#define SIZE256 (SIZE_512/16) + +typedef struct { + __attribute__ ((aligned (32))) __m128i chaining[SIZE256]; + __attribute__ ((aligned (32))) __m128i buffer[SIZE256]; +// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */ +// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */ +// u64 block_counter; /* message block counter */ + int hashlen; // bytes + int blk_count; + int buf_ptr; /* data buffer pointer */ + int rem_ptr; + int databitlen; +} hashState_groestl256; + +HashReturn_gr init_groestl256( hashState_groestl256*, int ); + +HashReturn_gr reinit_groestl256( hashState_groestl256* ); + +HashReturn_gr update_groestl256( hashState_groestl256*, const void*, + DataLength_gr ); + +HashReturn_gr final_groestl256( hashState_groestl256*, void* ); + +HashReturn_gr hash_groestl256( int, const BitSequence_gr*, DataLength_gr, + BitSequence_gr* ); + +HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*, + const void*, DataLength_gr ); + +#endif /* __hash_h */ diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h new file mode 100644 index 0000000..57dd930 --- /dev/null +++ b/algo/groestl/groestl256-intr-4way.h @@ -0,0 +1,492 @@ +/* groestl-intr-aes.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, and aes + * instructions. + * Author: Günther A.
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include +#include +#include "hash-groestl256.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +//__m128i ROUND_CONST_P[ROUNDS1024]; +//__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_1B; +__m128i ALL_FF; + + +#define tos(a) #a +#define tostr(a) tos(a) + + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + j = _mm_xor_si128(j, j);\ + j = _mm_cmpgt_epi8(j, i);\ + i = _mm_add_epi8(i, i);\ + j = _mm_and_si128(j, k);\ + i = _mm_xor_si128(i, j);\ +} + + /**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + a2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b6 = _mm_xor_si128(b6, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b0 = _mm_xor_si128(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm_xor_si128(b3, a7);\ + b1 = _mm_xor_si128(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm_xor_si128(b4, a0);\ + b2 = _mm_xor_si128(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm_xor_si128(b5, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + b1 = a1;\ + b6 = _mm_xor_si128(b6, a2);\ + b4 = _mm_xor_si128(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm_xor_si128(b7, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = ALL_1B;\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); \ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, 
xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = i0;\ + t0 = i2;\ + \ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ + \ + o2 = i0;\ + o3 = o1;\ + \ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = _mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = 
_mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +void INIT256( __m128i* chaining ) +{ + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512( __m128i* chaining, __m128i* message ) +{ + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, 
(chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512( __m128i* chaining ) +{ + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + + diff --git a/algo/groestl/groestl512-hash-4way.c b/algo/groestl/groestl512-hash-4way.c new file mode 100644 index 0000000..96389f8 --- /dev/null +++ b/algo/groestl/groestl512-hash-4way.c @@ -0,0 +1,114 @@ +/* hash.c Aug 2011 + * groestl512-hash-4way https://github.com/JayDDee/cpuminer-opt 2019-12. + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +// Optimized for hash and data length that are integral multiples of __m128i + + +#include +#include "groestl512-intr-4way.h" +#include "miner.h" +#include "simd-utils.h" + +#if defined(__VAES__) + +#define ROTL64(a,n) \ + ( ( ( (a)<<(n) ) | ( (a) >> (64-(n)) ) ) & 0xffffffffffffffff ) + +#define U64BIG(a) \ + ( ( ROTL64(a, 8) & 0x000000FF000000FF ) | \ + ( ROTL64(a,24) & 0x0000FF000000FF00 ) | \ + ( ROTL64(a,40) & 0x00FF000000FF0000 ) | \ + ( ROTL64(a,56) & 0xFF000000FF000000 ) ) + +int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen ) +{ + int i; + + ctx->hashlen = hashlen; + SET_CONSTANTS(); + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return 1; + + for ( i = 0; i < SIZE512; i++ ) + { + ctx->chaining[i] = m512_zero; + ctx->buffer[i] = m512_zero; + } + + uint64_t len = U64BIG((uint64_t)LENGTH); + ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 ); + INIT_4way(ctx->chaining); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + return 0; +} + +int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output, + const void* input, uint64_t databitlen ) +{ + const int len = (int)databitlen / 128; + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; + int rem = ctx->rem_ptr; + int blocks = len / SIZE512; + __m512i* in = (__m512i*)input; + int i; + + // --- update --- + + // digest any full blocks, process directly from input + for ( i = 0; i < blocks; i++ ) + TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] ); + ctx->buf_ptr = blocks * SIZE512; + + // copy any remaining data to buffer, it may already contain data + // from a previous update for a midstate precalc + for ( i = 0; i < len % SIZE512; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final + + //--- final --- + + blocks++; // adjust for final block + + if ( i == SIZE512 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m512_const1_128( _mm_set_epi8( + blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + } + else + { + // add first padding + ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); + // add zero padding + for ( i += 1; i < SIZE512 - 1; i++ ) + ctx->buffer[i] = m512_zero; + + // add length padding, second last byte is zero unless blocks > 255 + ctx->buffer[i] = m512_const1_128( _mm_set_epi8( + blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + } + + // digest final padding block and do output transform + TF1024_4way( ctx->chaining, ctx->buffer ); + + OF1024_4way( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} + +#endif // VAES + diff --git a/algo/groestl/groestl512-hash-4way.h b/algo/groestl/groestl512-hash-4way.h new file mode 100644 index 0000000..ab3acc6 --- /dev/null +++ b/algo/groestl/groestl512-hash-4way.h @@ -0,0 +1,94 @@ +/* hash.h Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A.
Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#if !defined(GROESTL512_HASH_4WAY_H__) +#define GROESTL512_HASH_4WAY_H__ 1 + +#include "simd-utils.h" +#include +#include +#include +#if defined(_WIN64) || defined(__WINDOWS__) +#include +#endif +#include + +#define LENGTH (512) + +//#include "brg_endian.h" +//#define NEED_UINT_64T +//#include "algo/sha/brg_types.h" + +/* some sizes (number of bytes) */ +#define ROWS (8) +#define LENGTHFIELDLEN (ROWS) +//#define COLS512 (8) +#define COLS1024 (16) +//#define SIZE512 ((ROWS)*(COLS512)) +#define SIZE_1024 ((ROWS)*(COLS1024)) +//#define ROUNDS512 (10) +#define ROUNDS1024 (14) + +//#if LENGTH<=256 +//#define COLS (COLS512) +//#define SIZE (SIZE512) +//#define ROUNDS (ROUNDS512) +//#else +#define COLS (COLS1024) +//#define SIZE (SIZE1024) +#define ROUNDS (ROUNDS1024) +//#endif + +/* +#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) + +#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) +#define U64BIG(a) (a) +#endif // IS_BIG_ENDIAN + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) +#define U64BIG(a) \ + ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ + (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ + (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ + (ROTL64(a,56) & li_64(FF000000FF000000))) +#endif // IS_LITTLE_ENDIAN + +typedef unsigned char BitSequence_gr; +typedef unsigned long long DataLength_gr; +typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr; +*/ + +#define SIZE512 (SIZE_1024/16) + +typedef struct { + __attribute__ ((aligned (128))) __m512i chaining[SIZE512]; + __attribute__ ((aligned (64))) __m512i buffer[SIZE512]; + int hashlen; // byte + int blk_count; // SIZE_m128i + int buf_ptr; // __m128i offset + int rem_ptr; + int databitlen; // bits +} groestl512_4way_context; + + +int groestl512_4way_init( groestl512_4way_context*, uint64_t ); + +//int reinit_groestl( hashState_groestl* ); + +int groestl512_4way_update( groestl512_4way_context*, const void*, + uint64_t ); + +int groestl512_4way_close( groestl512_4way_context*, void* ); + +int groestl512_4way_update_close( groestl512_4way_context*, void*, + const void*, uint64_t ); + +#endif /* __hash_h */ diff --git a/algo/groestl/groestl512-intr-4way.h b/algo/groestl/groestl512-intr-4way.h new file mode 100644 index 0000000..e8c243a --- /dev/null +++ b/algo/groestl/groestl512-intr-4way.h @@ -0,0 +1,654 @@ +/* groestl-intr-aes.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, and aes + * instructions. + * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + + +#if !defined(GROESTL512_INTR_4WAY_H__) +#define GROESTL512_INTR_4WAY_H__ 1 + +#include "groestl512-hash-4way.h" + +#if defined(__VAES__) + +/* global constants */ +__m512i ROUND_CONST_Lx; +//__m128i ROUND_CONST_L0[ROUNDS512]; +//__m128i ROUND_CONST_L7[ROUNDS512]; +__m512i ROUND_CONST_P[ROUNDS1024]; +__m512i ROUND_CONST_Q[ROUNDS1024]; +__m512i TRANSP_MASK; +__m512i SUBSH_MASK[8]; +__m512i ALL_1B; +__m512i ALL_FF; + +#define tos(a) #a +#define tostr(a) tos(a) + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + j = _mm512_xor_si512(j, j);\ + j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\ + i = _mm512_add_epi8(i, i);\ + j = _mm512_and_si512(j, k);\ + i = _mm512_xor_si512(i, j);\ +} + + /**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm512_xor_si512(a0, a1);\ + b0 = a2;\ + a1 = _mm512_xor_si512(a1, a2);\ + b1 = a3;\ + a2 = _mm512_xor_si512(a2, a3);\ + b2 = a4;\ + a3 = _mm512_xor_si512(a3, a4);\ + b3 = a5;\ + a4 = _mm512_xor_si512(a4, a5);\ + b4 = a6;\ + a5 = _mm512_xor_si512(a5, a6);\ + b5 = a7;\ + a6 = _mm512_xor_si512(a6, a7);\ + a7 = _mm512_xor_si512(a7, b6);\ + \ + /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm512_xor_si512(b0, a4);\ + b6 = _mm512_xor_si512(b6, a4);\ + b1 = _mm512_xor_si512(b1, a5);\ + b7 = _mm512_xor_si512(b7, a5);\ + b2 = _mm512_xor_si512(b2, a6);\ + b0 = _mm512_xor_si512(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm512_xor_si512(b3, a7);\ + b1 = _mm512_xor_si512(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm512_xor_si512(b4, a0);\ + b2 = _mm512_xor_si512(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm512_xor_si512(b5, a1);\ + b3 = _mm512_xor_si512(b3, a1);\ + b1 = a1;\ + b6 = _mm512_xor_si512(b6, a2);\ + b4 = _mm512_xor_si512(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm512_xor_si512(b7, a3);\ + b5 = _mm512_xor_si512(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm512_xor_si512(a0, a3);\ + a1 = _mm512_xor_si512(a1, a4);\ + a2 = _mm512_xor_si512(a2, a5);\ + a3 = _mm512_xor_si512(a3, a6);\ + a4 = _mm512_xor_si512(a4, a7);\ + a5 = _mm512_xor_si512(a5, b0);\ + a6 = _mm512_xor_si512(a6, b1);\ + a7 = _mm512_xor_si512(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = ALL_1B;\ + MUL2(a0, b0, b1);\ + a0 = _mm512_xor_si512(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm512_xor_si512(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm512_xor_si512(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm512_xor_si512(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm512_xor_si512(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm512_xor_si512(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm512_xor_si512(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm512_xor_si512(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2(a0, b0, b1);\ + b5 = _mm512_xor_si512(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm512_xor_si512(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm512_xor_si512(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm512_xor_si512(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm512_xor_si512(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm512_xor_si512(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm512_xor_si512(b0, a3);\ + b1 = _mm512_xor_si512(b1, a4);\ +}/*MixBytes*/ + +// calculate the round constants seperately and load at startup + +#define SET_CONSTANTS(){\ + ALL_FF = _mm512_set1_epi32( 0xffffffff );\ + ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\ + TRANSP_MASK = _mm512_set_epi32( \ + 0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \ + 0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \ + 0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \ + 0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \ + SUBSH_MASK[0] = _mm512_set_epi32( \ + 0x3336393c, 0x3f323538, 0x3b3e3134, 0x373a3d30, \ + 0x2326292c, 0x2f222528, 0x2b2e2124, 0x272a2d20, \ + 0x1316191c, 0x1f121518, 0x1b1e1114, 0x171a1d10, \ + 0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00 ); \ + SUBSH_MASK[1] = _mm512_set_epi32( \ + 0x34373a3d, 0x30333639, 0x3c3f3235, 0x383b3e31, \ + 0x24272a2d, 0x20232629, 0x2c2f2225, 0x282b2e21, \ + 0x14171a1d, 0x10131619, 0x1c1f1215, 0x181b1e11, \ + 0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01 ); \ + SUBSH_MASK[2] = _mm512_set_epi32( \ + 0x35383b3e, 0x3134373a, 0x3d303336, 0x393c3f32, \ + 0x25282b2e, 0x2124272a, 0x2d202326, 0x292c2f22, \ + 0x15181b1e, 0x1114171a, 0x1d101316, 0x191c1f12, \ + 0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02 ); \ + SUBSH_MASK[3] = _mm512_set_epi32( \ + 0x36393c3f, 0x3235383b, 0x3e313437, 0x3a3d3033, \ + 0x26292c2f, 0x2225282b, 0x2e212427, 0x2a2d2023, \ + 0x16191c1f, 0x1215181b, 0x1e111417, 
0x1a1d1013, \ + 0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003 ); \ + SUBSH_MASK[4] = _mm512_set_epi32( \ + 0x373a3d30, 0x3336393c, 0x3f323538, 0x3b3e3134, \ + 0x272a2d20, 0x2326292c, 0x2f222528, 0x2b2e2124, \ + 0x171a1d10, 0x1316191c, 0x1f121518, 0x1b1e1114, \ + 0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104 ); \ + SUBSH_MASK[5] = _mm512_set_epi32( \ + 0x383b3e31, 0x34373a3d, 0x30333639, 0x3c3f3235, \ + 0x282b2e21, 0x24272a2d, 0x20232629, 0x2c2f2225, \ + 0x181b1e11, 0x14171a1d, 0x10131619, 0x1c1f1215, \ + 0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205 ); \ + SUBSH_MASK[6] = _mm512_set_epi32( \ + 0x393c3f32, 0x35383b3e, 0x3134373a, 0x3d303336, \ + 0x292c2f22, 0x25282b2e, 0x2124272a, 0x2d202326, \ + 0x191c1f12, 0x15181b1e, 0x1114171a, 0x1d101316, \ + 0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306 ); \ + SUBSH_MASK[7] = _mm512_set_epi32( \ + 0x3e313437, 0x3a3d3033, 0x36393c3f, 0x3235383b, \ + 0x2e212427, 0x2a2d2023, 0x26292c2f, 0x2225282b, \ + 0x1e111417, 0x1a1d1013, 0x16191c1f, 0x1215181b, \ + 0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b ); \ + for( i = 0; i < ROUNDS1024; i++ ) \ + { \ + ROUND_CONST_P[i] = _mm512_set4_epi32( 0xf0e0d0c0 ^ (i * 0x01010101), \ + 0xb0a09080 ^ (i * 0x01010101), \ + 0x70605040 ^ (i * 0x01010101), \ + 0x30201000 ^ (i * 0x01010101) ); \ + ROUND_CONST_Q[i] = _mm512_set4_epi32( 0x0f1f2f3f ^ (i * 0x01010101), \ + 0x4f5f6f7f ^ (i * 0x01010101), \ + 0x8f9fafbf ^ (i * 0x01010101), \ + 0xcfdfefff ^ (i * 0x01010101));\ + } \ +}while(0);\ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes */\ + b0 = _mm512_xor_si512( b0, b0 );\ + a0 = _mm512_aesenclast_epi128( a0, b0 );\ + a1 = _mm512_aesenclast_epi128( a1, b0 );\ + a2 = _mm512_aesenclast_epi128( a2, b0 );\ + a3 = _mm512_aesenclast_epi128( a3, b0 );\ + a4 = _mm512_aesenclast_epi128( a4, b0 );\ + a5 = _mm512_aesenclast_epi128( a5, b0 );\ + a6 = _mm512_aesenclast_epi128( a6, b0 );\ + a7 = _mm512_aesenclast_epi128( a7, b0 );\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P(){\ + uint8_t round_counter = 0;\ + for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \ + { \ + /* AddRoundConstant P1024 */\ + xmm8 = _mm512_xor_si512( xmm8, ( ROUND_CONST_P[ round_counter ] ) );\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[0] ) );\ + xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[1] ) );\ + xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[2] ) );\ + xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[3] ) );\ + xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[4] ) );\ + xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[5] ) );\ + xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[6] ) );\ + xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[7] ) );\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant P1024 */\ + xmm0 = _mm512_xor_si512( xmm0, ( ROUND_CONST_P[ round_counter+1 ] ) );\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[0] ) );\ + xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[1] ) );\ + xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[2] ) );\ + xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[3] ) );\ + xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[4] ) );\ + xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[5] ) );\ + xmm6 = 
_mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[6] ) );\ + xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[7] ) );\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +#define ROUNDS_Q(){\ + uint8_t round_counter = 0;\ + for ( round_counter = 0; round_counter < 14; round_counter += 2) \ + { \ + /* AddRoundConstant Q1024 */\ + xmm1 = ALL_FF;\ + xmm8 = _mm512_xor_si512( xmm8, xmm1 );\ + xmm9 = _mm512_xor_si512( xmm9, xmm1 );\ + xmm10 = _mm512_xor_si512( xmm10, xmm1 );\ + xmm11 = _mm512_xor_si512( xmm11, xmm1 );\ + xmm12 = _mm512_xor_si512( xmm12, xmm1 );\ + xmm13 = _mm512_xor_si512( xmm13, xmm1 );\ + xmm14 = _mm512_xor_si512( xmm14, xmm1 );\ + xmm15 = _mm512_xor_si512( xmm15, ( ROUND_CONST_Q[ round_counter ] ) );\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[1] ) );\ + xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[3] ) );\ + xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[5] ) );\ + xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[7] ) );\ + xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[0] ) );\ + xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[2] ) );\ + xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[4] ) );\ + xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[6] ) );\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant Q1024 */\ + xmm9 = ALL_FF;\ + xmm0 = _mm512_xor_si512( xmm0, xmm9 );\ + xmm1 = _mm512_xor_si512( xmm1, xmm9 );\ + xmm2 = _mm512_xor_si512( xmm2, xmm9 );\ + xmm3 = _mm512_xor_si512( xmm3, xmm9 );\ + xmm4 = _mm512_xor_si512( xmm4, xmm9 );\ + xmm5 = _mm512_xor_si512( xmm5, xmm9 );\ + xmm6 = _mm512_xor_si512( xmm6, xmm9 );\ + xmm7 = _mm512_xor_si512( xmm7, ( ROUND_CONST_Q[ round_counter+1 ] ) );\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[1] ) );\ + xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[3] ) );\ + xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[5] ) );\ + xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[7] ) );\ + xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[0] ) );\ + xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[2] ) );\ + xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[4] ) );\ + xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[6] ) );\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK;\ +\ + i6 = _mm512_shuffle_epi8(i6, t0);\ + i0 = _mm512_shuffle_epi8(i0, t0);\ + i1 = _mm512_shuffle_epi8(i1, t0);\ + i2 = _mm512_shuffle_epi8(i2, t0);\ + i3 = _mm512_shuffle_epi8(i3, t0);\ + t1 = i2;\ + i4 = _mm512_shuffle_epi8(i4, t0);\ + i5 = _mm512_shuffle_epi8(i5, t0);\ + t2 = i4;\ + t3 = i6;\ + i7 = _mm512_shuffle_epi8(i7, t0);\ +\ + /* continue with unpack using 4 temp registers */\ + t0 = i0;\ + t2 = _mm512_unpackhi_epi16(t2, i5);\ + i4 = _mm512_unpacklo_epi16(i4, i5);\ + t3 = _mm512_unpackhi_epi16(t3, i7);\ + i6 = _mm512_unpacklo_epi16(i6, i7);\ + t0 = _mm512_unpackhi_epi16(t0, i1);\ + t1 = _mm512_unpackhi_epi16(t1, i3);\ + i2 = _mm512_unpacklo_epi16(i2, i3);\ + i0 = 
_mm512_unpacklo_epi16(i0, i1);\ +\ + /* shuffle with immediate */\ + t0 = _mm512_shuffle_epi32(t0, 216);\ + t1 = _mm512_shuffle_epi32(t1, 216);\ + t2 = _mm512_shuffle_epi32(t2, 216);\ + t3 = _mm512_shuffle_epi32(t3, 216);\ + i0 = _mm512_shuffle_epi32(i0, 216);\ + i2 = _mm512_shuffle_epi32(i2, 216);\ + i4 = _mm512_shuffle_epi32(i4, 216);\ + i6 = _mm512_shuffle_epi32(i6, 216);\ +\ + /* continue with unpack */\ + t4 = i0;\ + i0 = _mm512_unpacklo_epi32(i0, i2);\ + t4 = _mm512_unpackhi_epi32(t4, i2);\ + t5 = t0;\ + t0 = _mm512_unpacklo_epi32(t0, t1);\ + t5 = _mm512_unpackhi_epi32(t5, t1);\ + t6 = i4;\ + i4 = _mm512_unpacklo_epi32(i4, i6);\ + t7 = t2;\ + t6 = _mm512_unpackhi_epi32(t6, i6);\ + i2 = t0;\ + t2 = _mm512_unpacklo_epi32(t2, t3);\ + i3 = t0;\ + t7 = _mm512_unpackhi_epi32(t7, t3);\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = i0;\ + i1 = _mm512_unpackhi_epi64(i1, i4);\ + i0 = _mm512_unpacklo_epi64(i0, i4);\ + i4 = t4;\ + i3 = _mm512_unpackhi_epi64(i3, t2);\ + i5 = t4;\ + i2 = _mm512_unpacklo_epi64(i2, t2);\ + i6 = t5;\ + i5 = _mm512_unpackhi_epi64(i5, t6);\ + i7 = t5;\ + i4 = _mm512_unpacklo_epi64(i4, t6);\ + i7 = _mm512_unpackhi_epi64(i7, t7);\ + i6 = _mm512_unpacklo_epi64(i6, t7);\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + o1 = i0;\ + i0 = _mm512_unpacklo_epi64(i0, i1);\ + o1 = _mm512_unpackhi_epi64(o1, i1);\ + t0 = i2;\ + i2 = _mm512_unpacklo_epi64(i2, i3);\ + t0 = _mm512_unpackhi_epi64(t0, i3);\ + t1 = i4;\ + i4 = _mm512_unpacklo_epi64(i4, i5);\ + t1 = _mm512_unpackhi_epi64(t1, i5);\ + t2 = i6;\ + o0 = TRANSP_MASK;\ + i6 = _mm512_unpacklo_epi64(i6, i7);\ + t2 = _mm512_unpackhi_epi64(t2, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm512_shuffle_epi8(i0, o0);\ + i2 = _mm512_shuffle_epi8(i2, o0);\ + i4 = _mm512_shuffle_epi8(i4, o0);\ + i6 = _mm512_shuffle_epi8(i6, o0);\ + o1 = _mm512_shuffle_epi8(o1, o0);\ + t0 = _mm512_shuffle_epi8(t0, o0);\ + t1 = _mm512_shuffle_epi8(t1, o0);\ + t2 = _mm512_shuffle_epi8(t2, o0);\ + /* continue with unpack using 4 temp registers */\ + t3 = i4;\ + o2 = o1;\ + o0 = i0;\ + t4 = t1;\ + \ + t3 = _mm512_unpackhi_epi16(t3, i6);\ + i4 = _mm512_unpacklo_epi16(i4, i6);\ + o0 = _mm512_unpackhi_epi16(o0, i2);\ + i0 = _mm512_unpacklo_epi16(i0, i2);\ + o2 = _mm512_unpackhi_epi16(o2, t0);\ + o1 = _mm512_unpacklo_epi16(o1, t0);\ + t4 = _mm512_unpackhi_epi16(t4, t2);\ + t1 = _mm512_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm512_shuffle_epi32(i4, 216);\ + t3 = _mm512_shuffle_epi32(t3, 216);\ + o1 = _mm512_shuffle_epi32(o1, 216);\ + o2 = _mm512_shuffle_epi32(o2, 216);\ + i0 = _mm512_shuffle_epi32(i0, 216);\ + o0 = _mm512_shuffle_epi32(o0, 216);\ + t1 = _mm512_shuffle_epi32(t1, 216);\ + t4 = _mm512_shuffle_epi32(t4, 216);\ + /* continue with unpack */\ + i1 = i0;\ + i3 = o0;\ + i5 = o1;\ + i7 = o2;\ + i0 = _mm512_unpacklo_epi32(i0, i4);\ + i1 = _mm512_unpackhi_epi32(i1, i4);\ + o0 = _mm512_unpacklo_epi32(o0, t3);\ + i3 = _mm512_unpackhi_epi32(i3, t3);\ + o1 = _mm512_unpacklo_epi32(o1, t1);\ + i5 = _mm512_unpackhi_epi32(i5, t1);\ + o2 = _mm512_unpacklo_epi32(o2, t4);\ + i7 = 
_mm512_unpackhi_epi32(i7, t4);\ + /* transpose done */\ +}/**/ + + +void INIT_4way( __m512i* chaining ) +{ + static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024_4way( __m512i* chaining, const __m512i* message ) +{ + static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m512i QTEMP[8]; + static __m512i TEMP0; + static __m512i TEMP1; + static __m512i TEMP2; + + /* load message into registers xmm8 - xmm15 (Q = message) */ + xmm8 = message[0]; + xmm9 = message[1]; + xmm10 = message[2]; + xmm11 = message[3]; + xmm12 = message[4]; + xmm13 = message[5]; + xmm14 = message[6]; + xmm15 = message[7]; + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store message M (Q input) for later */ + QTEMP[0] = xmm8; + QTEMP[1] = xmm9; + QTEMP[2] = xmm10; + QTEMP[3] = xmm11; + QTEMP[4] = xmm12; + QTEMP[5] = xmm13; + QTEMP[6] = xmm14; + QTEMP[7] = xmm15; + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) ); + xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) ); + xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) ); + xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) ); + xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) ); + xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) ); + xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) ); + xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) ); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) ); + xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) ); + xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) ); + xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) ); + xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) ); + xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) ); + xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) ); + xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) ); + + /* store P(CV+M)+CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + /* load message M (Q input) into xmm8-15 */ + xmm8 = QTEMP[0]; + xmm9 = QTEMP[1]; + xmm10 = QTEMP[2]; + xmm11 = QTEMP[3]; + xmm12 = QTEMP[4]; + xmm13 = QTEMP[5]; + xmm14 = QTEMP[6]; + xmm15 = QTEMP[7]; + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) ); + xmm9 = _mm512_xor_si512( xmm9, 
(chaining[1]) ); + xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) ); + xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) ); + xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) ); + xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) ); + xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) ); + xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) ); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + return; +} + +void OF1024_4way( __m512i* chaining ) +{ + static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m512i TEMP0; + static __m512i TEMP1; + static __m512i TEMP2; + + /* load CV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) ); + xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) ); + xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) ); + xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) ); + xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) ); + xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) ); + xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) ); + xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) ); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[4] = xmm0; + chaining[5] = xmm6; + chaining[6] = xmm13; + chaining[7] = xmm15; + + return; +} + +#endif // VAES +#endif // GROESTL512_INTR_4WAY_H__ diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index 17f0cf1..7f8d3ba 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -1,14 +1,159 @@ #include "myrgr-gate.h" - -#if defined(MYRGR_4WAY) - #include #include #include #include - #include "aes_ni/hash-groestl.h" #include "algo/sha/sha-hash-4way.h" +#if defined(__VAES__) + #include "groestl512-hash-4way.h" +#endif + +#if defined(MYRGR_8WAY) + +typedef struct { +#if defined(__VAES__) + groestl512_4way_context groestl; +#else + hashState_groestl groestl; +#endif + sha256_8way_context sha; +} myrgr_8way_ctx_holder; + +myrgr_8way_ctx_holder myrgr_8way_ctx; + +void init_myrgr_8way_ctx() +{ +#if defined(__VAES__) + groestl512_4way_init( &myrgr_8way_ctx.groestl, 64 ); +#else + init_groestl( &myrgr_8way_ctx.groestl, 64 ); +#endif + sha256_8way_init( &myrgr_8way_ctx.sha ); +} + +void myriad_8way_hash( void *output, const void *input ) +{ + uint32_t vhash[16*8] __attribute__ ((aligned (128))); + uint32_t vhashA[20*8] __attribute__ ((aligned (64))); + uint32_t vhashB[20*8] __attribute__ ((aligned (64))); + myrgr_8way_ctx_holder ctx; + memcpy( &ctx, &myrgr_8way_ctx, sizeof(myrgr_8way_ctx) ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, input, 640 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 ); + + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t 
hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t hash4[20] __attribute__ ((aligned (64))); + uint32_t hash5[20] __attribute__ ((aligned (64))); + uint32_t hash6[20] __attribute__ ((aligned (64))); + uint32_t hash7[20] __attribute__ ((aligned (64))); + +// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7 ); + +#else + + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t hash4[20] __attribute__ ((aligned (64))); + uint32_t hash5[20] __attribute__ ((aligned (64))); + uint32_t hash6[20] __attribute__ ((aligned (64))); + uint32_t hash7[20] __attribute__ ((aligned (64))); + + dintrlv_8x64( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, input, 640 ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + + intrlv_8x32( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, 512 ); + +#endif + + sha256_8way_update( &ctx.sha, vhash, 64 ); + sha256_8way_close( &ctx.sha, output ); +} + +int scanhash_myriad_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + uint32_t *noncep = vdata + 64+3; // 4*16 + 3 + int thr_id = mythr->id; // thr_id arg is deprecated + + if ( opt_benchmark ) + ( (uint32_t*)ptarget )[7] = 0x0000ff; + + mm512_bswap32_intrlv80_4x128( vdata, pdata ); + + do + { + be32enc( noncep, n ); + be32enc( noncep+ 8, n+1 ); + be32enc( noncep+16, n+2 ); + be32enc( noncep+24, n+3 ); + be32enc( noncep+32, n+4 ); + be32enc( noncep+40, n+5 ); + be32enc( noncep+48, n+6 ); + be32enc( 
noncep+56, n+7 ); + + myriad_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane ] <= Htarg ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( (n < last_nonce) && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(MYRGR_4WAY) typedef struct { hashState_groestl groestl; @@ -45,7 +190,7 @@ void myriad_4way_hash( void *output, const void *input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - sha256_4way( &ctx.sha, vhash, 64 ); + sha256_4way_update( &ctx.sha, vhash, 64 ); sha256_4way_close( &ctx.sha, output ); } diff --git a/algo/groestl/myrgr-gate.c b/algo/groestl/myrgr-gate.c index 7f8e185..f82aafb 100644 --- a/algo/groestl/myrgr-gate.c +++ b/algo/groestl/myrgr-gate.c @@ -2,16 +2,22 @@ bool register_myriad_algo( algo_gate_t* gate ) { -#if defined (MYRGR_4WAY) +#if defined (MYRGR_8WAY) + init_myrgr_8way_ctx(); + gate->scanhash = (void*)&scanhash_myriad_8way; + gate->hash = (void*)&myriad_8way_hash; + gate->optimizations = AES_OPT | AVX2_OPT | VAES_OPT; +#elif defined (MYRGR_4WAY) init_myrgr_4way_ctx(); gate->scanhash = (void*)&scanhash_myriad_4way; gate->hash = (void*)&myriad_4way_hash; + gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | VAES_OPT; #else init_myrgr_ctx(); gate->scanhash = (void*)&scanhash_myriad; gate->hash = (void*)&myriad_hash; + gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT; #endif - gate->optimizations = AES_OPT | AVX2_OPT; return true; }; diff --git a/algo/groestl/myrgr-gate.h b/algo/groestl/myrgr-gate.h index 706bdb7..80cc3fd 100644 --- a/algo/groestl/myrgr-gate.h +++ b/algo/groestl/myrgr-gate.h @@ -1,30 +1,35 @@ #ifndef MYRGR_GATE_H__ -#define MYRGR_GATE_H__ +#define MYRGR_GATE_H__ 1 #include "algo-gate-api.h" #include <stdint.h> -#if defined(__AVX2__) && defined(__AES__) && !defined(__SHA__) - #define MYRGR_4WAY +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define MYRGR_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__) + #define MYRGR_4WAY 1 #endif -#if defined(MYRGR_4WAY) +#if defined(MYRGR_8WAY) + +void myriad_8way_hash( void *state, const void *input ); +int scanhash_myriad_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_myrgr_8way_ctx(); + +#elif defined(MYRGR_4WAY) void myriad_4way_hash( void *state, const void *input ); - int scanhash_myriad_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_myrgr_4way_ctx(); -#endif +#else void myriad_hash( void *state, const void *input ); - int scanhash_myriad( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_myrgr_ctx(); #endif - +#endif diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 0a1e6e2..d86bd42 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -1171,7 +1171,8 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc ) sc->h[7] = m256_const1_64( 0x6769756d2042656c ); } -void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len ) +void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data, + size_t len ) { __m256i *vdata = (__m256i*)data; diff --git
a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index 4e57f10..60e33b2 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -62,7 +62,7 @@ typedef hamsi_4way_big_context hamsi512_4way_context; void hamsi512_4way_init( hamsi512_4way_context *sc ); void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data, size_t len ); -#define hamsi512_4way hamsi512_4way_update +//#define hamsi512_4way hamsi512_4way_update void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst ); #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) diff --git a/algo/haval/haval-4way-helper.c b/algo/haval/haval-4way-helper.c index c9e7ad8..313b23f 100644 --- a/algo/haval/haval-4way-helper.c +++ b/algo/haval/haval-4way-helper.c @@ -38,7 +38,7 @@ #define SPH_XCAT_(a, b) a ## b static void -SPH_XCAT(SPH_XCAT(haval, PASSES), _4way) +SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update) ( haval_4way_context *sc, const void *data, size_t len ) { __m128i *vdata = (__m128i*)data; diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index 02df40f..6b45e10 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -479,9 +479,9 @@ haval ## xxx ## _ ## y ## _4way_init(void *cc) \ } \ \ void \ -haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \ +haval ## xxx ## _ ## y ## _4way_update (void *cc, const void *data, size_t len) \ { \ - haval ## y ## _4way(cc, data, len); \ + haval ## y ## _4way_update(cc, data, len); \ } \ \ void \ diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h index 9bd37ba..9164d2f 100644 --- a/algo/haval/haval-hash-4way.h +++ b/algo/haval/haval-hash-4way.h @@ -85,7 +85,7 @@ typedef haval_4way_context haval256_5_4way_context; void haval256_5_4way_init( void *cc ); void haval256_5_4way_update( void *cc, const void *data, size_t len ); -#define haval256_5_4way haval256_5_4way_update +//#define haval256_5_4way haval256_5_4way_update void haval256_5_4way_close( void *cc, void *dst ); diff --git a/algo/jh/jh-hash-4way.h b/algo/jh/jh-hash-4way.h index 5cccebd..562fd5e 100644 --- a/algo/jh/jh-hash-4way.h +++ b/algo/jh/jh-hash-4way.h @@ -103,14 +103,12 @@ typedef jh_4way_context jh512_4way_context; void jh256_4way_init( jh_4way_context *sc); void jh256_4way_update(void *cc, const void *data, size_t len); -#define jh256_4way jh256_4way_update void jh256_4way_close(void *cc, void *dst); void jh512_4way_init( jh_4way_context *sc ); void jh512_4way_update(void *cc, const void *data, size_t len); -#define jh512_4way jh512_4way_update void jh512_4way_close(void *cc, void *dst); diff --git a/algo/jh/jha-4way.c b/algo/jh/jha-4way.c index 2c76a33..68ffe7f 100644 --- a/algo/jh/jha-4way.c +++ b/algo/jh/jha-4way.c @@ -33,7 +33,7 @@ void jha_hash_4way( void *out, const void *input ) keccak512_4way_context ctx_keccak; keccak512_4way_init( &ctx_keccak ); - keccak512_4way( &ctx_keccak, input, 80 ); + keccak512_4way_update( &ctx_keccak, input, 80 ); keccak512_4way_close( &ctx_keccak, vhash ); // Heavy & Light Pair Loop @@ -58,7 +58,7 @@ void jha_hash_4way( void *out, const void *input ) intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); skein512_4way_init( &ctx_skein ); - skein512_4way( &ctx_skein, vhash, 64 ); + skein512_4way_update( &ctx_skein, vhash, 64 ); skein512_4way_close( &ctx_skein, vhashB ); for ( int i = 0; i < 8; i++ ) @@ -69,7 +69,7 @@ void jha_hash_4way( void *out, const void *input ) blake512_4way_close( &ctx_blake, vhashA ); 
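// Editor's note: the hamsi, haval, jh, and keccak hunks around here are part
// of this patch's tree-wide rename of the two-argument entry points from
// xxx_4way() to xxx_4way_update(), with the old #define aliases removed from
// the headers. A minimal sketch of the resulting init/update/close calling
// convention for the jh512 4-way API declared in this diff (the buffer and
// its contents are illustrative only):
//
//    jh512_4way_context ctx;
//    uint64_t vhash[8*4] __attribute__ ((aligned (64))); // 4 interleaved lanes
//    jh512_4way_init( &ctx );
//    jh512_4way_update( &ctx, vhash, 64 );   // length is bytes per lane
//    jh512_4way_close( &ctx, vhash );        // 64-byte digest per lane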
jh512_4way_init( &ctx_jh ); - jh512_4way( &ctx_jh, vhash, 64 ); + jh512_4way_update( &ctx_jh, vhash, 64 ); jh512_4way_close( &ctx_jh, vhashB ); for ( int i = 0; i < 8; i++ ) diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h index d8500a6..a353856 100644 --- a/algo/keccak/keccak-hash-4way.h +++ b/algo/keccak/keccak-hash-4way.h @@ -99,14 +99,12 @@ typedef keccak64_ctx_m256i keccak512_4way_context; void keccak256_4way_init(void *cc); void keccak256_4way_update(void *cc, const void *data, size_t len); void keccak256_4way_close(void *cc, void *dst); -#define keccak256_4way keccak256_4way_update void keccak512_4way_init(void *cc); void keccak512_4way_update(void *cc, const void *data, size_t len); void keccak512_4way_close(void *cc, void *dst); void keccak512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); -#define keccak512_4way keccak512_4way_update #endif diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index a02d0f1..c06f813 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -55,7 +55,6 @@ void allium_8way_hash( void *state, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 256 ); - intrlv_2x256( vhash, hash0, hash1, 256 ); LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash0, hash1, vhash, 256 ); @@ -69,19 +68,6 @@ void allium_8way_hash( void *state, const void *input ) LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash6, hash7, vhash, 256 ); -/* - LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); - LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); - LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 ); - LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 ); - LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 ); - LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 ); - LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 ); - LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 ); -*/ - - - intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 ); @@ -105,20 +91,6 @@ void allium_8way_hash( void *state, const void *input ) LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash6, hash7, vhash, 256 ); - -/* - LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); - LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); - LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 ); - LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 ); - LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 ); - LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 ); - LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 ); - LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 ); -*/ - - - intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 256 ); @@ -232,11 +204,11 @@ void allium_4way_hash( void *state, const void *input ) allium_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) ); - blake256_4way( &ctx.blake, input + (64<<2), 16 ); + blake256_4way_update( &ctx.blake, input + (64<<2), 16 ); blake256_4way_close( &ctx.blake, vhash32 ); rintrlv_4x32_4x64( vhash64, vhash32, 256 ); - keccak256_4way( &ctx.keccak, vhash64, 32 ); + keccak256_4way_update( &ctx.keccak, vhash64, 32 ); keccak256_4way_close( &ctx.keccak, vhash64 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); @@ -261,7 +233,7 @@ void allium_4way_hash( void *state, const void *input ) intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 ); - skein256_4way( 
&ctx.skein, vhash64, 32 ); + skein256_4way_update( &ctx.skein, vhash64, 32 ); skein256_4way_close( &ctx.skein, vhash64 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index 4f9cc1a..caa6fb0 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -229,7 +229,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) bool register_phi2_algo( algo_gate_t* gate ) { // init_phi2_ctx(); - gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; gate->get_work_data_size = (void*)&phi2_get_work_data_size; gate->decode_extra_data = (void*)&phi2_decode_extra_data; gate->build_extraheader = (void*)&phi2_build_extraheader; diff --git a/algo/lyra2/lyra2h-4way.c b/algo/lyra2/lyra2h-4way.c index a76e68c..b86f514 100644 --- a/algo/lyra2/lyra2h-4way.c +++ b/algo/lyra2/lyra2h-4way.c @@ -20,7 +20,7 @@ static __thread blake256_4way_context l2h_4way_blake_mid; void lyra2h_4way_midstate( const void* input ) { blake256_4way_init( &l2h_4way_blake_mid ); - blake256_4way( &l2h_4way_blake_mid, input, 64 ); + blake256_4way_update( &l2h_4way_blake_mid, input, 64 ); } void lyra2h_4way_hash( void *state, const void *input ) diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c index f2954c3..0ed53c5 100644 --- a/algo/lyra2/lyra2rev2-4way.c +++ b/algo/lyra2/lyra2rev2-4way.c @@ -44,7 +44,7 @@ void lyra2rev2_8way_hash( void *state, const void *input ) lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) ); - blake256_8way( &ctx.blake, input + (64<<3), 16 ); + blake256_8way_update( &ctx.blake, input + (64<<3), 16 ); blake256_8way_close( &ctx.blake, vhash ); rintrlv_8x32_8x64( vhashA, vhash, 256 ); @@ -176,12 +176,12 @@ void lyra2rev2_4way_hash( void *state, const void *input ) lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) ); - blake256_4way( &ctx.blake, input + (64<<2), 16 ); + blake256_4way_update( &ctx.blake, input + (64<<2), 16 ); blake256_4way_close( &ctx.blake, vhash ); rintrlv_4x32_4x64( vhash64, vhash, 256 ); - keccak256_4way( &ctx.keccak, vhash64, 32 ); + keccak256_4way_update( &ctx.keccak, vhash64, 32 ); keccak256_4way_close( &ctx.keccak, vhash64 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); @@ -201,7 +201,7 @@ void lyra2rev2_4way_hash( void *state, const void *input ) intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 ); - skein256_4way( &ctx.skein, vhash64, 32 ); + skein256_4way_update( &ctx.skein, vhash64, 32 ); skein256_4way_close( &ctx.skein, vhash64 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); @@ -217,7 +217,7 @@ void lyra2rev2_4way_hash( void *state, const void *input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 ); - bmw256_4way( &ctx.bmw, vhash, 32 ); + bmw256_4way_update( &ctx.bmw, vhash, 32 ); bmw256_4way_close( &ctx.bmw, state ); } @@ -242,7 +242,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, mm128_bswap32_intrlv80_4x32( vdata, pdata ); blake256_4way_init( &l2v2_4way_ctx.blake ); - blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 ); + blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 ); do { diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c index 6e560be..a7a9a3c 100644 --- a/algo/lyra2/lyra2rev3-4way.c +++ b/algo/lyra2/lyra2rev3-4way.c @@ -209,7 +209,7 @@ void 
lyra2rev3_8way_hash( void *state, const void *input ) lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) ); - blake256_8way( &ctx.blake, input + (64*8), 16 ); + blake256_8way_update( &ctx.blake, input + (64*8), 16 ); blake256_8way_close( &ctx.blake, vhash ); dintrlv_8x32( hash0, hash1, hash2, hash3, @@ -252,7 +252,7 @@ void lyra2rev3_8way_hash( void *state, const void *input ) intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 256 ); - bmw256_8way( &ctx.bmw, vhash, 32 ); + bmw256_8way_update( &ctx.bmw, vhash, 32 ); bmw256_8way_close( &ctx.bmw, state ); } @@ -277,7 +277,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce, mm256_bswap32_intrlv80_8x32( vdata, pdata ); blake256_8way_init( &l2v3_8way_ctx.blake ); - blake256_8way( &l2v3_8way_ctx.blake, vdata, 64 ); + blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 ); do { @@ -334,8 +334,7 @@ void lyra2rev3_4way_hash( void *state, const void *input ) lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) ); -// blake256_4way( &ctx.blake, input, 80 ); - blake256_4way( &ctx.blake, input + (64*4), 16 ); + blake256_4way_update( &ctx.blake, input + (64*4), 16 ); blake256_4way_close( &ctx.blake, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); @@ -358,7 +357,7 @@ void lyra2rev3_4way_hash( void *state, const void *input ) LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 ); intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 ); - bmw256_4way( &ctx.bmw, vhash, 32 ); + bmw256_4way_update( &ctx.bmw, vhash, 32 ); bmw256_4way_close( &ctx.bmw, state ); } @@ -383,7 +382,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce, mm128_bswap32_intrlv80_4x32( vdata, pdata ); blake256_4way_init( &l2v3_4way_ctx.blake ); - blake256_4way( &l2v3_4way_ctx.blake, vdata, 64 ); + blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 ); do { diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index 3f5e56e..7273ebe 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -149,7 +149,7 @@ static __thread blake256_8way_context l2z_8way_blake_mid; void lyra2z_8way_midstate( const void* input ) { blake256_8way_init( &l2z_8way_blake_mid ); - blake256_8way( &l2z_8way_blake_mid, input, 64 ); + blake256_8way_update( &l2z_8way_blake_mid, input, 64 ); } void lyra2z_8way_hash( void *state, const void *input ) @@ -166,7 +166,7 @@ void lyra2z_8way_hash( void *state, const void *input ) blake256_8way_context ctx_blake __attribute__ ((aligned (64))); memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid ); - blake256_8way( &ctx_blake, input + (64*8), 16 ); + blake256_8way_update( &ctx_blake, input + (64*8), 16 ); blake256_8way_close( &ctx_blake, vhash ); dintrlv_8x32( hash0, hash1, hash2, hash3, @@ -247,7 +247,7 @@ static __thread blake256_4way_context l2z_4way_blake_mid; void lyra2z_4way_midstate( const void* input ) { blake256_4way_init( &l2z_4way_blake_mid ); - blake256_4way( &l2z_4way_blake_mid, input, 64 ); + blake256_4way_update( &l2z_4way_blake_mid, input, 64 ); } void lyra2z_4way_hash( void *state, const void *input ) @@ -260,7 +260,7 @@ void lyra2z_4way_hash( void *state, const void *input ) blake256_4way_context ctx_blake __attribute__ ((aligned (64))); memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid ); - blake256_4way( &ctx_blake, input + (64*4), 16 ); + blake256_4way_update( &ctx_blake, 
input + (64*4), 16 ); blake256_4way_close( &ctx_blake, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); diff --git a/algo/nist5/nist5-4way.c b/algo/nist5/nist5-4way.c index c4aa73d..9b8687b 100644 --- a/algo/nist5/nist5-4way.c +++ b/algo/nist5/nist5-4way.c @@ -133,7 +133,7 @@ void nist5hash_4way( void *out, const void *input ) keccak512_4way_context ctx_keccak; blake512_4way_init( &ctx_blake ); - blake512_4way( &ctx_blake, input, 80 ); + blake512_4way_update( &ctx_blake, input, 80 ); blake512_4way_close( &ctx_blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -154,15 +154,15 @@ void nist5hash_4way( void *out, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); jh512_4way_init( &ctx_jh ); - jh512_4way( &ctx_jh, vhash, 64 ); + jh512_4way_update( &ctx_jh, vhash, 64 ); jh512_4way_close( &ctx_jh, vhash ); keccak512_4way_init( &ctx_keccak ); - keccak512_4way( &ctx_keccak, vhash, 64 ); + keccak512_4way_update( &ctx_keccak, vhash, 64 ); keccak512_4way_close( &ctx_keccak, vhash ); skein512_4way_init( &ctx_skein ); - skein512_4way( &ctx_skein, vhash, 64 ); + skein512_4way_update( &ctx_skein, vhash, 64 ); skein512_4way_close( &ctx_skein, out ); } diff --git a/algo/quark/anime-4way.c b/algo/quark/anime-4way.c index 2c5d561..be1c19f 100644 --- a/algo/quark/anime-4way.c +++ b/algo/quark/anime-4way.c @@ -54,10 +54,10 @@ void anime_4way_hash( void *state, const void *input ) anime_4way_ctx_holder ctx; memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) ); - bmw512_4way( &ctx.bmw, input, 80 ); + bmw512_4way_update( &ctx.bmw, input, 80 ); bmw512_4way_close( &ctx.bmw, vhash ); - blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_update( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -92,7 +92,7 @@ void anime_4way_hash( void *state, const void *input ) if ( mm256_anybits0( vh_mask ) ) { - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhashB ); } @@ -111,7 +111,7 @@ void anime_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -119,23 +119,23 @@ void anime_4way_hash( void *state, const void *input ) if ( mm256_anybits1( vh_mask ) ) { blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_update( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhashA ); } if ( mm256_anybits0( vh_mask ) ) { bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -143,13 +143,13 @@ void anime_4way_hash( void *state, const void *input ) if ( mm256_anybits1( vh_mask ) ) { keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, 
vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); } if ( mm256_anybits0( vh_mask ) ) { jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); } diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c index 9f22d29..4eac923 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -21,6 +21,11 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" #include "algo/sha/sha-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(HMQ1725_8WAY) @@ -28,21 +33,27 @@ union _hmq1725_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; sph_whirlpool_context whirlpool; sha512_8way_context sha512; haval256_5_8way_context haval; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _hmq1725_8way_context_overlay hmq1725_8way_context_overlay; @@ -52,6 +63,7 @@ extern void hmq1725_8way_hash(void *state, const void *input) uint32_t vhash [16<<3] __attribute__ ((aligned (128))); uint32_t vhashA[16<<3] __attribute__ ((aligned (64))); uint32_t vhashB[16<<3] __attribute__ ((aligned (64))); + uint32_t vhashC[16<<3] __attribute__ ((aligned (64))); uint32_t hash0 [16] __attribute__ ((aligned (64))); uint32_t hash1 [16] __attribute__ ((aligned (64))); uint32_t hash2 [16] __attribute__ ((aligned (64))); @@ -67,6 +79,7 @@ extern void hmq1725_8way_hash(void *state, const void *input) __m512i* vh = (__m512i*)vhash; __m512i* vhA = (__m512i*)vhashA; __m512i* vhB = (__m512i*)vhashB; + __m512i* vhC = (__m512i*)vhashC; bmw512_8way_init( &ctx.bmw ); bmw512_8way_update( &ctx.bmw, input, 80 ); @@ -106,6 +119,28 @@ extern void hmq1725_8way_hash(void *state, const void *input) m512_zero ); // A + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( likely( ( vh_mask & 0x0f ) != 0x0f ) ) + { + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + } + if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) + { + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + } + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + +#else + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + if ( hash0[0] & mask ) { init_groestl( &ctx.groestl, 64 ); @@ -140,13 +175,13 @@ extern void hmq1725_8way_hash(void *state, const void *input) { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash5, - (char*)hash5, 512 ); + (char*)hash5, 512 ); } if ( hash6[0] & mask ) { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash6, - (char*)hash6, 512 ); + (char*)hash6, 512 ); } if ( hash7[0] & mask ) { @@ -155,9 +190,11 @@ extern void hmq1725_8way_hash(void *state, const void 
*input) (char*)hash7, 512 ); } - intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + // B if ( likely( vh_mask & 0xff ) ) { @@ -166,7 +203,7 @@ extern void hmq1725_8way_hash(void *state, const void *input) skein512_8way_close( &ctx.skein, vhashB ); } - mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, 64 ); @@ -225,6 +262,20 @@ extern void hmq1725_8way_hash(void *state, const void *input) } mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -256,6 +307,8 @@ extern void hmq1725_8way_hash(void *state, const void *input) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); @@ -334,6 +387,20 @@ extern void hmq1725_8way_hash(void *state, const void *input) } mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -365,17 +432,38 @@ extern void hmq1725_8way_hash(void *state, const void *input) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + blake512_8way_init( &ctx.blake ); blake512_8way_update( &ctx.blake, vhash, 64 ); blake512_8way_close( &ctx.blake, vhash ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); // A +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( likely( ( vh_mask & 0x0f ) != 0x0f ) ) + { + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + } + if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) + { + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + } + + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + +#else + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + if ( hash0[0] & mask ) { sph_shavite512_init( &ctx.shavite ); @@ -425,19 +513,28 @@ extern void hmq1725_8way_hash(void *state, const void *input) sph_shavite512_close( &ctx.shavite, hash7 ); //8 } + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + +#endif + // B - if ( likely( vh_mask & 0xff ) ) + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( likely( vh_mask & 0x0f ) ) { luffa_4way_init( &ctx.luffa, 512 ); 
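// (Editor's note: after the preceding rintrlv_8x64_4x128 call, lanes 0-3 sit
// in vhashA and lanes 4-7 in vhashB, so the 0x0f nibble of vh_mask gates the
// luffa pass over this 4x128 half and 0xf0 gates the vhashB half below.)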
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + } + if ( likely( vh_mask & 0xf0 ) ) + { luffa_4way_init( &ctx.luffa, 512 ); luffa_4way_update_close( &ctx.luffa, vhash, vhashB, 64 ); - rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); } - intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); - mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); + + mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); @@ -475,8 +572,27 @@ extern void hmq1725_8way_hash(void *state, const void *input) hash7 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); - rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + // A +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( likely( ( vh_mask & 0x0f ) != 0x0f ) ) + { + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + } + if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) + { + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + } + + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + +#else + if ( hash0[0] & mask ) //4 { init_echo( &ctx.echo, 512 ); @@ -526,19 +642,29 @@ extern void hmq1725_8way_hash(void *state, const void *input) (const BitSequence *)hash7, 512 ); } + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + +#endif + // B - if ( likely( vh_mask & 0xff ) ) + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( likely( vh_mask & 0x0f ) ) { simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + } + if ( likely( vh_mask & 0xf0 ) ) + { simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhash, vhashB, 512 ); - rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); } - intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); - mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); + + mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); + rintrlv_8x64_8x32( vhashA, vhash, 512 ); shabal512_8way_init( &ctx.shabal ); @@ -641,6 +767,20 @@ extern void hmq1725_8way_hash(void *state, const void *input) } mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -664,6 +804,8 @@ extern void hmq1725_8way_hash(void *state, const void *input) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + sha512_8way_init( &ctx.sha512 ); sha512_8way_update( &ctx.sha512, vhash, 64 ); sha512_8way_close( &ctx.sha512, vhash ); @@ -830,7 +972,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) __m256i* vhB = (__m256i*)vhashB; bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, input, 80 ); + bmw512_4way_update( &ctx.bmw, input, 80 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -889,18 +1031,18 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( 
mm256_anybits1( vh_mask ) ) { skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhashB ); } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // second fork, A = blake parallel, B= bmw parallel. @@ -911,14 +1053,14 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( mm256_anybits0( vh_mask ) ) { blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_update( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhashA ); } if ( mm256_anybits1( vh_mask ) ) { bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); } @@ -962,14 +1104,14 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( mm256_anybits0( vh_mask ) ) { keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); } if ( mm256_anybits1( vh_mask ) ) { jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); } @@ -990,7 +1132,6 @@ extern void hmq1725_4way_hash(void *state, const void *input) sph_shavite512 ( &ctx.shavite, hash3, 64 ); sph_shavite512_close( &ctx.shavite, hash3 ); - intrlv_2x128_512( vhashA, hash0, hash1 ); intrlv_2x128_512( vhashB, hash2, hash3 ); @@ -1042,7 +1183,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( mm256_anybits1( vh_mask ) ) { haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhash, 64 ); + haval256_5_4way_update( &ctx.haval, vhash, 64 ); haval256_5_4way_close( &ctx.haval, vhash ); memset( &vhash[8<<2], 0, 32<<2 ); rintrlv_4x32_4x64( vhashB, vhash, 512 ); @@ -1068,7 +1209,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_update( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -1130,7 +1271,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -1214,7 +1355,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -1269,7 +1410,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( mm256_anybits1( vh_mask ) ) { sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); 
sha512_4way_close( &ctx.sha512, vhashB ); } @@ -1289,7 +1430,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); // A = haval parallel, B = Whirlpool serial @@ -1305,7 +1446,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( mm256_anybits0( vh_mask ) ) { haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhash, 64 ); + haval256_5_4way_update( &ctx.haval, vhash, 64 ); haval256_5_4way_close( &ctx.haval, vhash ); memset( &vhash[8<<2], 0, 32<<2 ); rintrlv_4x32_4x64( vhashA, vhash, 512 ); @@ -1341,7 +1482,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); memcpy(state, vhash, 32<<2 ); diff --git a/algo/quark/hmq1725-gate.c b/algo/quark/hmq1725-gate.c index 9cc2784..4c64bff 100644 --- a/algo/quark/hmq1725-gate.c +++ b/algo/quark/hmq1725-gate.c @@ -13,7 +13,7 @@ bool register_hmq1725_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_hmq1725; gate->hash = (void*)&hmq1725hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; opt_target_factor = 65536.0; return true; }; diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index 180d636..3181866 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -9,16 +9,23 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/groestl/aes_ni/hash-groestl.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" +#endif #if defined (QUARK_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; jh512_8way_context jh; skein512_8way_context skein; keccak512_8way_context keccak; +#if defined(__VAES__) + groestl512_4way_context groestl; +#else + hashState_groestl groestl; +#endif } quark_8way_ctx_holder; quark_8way_ctx_holder quark_8way_ctx __attribute__ ((aligned (128))); @@ -27,10 +34,14 @@ void init_quark_8way_ctx() { blake512_8way_init( &quark_8way_ctx.blake ); bmw512_8way_init( &quark_8way_ctx.bmw ); - init_groestl( &quark_8way_ctx.groestl, 64 ); skein512_8way_init( &quark_8way_ctx.skein ); jh512_8way_init( &quark_8way_ctx.jh ); keccak512_8way_init( &quark_8way_ctx.keccak ); +#if defined(__VAES__) + groestl512_4way_init( &quark_8way_ctx.groestl, 64 ); +#else + init_groestl( &quark_8way_ctx.groestl, 64 ); +#endif } void quark_8way_hash( void *state, const void *input ) @@ -38,6 +49,7 @@ void quark_8way_hash( void *state, const void *input ) uint64_t vhash[8*8] __attribute__ ((aligned (128))); uint64_t vhashA[8*8] __attribute__ ((aligned (64))); uint64_t vhashB[8*8] __attribute__ ((aligned (64))); + uint64_t vhashC[8*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -49,6 +61,7 @@ void quark_8way_hash( void *state, const void *input ) __m512i* vh = (__m512i*)vhash; __m512i* vhA = (__m512i*)vhashA; __m512i* vhB = (__m512i*)vhashB; + __m512i* vhC = (__m512i*)vhashC; __mmask8 vh_mask; quark_8way_ctx_holder ctx; 
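// Editor's note: vh_mask carries one bit per lane; the
// _mm512_cmpeq_epi64_mask() below sets bit i when lane i has bit 3 of its
// first hash word clear (the skein branch). Groestl then runs on the 4x128
// half vhashA (lanes 0-3) unless that whole nibble is set
// ((vh_mask & 0x0f) == 0x0f), likewise on vhashB (lanes 4-7) against 0xf0,
// and mm512_blend_hash_8x64() selects the groestl result (vhC) or the skein
// result (vhB) per lane. Scalar equivalent for one lane (illustrative):
//
//    if ( hash[0] & 8 ) groestl512( hash ); else skein512( hash );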
const uint32_t mask = 8; @@ -66,6 +79,25 @@ void quark_8way_hash( void *state, const void *input ) vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), zero ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( ( vh_mask & 0x0f ) != 0x0f ) + { + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + } + if ( ( vh_mask & 0xf0 ) != 0xf0 ) + { + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + } + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); @@ -117,16 +149,31 @@ void quark_8way_hash( void *state, const void *input ) (char*)hash7, 512 ); } - intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + intrlv_8x64( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 512 ); +#endif + if ( vh_mask & 0xff ) { skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhashB ); } - mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); @@ -151,6 +198,8 @@ void quark_8way_hash( void *state, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 512 ); +#endif + jh512_8way_update( &ctx.jh, vhash, 64 ); jh512_8way_close( &ctx.jh, vhash ); @@ -289,10 +338,10 @@ void quark_4way_hash( void *state, const void *input ) memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -327,7 +376,7 @@ void quark_4way_hash( void *state, const void *input ) if ( mm256_anybits1( vh_mask ) ) { - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhashB ); } @@ -346,7 +395,7 @@ void quark_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -354,24 +403,24 @@ void quark_4way_hash( void *state, const void *input ) if ( mm256_anybits0( vh_mask ) ) { blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_update( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhashA ); } if ( mm256_anybits1( vh_mask ) ) { bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); 
keccak512_4way_close( &ctx.keccak, vhash ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -379,14 +428,14 @@ void quark_4way_hash( void *state, const void *input ) if ( mm256_anybits0( vh_mask ) ) { keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); } if ( mm256_anybits1( vh_mask ) ) { jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); } diff --git a/algo/quark/quark-gate.c b/algo/quark/quark-gate.c index ee4842f..0c26473 100644 --- a/algo/quark/quark-gate.c +++ b/algo/quark/quark-gate.c @@ -15,7 +15,7 @@ bool register_quark_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_quark; gate->hash = (void*)&quark_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/qubit/qubit-2way.c b/algo/qubit/qubit-2way.c index 2b5d603..630c1ee 100644 --- a/algo/qubit/qubit-2way.c +++ b/algo/qubit/qubit-2way.c @@ -9,6 +9,10 @@ #include "algo/simd/simd-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(QUBIT_4WAY) @@ -16,10 +20,14 @@ typedef struct { luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - simd_2way_context simd2; +#if defined(__VAES__) + shavite512_4way_context shavite; + echo_4way_context echo; +#else + sph_shavite512_context shavite; hashState_echo echo; +#endif } qubit_4way_ctx_holder; qubit_4way_ctx_holder qubit_4way_ctx; @@ -27,10 +35,14 @@ qubit_4way_ctx_holder qubit_4way_ctx; void init_qubit_4way_ctx() { cube_4way_init( &qubit_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init(&qubit_4way_ctx.shavite); simd_4way_init( &qubit_4way_ctx.simd, 512 ); - simd_2way_init( &qubit_4way_ctx.simd2, 512 ); - init_echo(&qubit_4way_ctx.echo, 512); +#if defined(__VAES__) + shavite512_4way_init( &qubit_4way_ctx.shavite ); + echo_4way_init( &qubit_4way_ctx.echo, 512 ); +#else + sph_shavite512_init( &qubit_4way_ctx.shavite ); + init_echo( &qubit_4way_ctx.echo, 512 ); +#endif }; void qubit_4way_hash( void *output, const void *input ) @@ -48,6 +60,13 @@ void qubit_4way_hash( void *output, const void *input ) luffa_4way_close( &ctx.luffa, vhash ); cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); sph_shavite512( &ctx.shavite, hash0, 64 ); @@ -66,31 +85,44 @@ void qubit_4way_hash( void *output, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + +#endif + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + + dintrlv_4x128( output, output+32, output+64, output+96, vhash, 256 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const 
BitSequence *) hash0, 512 ); + update_final_echo( &ctx.echo, (BitSequence*)hash0, + (const BitSequence*)hash0, 512 ); memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); + update_final_echo( &ctx.echo, (BitSequence*)hash1, + (const BitSequence*)hash1, 512 ); memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); + update_final_echo( &ctx.echo, (BitSequence*)hash2, + (const BitSequence*)hash2, 512 ); memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); + update_final_echo( &ctx.echo, (BitSequence*)hash3, + (const BitSequence*)hash3, 512 ); memcpy( output, hash0, 32 ); memcpy( output+32, hash1, 32 ); memcpy( output+64, hash2, 32 ); memcpy( output+96, hash3, 32 ); +#endif } int scanhash_qubit_4way( struct work *work,uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (128))); + uint32_t hash[8*4] __attribute__ ((aligned (128))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; diff --git a/algo/qubit/qubit-gate.c b/algo/qubit/qubit-gate.c index b3592a5..0d547c8 100644 --- a/algo/qubit/qubit-gate.c +++ b/algo/qubit/qubit-gate.c @@ -16,7 +16,7 @@ bool register_qubit_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_qubit; gate->hash = (void*)&qubit_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/ripemd/lbry-4way.c b/algo/ripemd/lbry-4way.c index 78a6f5a..0228c86 100644 --- a/algo/ripemd/lbry-4way.c +++ b/algo/ripemd/lbry-4way.c @@ -7,7 +7,7 @@ #include "ripemd-hash-4way.h" #define LBRY_INPUT_SIZE 112 -#define LBRY_MIDSTATE 64 +#define LBRY_MIDSTATE 96 #define LBRY_TAIL (LBRY_INPUT_SIZE) - (LBRY_MIDSTATE) #if defined(LBRY_16WAY) @@ -35,9 +35,9 @@ void lbry_16way_hash( void* output, const void* input ) uint32_t _ALIGN(64) h13[32]; uint32_t _ALIGN(64) h14[32]; uint32_t _ALIGN(64) h15[32]; - sha256_16way_context ctx_sha256 __attribute__ ((aligned (64))); + sha256_16way_context ctx_sha256 __attribute__ ((aligned (64))); sha512_8way_context ctx_sha512; - ripemd160_16way_context ctx_ripemd; + ripemd160_16way_context ctx_ripemd; memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) ); sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL ); @@ -62,7 +62,7 @@ void lbry_16way_hash( void* output, const void* input ) sha512_8way_close( &ctx_sha512, vhashB ); // back to 8-way 32 bit - dintrlv_8x64( h0, h1, h2, h3,h4, h5, h6, h7, vhashA, 512 ); + dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 ); dintrlv_8x64( h8, h9, h10, h11, h12, h13, h14, h15, vhashB, 512 ); intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, h14, h15, 512 ); @@ -90,14 +90,15 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, { uint32_t hash[8*16] __attribute__ ((aligned (128))); uint32_t vdata[32*16] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t edata[32] __attribute__ ((aligned (64))); uint32_t *hash7 = &(hash[7<<4]); uint32_t *pdata = work->data; uint32_t 
*ptarget = work->target; uint32_t n = pdata[27]; const uint32_t first_nonce = pdata[27]; + const uint32_t last_nonce = max_nonce - 16; const uint32_t Htarg = ptarget[7]; - uint32_t edata[32] __attribute__ ((aligned (64))); __m512i *noncev = (__m512i*)vdata + 27; // aligned int thr_id = mythr->id; // thr_id arg is deprecated @@ -114,14 +115,13 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 ); sha256_16way_init( &sha256_16w_mid ); - sha256_16way( &sha256_16w_mid, vdata, LBRY_MIDSTATE ); + sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE ); do { - *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12, - n+11, n+10, n+ 9, n+ 8, - n+ 7, n+ 6, n+ 5, n+ 4, - n+ 3, n+ 2, n+ 1, n ) ); + *noncev = mm512_bswap_32( _mm512_set_epi32( + n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8, + n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) ); lbry_16way_hash( hash, vdata ); for ( int i = 0; i < 16; i++ ) @@ -129,27 +129,25 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, { // deinterleave hash for lane extr_lane_16x32( lane_hash, hash, i, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) { pdata[27] = n + i; submit_lane_solution( work, lane_hash, mythr, i ); } } n += 16; - } while ( (n < max_nonce-16) && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce + 1; + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + *hashes_done = n - first_nonce; return 0; } - - #elif defined(LBRY_8WAY) static __thread sha256_8way_context sha256_8w_mid; void lbry_8way_hash( void* output, const void* input ) { - uint32_t _ALIGN(64) vhashA[16<<3]; + uint32_t _ALIGN(128) vhashA[16<<3]; uint32_t _ALIGN(64) vhashB[16<<3]; uint32_t _ALIGN(64) vhashC[16<<3]; uint32_t _ALIGN(32) h0[32]; @@ -165,11 +163,11 @@ void lbry_8way_hash( void* output, const void* input ) ripemd160_8way_context ctx_ripemd; memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) ); - sha256_8way( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL ); + sha256_8way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL ); sha256_8way_close( &ctx_sha256, vhashA ); sha256_8way_init( &ctx_sha256 ); - sha256_8way( &ctx_sha256, vhashA, 32 ); + sha256_8way_update( &ctx_sha256, vhashA, 32 ); sha256_8way_close( &ctx_sha256, vhashA ); // reinterleave to do sha512 4-way 64 bit twice. 
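// (Editor's note: 256-bit AVX2 vectors hold eight 32-bit lanes but only four
// 64-bit lanes, so the 8x32 sha256 state is split into two 4x64 interleaves,
// sha512_4way runs once per half, and the halves are merged back to 8x32
// before ripemd160_8way.)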
@@ -178,11 +176,11 @@ void lbry_8way_hash( void* output, const void* input ) intrlv_4x64( vhashB, h4, h5, h6, h7, 256 ); sha512_4way_init( &ctx_sha512 ); - sha512_4way( &ctx_sha512, vhashA, 32 ); + sha512_4way_update( &ctx_sha512, vhashA, 32 ); sha512_4way_close( &ctx_sha512, vhashA ); sha512_4way_init( &ctx_sha512 ); - sha512_4way( &ctx_sha512, vhashB, 32 ); + sha512_4way_update( &ctx_sha512, vhashB, 32 ); sha512_4way_close( &ctx_sha512, vhashB ); // back to 8-way 32 bit @@ -191,20 +189,20 @@ void lbry_8way_hash( void* output, const void* input ) intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 ); ripemd160_8way_init( &ctx_ripemd ); - ripemd160_8way( &ctx_ripemd, vhashA, 32 ); + ripemd160_8way_update( &ctx_ripemd, vhashA, 32 ); ripemd160_8way_close( &ctx_ripemd, vhashB ); ripemd160_8way_init( &ctx_ripemd ); - ripemd160_8way( &ctx_ripemd, vhashA+(8<<3), 32 ); + ripemd160_8way_update( &ctx_ripemd, vhashA+(8<<3), 32 ); ripemd160_8way_close( &ctx_ripemd, vhashC ); sha256_8way_init( &ctx_sha256 ); - sha256_8way( &ctx_sha256, vhashB, 20 ); - sha256_8way( &ctx_sha256, vhashC, 20 ); + sha256_8way_update( &ctx_sha256, vhashB, 20 ); + sha256_8way_update( &ctx_sha256, vhashC, 20 ); + sha256_8way_close( &ctx_sha256, vhashA ); sha256_8way_init( &ctx_sha256 ); - sha256_8way( &ctx_sha256, vhashA, 32 ); + sha256_8way_update( &ctx_sha256, vhashA, 32 ); sha256_8way_close( &ctx_sha256, output ); } @@ -214,13 +212,13 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce, uint32_t hash[8*8] __attribute__ ((aligned (64))); uint32_t vdata[32*8] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t edata[32] __attribute__ ((aligned (64))); uint32_t *hash7 = &(hash[7<<3]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t n = pdata[27]; const uint32_t first_nonce = pdata[27]; const uint32_t Htarg = ptarget[7]; - uint32_t edata[32] __attribute__ ((aligned (64))); __m256i *noncev = (__m256i*)vdata + 27; // aligned int thr_id = mythr->id; // thr_id arg is deprecated @@ -237,7 +235,7 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce, edata, edata, edata, edata, 1024 ); sha256_8way_init( &sha256_8w_mid ); - sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE ); + sha256_8way_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE ); do { diff --git a/algo/ripemd/lbry-gate.c b/algo/ripemd/lbry-gate.c index ac94c64..f4080a8 100644 --- a/algo/ripemd/lbry-gate.c +++ b/algo/ripemd/lbry-gate.c @@ -98,7 +98,7 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; } bool register_lbry_algo( algo_gate_t* gate ) { - gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT; +// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT; #if defined (LBRY_16WAY) gate->scanhash = (void*)&scanhash_lbry_16way; gate->hash = (void*)&lbry_16way_hash; diff --git a/algo/ripemd/lbry-gate.h b/algo/ripemd/lbry-gate.h index 603b5b5..2aedd6b 100644 --- a/algo/ripemd/lbry-gate.h +++ b/algo/ripemd/lbry-gate.h @@ -5,11 +5,10 @@ #include <stdint.h> -// 16 way needs sha256 16 way -//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -// #define LBRY_16WAY -#if defined(__AVX2__) - #define LBRY_8WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define LBRY_16WAY 1 +#elif defined(__AVX2__) + #define LBRY_8WAY 1 #endif /* #if !defined(__SHA__) @@ -37,13 +36,13 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, void lbry_8way_hash( void *state, const void
*input ); int scanhash_lbry_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -/* + #elif defined(LBRY_4WAY) void lbry_4way_hash( void *state, const void *input ); int scanhash_lbry_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done ); -*/ + #else void lbry_hash( void *state, const void *input ); diff --git a/algo/ripemd/ripemd-hash-4way.c b/algo/ripemd/ripemd-hash-4way.c index 42c0d2d..38de159 100644 --- a/algo/ripemd/ripemd-hash-4way.c +++ b/algo/ripemd/ripemd-hash-4way.c @@ -259,7 +259,8 @@ void ripemd160_4way_init( ripemd160_4way_context *sc ) sc->count_high = sc->count_low = 0; } -void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len ) +void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data, + size_t len ) { __m128i *vdata = (__m128i*)data; size_t ptr; @@ -559,7 +560,8 @@ void ripemd160_8way_init( ripemd160_8way_context *sc ) sc->count_high = sc->count_low = 0; } -void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len ) +void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data, + size_t len ) { __m256i *vdata = (__m256i*)data; size_t ptr; @@ -859,7 +861,7 @@ void ripemd160_16way_init( ripemd160_16way_context *sc ) sc->count_high = sc->count_low = 0; } -void ripemd160_16way( ripemd160_16way_context *sc, const void *data, +void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data, size_t len ) { __m512i *vdata = (__m512i*)data; diff --git a/algo/ripemd/ripemd-hash-4way.h b/algo/ripemd/ripemd-hash-4way.h index c565ad7..71fb3d7 100644 --- a/algo/ripemd/ripemd-hash-4way.h +++ b/algo/ripemd/ripemd-hash-4way.h @@ -16,7 +16,8 @@ typedef struct } __attribute__ ((aligned (64))) ripemd160_4way_context; void ripemd160_4way_init( ripemd160_4way_context *sc ); -void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len ); +void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data, + size_t len ); void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst ); #if defined (__AVX2__) @@ -26,10 +27,11 @@ typedef struct __m256i buf[64>>2]; __m256i val[5]; uint32_t count_high, count_low; -} __attribute__ ((aligned (64))) ripemd160_8way_context; +} __attribute__ ((aligned (128))) ripemd160_8way_context; void ripemd160_8way_init( ripemd160_8way_context *sc ); -void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len ); +void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data, + size_t len ); void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ); #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -42,7 +44,7 @@ typedef struct } __attribute__ ((aligned (128))) ripemd160_16way_context; void ripemd160_16way_init( ripemd160_16way_context *sc ); -void ripemd160_16way( ripemd160_16way_context *sc, const void *data, +void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data, size_t len ); void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst ); diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 2ac2a7e..3635dd9 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -41,13 +41,9 @@ #define SHA2_HASH_4WAY_H__ 1 #include <stddef.h> -#include "sph_types.h" #include "simd-utils.h" #if defined(__SSE2__) -//#if defined(__SSE4_2__) - -//#define SPH_SIZE_sha256 256 // SHA-256 4 way @@ -59,9 +55,12 @@ typedef struct { } sha256_4way_context __attribute__ ((aligned (64)));
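// Editor's note: a minimal usage sketch for the 4-way API declared below,
// hashing four independent 64-byte messages interleaved 32 bits at a time
// (the data layout and buffer contents are illustrative assumptions):
//
//    sha256_4way_context ctx;
//    __m128i block[16] __attribute__ ((aligned (64))); // 64 bytes per lane
//    __m128i digest[8];                                // 32 bytes per lane
//    sha256_4way_init( &ctx );
//    sha256_4way_update( &ctx, block, 64 );            // length is per lane
//    sha256_4way_close( &ctx, digest );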
void sha256_4way_init( sha256_4way_context *sc ); -void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ); +void sha256_4way_update( sha256_4way_context *sc, const void *data, + size_t len ); void sha256_4way_close( sha256_4way_context *sc, void *dst ); +#endif // SSE2 + #if defined (__AVX2__) // SHA-256 8 way @@ -75,10 +74,28 @@ typedef struct { void sha256_8way_init( sha256_8way_context *sc ); void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); -#define sha256_8way sha256_8way_update void sha256_8way_close( sha256_8way_context *sc, void *dst ); -//#define SPH_SIZE_sha512 512 +#endif // AVX2 + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-256 16 way + +typedef struct { + __m512i buf[64>>2]; + __m512i val[8]; + uint32_t count_high, count_low; + bool initialized; +} sha256_16way_context __attribute__ ((aligned (128))); + +void sha256_16way_init( sha256_16way_context *sc ); +void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ); +void sha256_16way_close( sha256_16way_context *sc, void *dst ); + +#endif // AVX512 + +#if defined (__AVX2__) // SHA-512 4 way @@ -92,9 +109,10 @@ typedef struct { void sha512_4way_init( sha512_4way_context *sc); void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len ); -#define sha512_4way sha512_4way_update void sha512_4way_close( sha512_4way_context *sc, void *dst ); +#endif // AVX2 + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) // SHA-512 8 way @@ -111,8 +129,6 @@ void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len ); void sha512_8way_close( sha512_8way_context *sc, void *dst ); - #endif // AVX512 -#endif // __AVX2__ -#endif // __SSE2__ + #endif // SHA256_4WAY_H__ diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index ba6b952..2167407 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -39,47 +39,31 @@ // SHA-256 32 bit /* -static const sph_u32 H256[8] = { - SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), - SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A), - SPH_C32(0x510E527F), SPH_C32(0x9B05688C), - SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) +static const uint32_t H256[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; */ -static const sph_u32 K256[64] = { - SPH_C32(0x428A2F98), SPH_C32(0x71374491), - SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5), - SPH_C32(0x3956C25B), SPH_C32(0x59F111F1), - SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5), - SPH_C32(0xD807AA98), SPH_C32(0x12835B01), - SPH_C32(0x243185BE), SPH_C32(0x550C7DC3), - SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE), - SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174), - SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786), - SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC), - SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA), - SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA), - SPH_C32(0x983E5152), SPH_C32(0xA831C66D), - SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7), - SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147), - SPH_C32(0x06CA6351), SPH_C32(0x14292967), - SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138), - SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13), - SPH_C32(0x650A7354), SPH_C32(0x766A0ABB), - SPH_C32(0x81C2C92E), SPH_C32(0x92722C85), - SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B), - SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3), - SPH_C32(0xD192E819), SPH_C32(0xD6990624), - SPH_C32(0xF40E3585), SPH_C32(0x106AA070), - SPH_C32(0x19A4C116), 
SPH_C32(0x1E376C08), - SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5), - SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A), - SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3), - SPH_C32(0x748F82EE), SPH_C32(0x78A5636F), - SPH_C32(0x84C87814), SPH_C32(0x8CC70208), - SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB), - SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2) +static const uint32_t K256[64] = +{ + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, + 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, + 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, + 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, + 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, + 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, + 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, + 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, + 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 }; // SHA-256 4 way @@ -248,7 +232,7 @@ void sha256_4way_init( sha256_4way_context *sc ) */ } -void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ) +void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) { __m128i *vdata = (__m128i*)data; size_t ptr; @@ -273,7 +257,7 @@ void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ) ptr = 0; } clow = sc->count_low; - clow2 = SPH_T32( clow + clen ); + clow2 = clow + clen; sc->count_low = clow2; if ( clow2 < clow ) sc->count_high++; @@ -306,10 +290,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = mm128_bswap_32( m128_const1_32( high ) ); -// mm128_bswap_32( _mm_set1_epi32( high ) ); sc->buf[ ( pad+4 ) >> 2 ] = mm128_bswap_32( m128_const1_32( low ) ); -// mm128_bswap_32( _mm_set1_epi32( low ) ); sha256_4way_round( sc, sc->buf, sc->val ); mm128_block_bswap_32( dst, sc->val ); @@ -483,7 +465,7 @@ void sha256_8way_init( sha256_8way_context *sc ) */ } -void sha256_8way( sha256_8way_context *sc, const void *data, size_t len ) +void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ) { __m256i *vdata = (__m256i*)data; size_t ptr; @@ -508,7 +490,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len ) ptr = 0; } clow = sc->count_low; - clow2 = SPH_T32( clow + clen ); + clow2 = clow + clen; sc->count_low = clow2; if ( clow2 < clow ) sc->count_high++; @@ -549,5 +531,233 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) mm256_block_bswap_32( dst, sc->val ); } + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-256 16 way + +#define CHx16(X, Y, Z) \ + _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) + +#define MAJx16(X, Y, Z) \ + _mm512_or_si512( _mm512_and_si512( X, Y ), \ + _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) + +#define BSG2_0x16(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_32(x, 2), mm512_ror_32(x, 13) ), mm512_ror_32( x, 22) ) + +#define BSG2_1x16(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_32(x, 6), mm512_ror_32(x, 11) ), mm512_ror_32( x, 25) ) + +#define SSG2_0x16(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_32(x, 7), mm512_ror_32(x, 18) ), _mm512_srli_epi32(x, 3) ) + +#define SSG2_1x16(x) \ + _mm512_xor_si512( 
_mm512_xor_si512( \ + mm512_ror_32(x, 17), mm512_ror_32(x, 19) ), _mm512_srli_epi32(x, 10) ) + +#define SHA2x16_MEXP( a, b, c, d ) \ + mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] ); + +#define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ +do { \ + __m512i T1, T2; \ + __m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \ + T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \ + K, W[i] ) ); \ + T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ + D = _mm512_add_epi32( D, T1 ); \ + H = _mm512_add_epi32( T1, T2 ); \ +} while (0) + +static void +sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) +{ + register __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; + + mm512_block_bswap_32( W , in ); + mm512_block_bswap_32( W+8, in+8 ); + + if ( ctx->initialized ) + { + A = r[0]; + B = r[1]; + C = r[2]; + D = r[3]; + E = r[4]; + F = r[5]; + G = r[6]; + H = r[7]; + } + else + { + A = m512_const1_64( 0x6A09E6676A09E667 ); + B = m512_const1_64( 0xBB67AE85BB67AE85 ); + C = m512_const1_64( 0x3C6EF3723C6EF372 ); + D = m512_const1_64( 0xA54FF53AA54FF53A ); + E = m512_const1_64( 0x510E527F510E527F ); + F = m512_const1_64( 0x9B05688C9B05688C ); + G = m512_const1_64( 0x1F83D9AB1F83D9AB ); + H = m512_const1_64( 0x5BE0CD195BE0CD19 ); + } + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j 
); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + if ( ctx->initialized ) + { + r[0] = _mm512_add_epi32( r[0], A ); + r[1] = _mm512_add_epi32( r[1], B ); + r[2] = _mm512_add_epi32( r[2], C ); + r[3] = _mm512_add_epi32( r[3], D ); + r[4] = _mm512_add_epi32( r[4], E ); + r[5] = _mm512_add_epi32( r[5], F ); + r[6] = _mm512_add_epi32( r[6], G ); + r[7] = _mm512_add_epi32( r[7], H ); + } + else + { + ctx->initialized = true; + r[0] = _mm512_add_epi32( A, m512_const1_64( 0x6A09E6676A09E667 ) ); + r[1] = _mm512_add_epi32( B, m512_const1_64( 0xBB67AE85BB67AE85 ) ); + r[2] = _mm512_add_epi32( C, m512_const1_64( 0x3C6EF3723C6EF372 ) ); + r[3] = _mm512_add_epi32( D, m512_const1_64( 0xA54FF53AA54FF53A ) ); + r[4] = _mm512_add_epi32( E, m512_const1_64( 0x510E527F510E527F ) ); + r[5] = _mm512_add_epi32( F, m512_const1_64( 0x9B05688C9B05688C ) ); + r[6] = _mm512_add_epi32( G, m512_const1_64( 0x1F83D9AB1F83D9AB ) ); + r[7] = _mm512_add_epi32( H, m512_const1_64( 0x5BE0CD195BE0CD19 ) ); + } +} + +void sha256_16way_init( sha256_16way_context *sc ) +{ + sc->initialized = false; + sc->count_high = sc->count_low = 0; +} + + +void sha256_16way_update( sha256_16way_context *sc, const void *data, + size_t len ) +{ + __m512i *vdata = (__m512i*)data; + size_t ptr; + const int buf_size = 64; + + ptr = (unsigned)sc->count_low & (buf_size - 1U); + while ( len > 0 ) + { + size_t clen; + uint32_t clow, clow2; + + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 ); + vdata = vdata + (clen>>2); + ptr += clen; + len -= clen; + if ( ptr == buf_size ) + { + sha256_16way_round( sc, sc->buf, sc->val ); + ptr = 0; + } + clow = sc->count_low; + clow2 = clow + clen; + sc->count_low = clow2; + if ( clow2 < clow ) + sc->count_high++; + } +} + +void sha256_16way_close( sha256_16way_context *sc, void *dst ) +{ + unsigned ptr; + uint32_t low, high; + const int buf_size = 64; + const int pad = buf_size - 8; + + ptr = (unsigned)sc->count_low & (buf_size - 1U); + sc->buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 ); + ptr += 4; + + if ( ptr > pad ) + { + memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); + sha256_16way_round( sc, sc->buf, sc->val ); + memset_zero_512( sc->buf, pad >> 2 ); + } + else + memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 ); + + low = sc->count_low; + high = (sc->count_high << 3) | (low >> 29); + low = low << 3; + + sc->buf[ pad >> 2 ] = + mm512_bswap_32( m512_const1_32( high ) ); + sc->buf[ ( pad+4 ) >> 2 ] = + mm512_bswap_32( m512_const1_32( low ) ); + + sha256_16way_round( sc, sc->buf, sc->val ); + + mm512_block_bswap_32( dst, sc->val ); +} + +#endif // AVX512 #endif // __AVX2__ #endif // __SSE2__ diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c index 41c3458..2cecfcc 100644 --- a/algo/sha/sha256q-4way.c +++ b/algo/sha/sha256q-4way.c @@ -15,19 +15,19 @@ void sha256q_8way_hash( void* output, const void* input ) sha256_8way_context ctx; memcpy( &ctx, &sha256_ctx8, sizeof ctx ); - sha256_8way( &ctx, input + (64<<3), 16 ); + sha256_8way_update( &ctx, input + (64<<3), 16 ); sha256_8way_close( &ctx, vhash ); sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); + sha256_8way_update( &ctx, vhash, 32 ); sha256_8way_close( &ctx, vhash ); sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); + sha256_8way_update( &ctx, vhash, 32 ); sha256_8way_close( &ctx, vhash ); sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); + sha256_8way_update( &ctx, 
vhash, 32 ); sha256_8way_close( &ctx, output ); } @@ -61,7 +61,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce, // Need big endian data mm256_bswap32_intrlv80_8x32( vdata, pdata ); sha256_8way_init( &sha256_ctx8 ); - sha256_8way( &sha256_ctx8, vdata, 64 ); + sha256_8way_update( &sha256_ctx8, vdata, 64 ); for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { @@ -108,19 +108,19 @@ void sha256q_4way_hash( void* output, const void* input ) sha256_4way_context ctx; memcpy( &ctx, &sha256_ctx4, sizeof ctx ); - sha256_4way( &ctx, input + (64<<2), 16 ); + sha256_4way_update( &ctx, input + (64<<2), 16 ); sha256_4way_close( &ctx, vhash ); sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); + sha256_4way_update( &ctx, vhash, 32 ); sha256_4way_close( &ctx, vhash ); sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); + sha256_4way_update( &ctx, vhash, 32 ); sha256_4way_close( &ctx, vhash ); sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); + sha256_4way_update( &ctx, vhash, 32 ); sha256_4way_close( &ctx, output ); } @@ -154,7 +154,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, mm128_bswap32_intrlv80_4x32( vdata, pdata ); sha256_4way_init( &sha256_ctx4 ); - sha256_4way( &sha256_ctx4, vdata, 64 ); + sha256_4way_update( &sha256_ctx4, vdata, 64 ); for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index 5c4dd68..b48633b 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -15,15 +15,15 @@ void sha256t_8way_hash( void* output, const void* input ) sha256_8way_context ctx; memcpy( &ctx, &sha256_ctx8, sizeof ctx ); - sha256_8way( &ctx, input + (64<<3), 16 ); + sha256_8way_update( &ctx, input + (64<<3), 16 ); sha256_8way_close( &ctx, vhash ); sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); + sha256_8way_update( &ctx, vhash, 32 ); sha256_8way_close( &ctx, vhash ); sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); + sha256_8way_update( &ctx, vhash, 32 ); sha256_8way_close( &ctx, output ); } @@ -59,7 +59,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, // Need big endian data mm256_bswap32_intrlv80_8x32( vdata, pdata ); sha256_8way_init( &sha256_ctx8 ); - sha256_8way( &sha256_ctx8, vdata, 64 ); + sha256_8way_update( &sha256_ctx8, vdata, 64 ); for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { @@ -101,15 +101,15 @@ void sha256t_4way_hash( void* output, const void* input ) sha256_4way_context ctx; memcpy( &ctx, &sha256_ctx4, sizeof ctx ); - sha256_4way( &ctx, input + (64<<2), 16 ); + sha256_4way_update( &ctx, input + (64<<2), 16 ); sha256_4way_close( &ctx, vhash ); sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); + sha256_4way_update( &ctx, vhash, 32 ); sha256_4way_close( &ctx, vhash ); sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); + sha256_4way_update( &ctx, vhash, 32 ); sha256_4way_close( &ctx, output ); } @@ -143,7 +143,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, mm128_bswap32_intrlv80_4x32( vdata, pdata ); sha256_4way_init( &sha256_ctx4 ); - sha256_4way( &sha256_ctx4, vdata, 64 ); + sha256_4way_update( &sha256_ctx4, vdata, 64 ); for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 3ee8194..d056da0 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -37,55 +37,57 @@ #include "sha-hash-4way.h" /* -static const sph_u64 H512[8] = { - 
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), - SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), - SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), - SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) +static const uint64_t H512[8] = +{ + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 }; */ -static const sph_u64 K512[80] = { - SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), - SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), - SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019), - SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118), - SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE), - SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2), - SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1), - SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694), - SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3), - SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65), - SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483), - SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5), - SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210), - SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4), - SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725), - SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70), - SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926), - SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF), - SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8), - SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B), - SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001), - SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30), - SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910), - SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8), - SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53), - SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8), - SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB), - SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3), - SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60), - SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC), - SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9), - SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B), - SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207), - SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178), - SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6), - SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B), - SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493), - SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C), - SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A), - SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) +static const uint64_t K512[80] = +{ + 0x428A2F98D728AE22, 0x7137449123EF65CD, + 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, + 0x3956C25BF348B538, 0x59F111F1B605D019, + 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, + 0xD807AA98A3030242, 0x12835B0145706FBE, + 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, + 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, + 0x9BDC06A725C71235, 0xC19BF174CF692694, + 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, + 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, + 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, + 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, + 0x983E5152EE66DFAB, 0xA831C66D2DB43210, + 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, + 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, +
0x06CA6351E003826F, 0x142929670A0E6E70, + 0x27B70A8546D22FFC, 0x2E1B21385C26C926, + 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, + 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, + 0x81C2C92E47EDAEE6, 0x92722C851482353B, + 0xA2BFE8A14CF10364, 0xA81A664BBC423001, + 0xC24B8B70D0F89791, 0xC76C51A30654BE30, + 0xD192E819D6EF5218, 0xD69906245565A910, + 0xF40E35855771202A, 0x106AA07032BBD1B8, + 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, + 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, + 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, + 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, + 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, + 0x84C87814A1F0AB72, 0x8CC702081A6439EC, + 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, + 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, + 0xCA273ECEEA26619C, 0xD186B8C721C0C207, + 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, + 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, + 0x113F9804BEF90DAE, 0x1B710B35131C471B, + 0x28DB77F523047D84, 0x32CAAB7B40C72493, + 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, + 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, + 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 }; diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index c296f8c..0efec0b 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -97,7 +97,7 @@ void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void shabal512_4way_init( void *cc ); void shabal512_4way_update( void *cc, const void *data, size_t len ); -#define shabal512_4way shabal512_4way_update +//#define shabal512_4way shabal512_4way_update void shabal512_4way_close( void *cc, void *dst ); void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst ); diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c new file mode 100644 index 0000000..a8b439c --- /dev/null +++ b/algo/shavite/shavite-hash-4way.c @@ -0,0 +1,399 @@ +#include "shavite-hash-4way.h" +#include + +static const uint32_t IV512[] = +{ + 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, + 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, + 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, + 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A +}; + +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#define mm512_ror2x512hi_1x32( a, b ) \ + _mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \ + mm512_ror128_32( b ) ) + +static void +c512_4way( shavite512_4way_context *ctx, const void *msg ) +{ + register __m512i X; + register __m512i P0, P1, P2, P3; + register __m512i K0, K1, K2, K3, K4, K5, K6, K7; + __m512i *M = (__m512i*)msg; + __m512i *H = (__m512i*)ctx->h; + int r; + + P0 = H[0]; + P1 = H[1]; + P2 = H[2]; + P3 = H[3]; + + K0 = M[0]; + K1 = M[1]; + K2 = M[2]; + K3 = M[3]; + K4 = M[4]; + K5 = M[5]; + K6 = M[6]; + K7 = M[7]; + + X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P0 = _mm512_xor_si512( P0, X ); + + X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + + P2 = _mm512_xor_si512( P2, X ); + + // round + for ( r = 0; r < 3; r ++ ) + { + // round 1, 5, 9 + + K0 = _mm512_xor_si512( K7, mm512_ror128_32( + 
_mm512_aesenc_epi128( K0, m512_zero ) ) ); + + if ( r == 0 ) + K0 = _mm512_xor_si512( K0, _mm512_set4_epi32( + ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) ); + + X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); + K1 = _mm512_xor_si512( K0, + mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); + + if ( r == 1 ) + K1 = _mm512_xor_si512( K1, _mm512_set4_epi32( + ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) ); + + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + K2 = _mm512_xor_si512( K1, + mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + K3 = _mm512_xor_si512( K2, + mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P3 = _mm512_xor_si512( P3, X ); + + K4 = _mm512_xor_si512( K3, + mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); + K5 = _mm512_xor_si512( K4, + mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + K6 = _mm512_xor_si512( K5, + mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + K7 = _mm512_xor_si512( K6, + mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); + + if ( r == 2 ) + K7 = _mm512_xor_si512( K7, _mm512_set4_epi32( + ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) ); + + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + P1 = _mm512_xor_si512( P1, X ); + + // round 2, 6, 10 + + K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero ); + K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P2 = _mm512_xor_si512( P2, X ); + + K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero ); + K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + + P0 = _mm512_xor_si512( P0, X ); + + // round 3, 7, 11 + + K0 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero ); + K1 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + K2 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + K3 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P1 = _mm512_xor_si512( P1, X ); + + K4 = 
_mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero ); + K5 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + K6 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K6, m512_zero ) ), K5 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + K7 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + + P3 = _mm512_xor_si512( P3, X ); + + // round 4, 8, 12 + + K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero ); + K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P0 = _mm512_xor_si512( P0, X ); + + K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero ); + K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + + P2 = _mm512_xor_si512( P2, X ); + } + + // round 13 + + K0 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); + K1 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + K2 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + K3 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P3 = _mm512_xor_si512( P3, X ); + + K4 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); + K5 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + + K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); + K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32( + ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); + + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + K7= _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + + P1 = _mm512_xor_si512( P1, X ); + + H[0] = _mm512_xor_si512( H[0], P2 ); + H[1] = _mm512_xor_si512( H[1], P3 ); + H[2] = _mm512_xor_si512( H[2], P0 ); + H[3] = _mm512_xor_si512( H[3], P1 ); +} + +void shavite512_4way_init( shavite512_4way_context *ctx ) +{ + __m512i *h = 
(__m512i*)ctx->h; + __m128i *iv = (__m128i*)IV512; + + h[0] = m512_const1_128( iv[0] ); + h[1] = m512_const1_128( iv[1] ); + h[2] = m512_const1_128( iv[2] ); + h[3] = m512_const1_128( iv[3] ); + + ctx->ptr = 0; + ctx->count0 = 0; + ctx->count1 = 0; + ctx->count2 = 0; + ctx->count3 = 0; +} + +// not tested, use update_close +void shavite512_4way_update( shavite512_4way_context *ctx, const void *data, + size_t len ) +{ + unsigned char *buf = ctx->buf; + size_t ptr = ctx->ptr; + + while ( len > 0 ) + { + size_t clen; + + clen = (sizeof ctx->buf) - ptr; + if ( clen > len << 2 ) + clen = len << 2; + memcpy( buf + ptr, data, clen ); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= clen >> 2; + if ( ptr == sizeof ctx->buf ) + { + if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 ) + { + ctx->count1 = ctx->count1 + 1; + if ( ctx->count1 == 0 ) + { + ctx->count2 = ctx->count2 + 1; + if ( ctx->count2 == 0 ) + ctx->count3 = ctx->count3 + 1; + } + } + c512_4way( ctx, buf ); + ptr = 0; + } + } + ctx->ptr = ptr; +} + +// not tested +void shavite512_4way_close( shavite512_4way_context *ctx, void *dst ) +{ + unsigned char *buf; + union + { + uint32_t u32[4]; + uint16_t u16[8]; + } count; + + buf = ctx->buf; + uint32_t vp = ctx->ptr>>6; + + // Terminating byte then zero pad + casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 ); + + // Zero pad full vectors up to count + for ( ; vp < 6; vp++ ) + casti_m512i( buf, vp ) = m512_zero; + + // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200 + // Count is misaligned to 16 bits and straddles a vector. + // Use u32 overlay to stage then u16 to load buf. + count.u32[0] = ctx->count0 += (ctx->ptr << 1); // ptr/4 * 8 + count.u32[1] = ctx->count1; + count.u32[2] = ctx->count2; + count.u32[3] = ctx->count3; + + casti_m512i( buf, 6 ) = m512_const1_128( + _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); + casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16( + 0x0200, count.u16[7], count.u16[6], count.u16[5], + count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); + + c512_4way( ctx, buf); + + casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 ); + casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 ); + casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 ); + casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 ); +} + +void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst, + const void *data, size_t len ) +{ + unsigned char *buf = ctx->buf; + size_t ptr = ctx->ptr; + + // process full blocks and load buf with remainder. + while ( len > 0 ) + { + size_t clen; + + clen = (sizeof ctx->buf) - ptr; + if ( clen > len << 2 ) + clen = len << 2; + memcpy( buf + ptr, data, clen ); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= (clen >> 2); + if ( ptr == sizeof ctx->buf ) + { + if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 ) + { + ctx->count1 = ctx->count1 + 1; + if ( ctx->count1 == 0 ) + { + ctx->count2 = ctx->count2 + 1; + if ( ctx->count2 == 0 ) + ctx->count3 = ctx->count3 + 1; + } + } + c512_4way( ctx, buf ); + ptr = 0; + } + } + + uint32_t vp = ptr>>6; + // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200 + // Count is misaligned to 16 bits and straddles 2 vectors. + // Use u32 overlay to stage then u16 to load buf. + union + { + uint32_t u32[4]; + uint16_t u16[8]; + } count; + + count.u32[0] = ctx->count0 += (ptr << 1); // ptr/4 * 8 + count.u32[1] = ctx->count1; + count.u32[2] = ctx->count2; + count.u32[3] = ctx->count3; + + if ( vp == 0 ) // empty buf, xevan. 
+ { + casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 ); + memset_zero_512( (__m512i*)buf + 1, 5 ); + ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; + } + else // half full buf, everyone else. + { + casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 ); + memset_zero_512( (__m512i*)buf + vp, 6 - vp ); + } + + casti_m512i( buf, 6 ) = m512_const1_128( + _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); + casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16( + 0x0200, count.u16[7], count.u16[6], count.u16[5], + count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); + + c512_4way( ctx, buf); + + casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 ); + casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 ); + casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 ); + casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 ); +} + +#endif // VAES diff --git a/algo/shavite/shavite-hash-4way.h b/algo/shavite/shavite-hash-4way.h new file mode 100644 index 0000000..c179566 --- /dev/null +++ b/algo/shavite/shavite-hash-4way.h @@ -0,0 +1,25 @@ +#ifndef SHAVITE_HASH_4WAY_H__ +#define SHAVITE_HASH_4WAY_H__ 1 + +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#include "simd-utils.h" + +typedef struct { + unsigned char buf[128<<2]; + uint32_t h[16<<2]; + size_t ptr; + uint32_t count0, count1, count2, count3; +} shavite512_4way_context __attribute__ ((aligned (64))); + +void shavite512_4way_init( shavite512_4way_context *ctx ); +void shavite512_4way_update( shavite512_4way_context *ctx, const void *data, + size_t len ); +void shavite512_4way_close( shavite512_4way_context *ctx, void *dst ); +void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst, + const void *data, size_t len ); + +#endif // VAES + +#endif // SHAVITE_HASH_4WAY_H__ + diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index a992789..c040e15 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -18,76 +18,18 @@ void skeinhash_8way( void *state, const void *input ) uint64_t vhash64[8*8] __attribute__ ((aligned (128))); skein512_8way_context ctx_skein; -//#if defined(__SHA__) -// uint32_t hash0[16] __attribute__ ((aligned (64))); -// uint32_t hash1[16] __attribute__ ((aligned (64))); -// uint32_t hash2[16] __attribute__ ((aligned (64))); -// uint32_t hash3[16] __attribute__ ((aligned (64))); -// uint32_t hash4[16] __attribute__ ((aligned (64))); -// uint32_t hash5[16] __attribute__ ((aligned (64))); -// uint32_t hash6[16] __attribute__ ((aligned (64))); -// uint32_t hash7[16] __attribute__ ((aligned (64))); -// SHA256_CTX ctx_sha256; -//#else uint32_t vhash32[16*8] __attribute__ ((aligned (128))); sha256_8way_context ctx_sha256; -//#endif skein512_8way_init( &ctx_skein ); skein512_8way_update( &ctx_skein, input, 80 ); skein512_8way_close( &ctx_skein, vhash64 ); -/* -#if defined(__SHA__) - dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash64, 512 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 ); - SHA256_Final( (unsigned char*)hash0, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 ); - SHA256_Final( (unsigned char*)hash1, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 ); - SHA256_Final( (unsigned char*)hash2, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 ); - 
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash4, 64 ); - SHA256_Final( (unsigned char*)hash4, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash5, 64 ); - SHA256_Final( (unsigned char*)hash5, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash6, 64 ); - SHA256_Final( (unsigned char*)hash6, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash7, 64 ); - SHA256_Final( (unsigned char*)hash7, &ctx_sha256 ); - - intrlv_8x32( state, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, 256 ); -#else -*/ rintrlv_8x64_8x32( vhash32, vhash64, 512 ); -// dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, -// vhash64, 512 ); -// intrlv_8x32( vhash32, hash0, hash1, hash2, hash3, hash4, hash5, hash6, -// hash7, 512 ); sha256_8way_init( &ctx_sha256 ); - sha256_8way( &ctx_sha256, vhash32, 64 ); + sha256_8way_update( &ctx_sha256, vhash32, 64 ); sha256_8way_close( &ctx_sha256, state ); -//#endif } int scanhash_skein_8way( struct work *work, uint32_t max_nonce, @@ -176,7 +118,7 @@ void skeinhash_4way( void *state, const void *input ) rintrlv_4x64_4x32( vhash32, vhash64, 512 ); sha256_4way_init( &ctx_sha256 ); - sha256_4way( &ctx_sha256, vhash32, 64 ); + sha256_4way_update( &ctx_sha256, vhash32, 64 ); sha256_4way_close( &ctx_sha256, state ); #endif } diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h index 4f828a1..3f58e95 100644 --- a/algo/skein/skein-hash-4way.h +++ b/algo/skein/skein-hash-4way.h @@ -93,12 +93,12 @@ typedef sph_skein_4way_big_context skein256_4way_context; void skein512_4way_init( skein512_4way_context *sc ); void skein512_4way_update( void *cc, const void *data, size_t len ); void skein512_4way_close( void *cc, void *dst ); -#define skein512_4way skein512_4way_update +//#define skein512_4way skein512_4way_update void skein256_4way_init( skein256_4way_context *sc ); void skein256_4way_update( void *cc, const void *data, size_t len ); void skein256_4way_close( void *cc, void *dst ); -#define skein256_4way skein256_4way_update +//#define skein256_4way skein256_4way_update #ifdef __cplusplus } diff --git a/algo/skein/skein2-4way.c b/algo/skein/skein2-4way.c index a51508b..b2a7962 100644 --- a/algo/skein/skein2-4way.c +++ b/algo/skein/skein2-4way.c @@ -68,11 +68,11 @@ void skein2hash_4way( void *output, const void *input ) uint64_t hash[16*4] __attribute__ ((aligned (64))); skein512_4way_init( &ctx ); - skein512_4way( &ctx, input, 80 ); + skein512_4way_update( &ctx, input, 80 ); skein512_4way_close( &ctx, hash ); skein512_4way_init( &ctx ); - skein512_4way( &ctx, hash, 64 ); + skein512_4way_update( &ctx, hash, 64 ); skein512_4way_close( &ctx, output ); } diff --git a/algo/sm3/sm3-hash-4way.c b/algo/sm3/sm3-hash-4way.c index f900aba..6e17d1b 100644 --- a/algo/sm3/sm3-hash-4way.c +++ b/algo/sm3/sm3-hash-4way.c @@ -50,41 +50,138 @@ #include #include "sm3-hash-4way.h" -#ifdef __SSE4_2__ +#ifdef __AVX2__ -void sm3_4way_init( sm3_4way_ctx_t *ctx ) +#define P0_8W(x) \ + _mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x, 9 ), \ + mm256_rol_32( x, 17 ) ) ) + +#define P1_8W(x) \ + _mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x, 15 ), \ + mm256_rol_32( x, 23 ) ) ) + +#define FF0_8W(x,y,z) \ + _mm256_xor_si256( x, _mm256_xor_si256( y, z ) ) + +#define FF1_8W(x,y,z) \ + _mm256_or_si256( _mm256_or_si256( _mm256_and_si256( x, y ), 
\ + _mm256_and_si256( x, z ) ), \ + _mm256_and_si256( y, z ) ) + +#define GG0_8W(x,y,z) FF0_8W(x,y,z) + +#define GG1_8W(x,y,z) \ + _mm256_or_si256( _mm256_and_si256( x, y ), \ + _mm256_andnot_si256( x, z ) ) + +void sm3_8way_compress( __m256i *digest, __m256i *block ) { - ctx->digest[0] = _mm_set1_epi32( 0x7380166F ); - ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 ); - ctx->digest[2] = _mm_set1_epi32( 0x172442D7 ); - ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 ); - ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC ); - ctx->digest[5] = _mm_set1_epi32( 0x163138AA ); - ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D ); - ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E ); - ctx->nblocks = 0; - ctx->num = 0; + __m256i W[68], W1[64]; + __m256i A = digest[ 0 ]; + __m256i B = digest[ 1 ]; + __m256i C = digest[ 2 ]; + __m256i D = digest[ 3 ]; + __m256i E = digest[ 4 ]; + __m256i F = digest[ 5 ]; + __m256i G = digest[ 6 ]; + __m256i H = digest[ 7 ]; + __m256i SS1, SS2, TT1, TT2, T; + int j; + + for ( j = 0; j < 16; j++ ) + W[j] = mm256_bswap_32( block[j] ); + + for ( j = 16; j < 68; j++ ) + W[j] = _mm256_xor_si256( P1_8W( _mm256_xor_si256( + _mm256_xor_si256( W[ j-16 ], W[ j-9 ] ), + mm256_rol_32( W[ j-3 ], 15 ) ) ), + _mm256_xor_si256( mm256_rol_32( W[ j-13 ], 7 ), W[ j-6 ] ) ); + + for( j = 0; j < 64; j++ ) + W1[j] = _mm256_xor_si256( W[j], W[j+4] ); + + T = _mm256_set1_epi32( 0x79CC4519UL ); + for( j =0; j < 16; j++ ) + { + SS1 = mm256_rol_32( _mm256_add_epi32( E, _mm256_add_epi32( + mm256_rol_32( A, 12 ), mm256_rol_var_32( T, j ) ) ), 7 ); + SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) ); + TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( + FF0_8W( A, B, C ), D ), SS2 ), W1[j] ); + TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( + GG0_8W( E, F, G ), H ), SS1 ), W[j] ); + D = C; + C = mm256_rol_32( B, 9 ); + B = A; + A = TT1; + H = G; + G = mm256_rol_32( F, 19 ); + F = E; + E = P0_8W( TT2 ); + } + + T = _mm256_set1_epi32( 0x7A879D8AUL ); + for( j =16; j < 64; j++ ) + { + SS1 = mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32( + mm256_rol_32(A,12), E ), mm256_rol_var_32( T, j&31 ) ), 7 ); + SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) ); + TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( + FF1_8W( A, B, C ), D ), SS2 ), W1[j] ); + TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( + GG1_8W( E, F, G ), H ), SS1 ), W[j] ); + D = C; + C = mm256_rol_32( B, 9 ); + B = A; + A = TT1; + H = G; + G = mm256_rol_32( F, 19 ); + F = E; + E = P0_8W( TT2 ); + } + + digest[0] = _mm256_xor_si256( digest[0], A ); + digest[1] = _mm256_xor_si256( digest[1], B ); + digest[2] = _mm256_xor_si256( digest[2], C ); + digest[3] = _mm256_xor_si256( digest[3], D ); + digest[4] = _mm256_xor_si256( digest[4], E ); + digest[5] = _mm256_xor_si256( digest[5], F ); + digest[6] = _mm256_xor_si256( digest[6], G ); + digest[7] = _mm256_xor_si256( digest[7], H ); } -void sm3_4way( void *cc, const void *data, size_t len ) +void sm3_8way_init( sm3_8way_ctx_t *ctx ) { - sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc; - __m128i *block = (__m128i*)ctx->block; - __m128i *vdata = (__m128i*)data; + ctx->digest[0] = _mm256_set1_epi32( 0x7380166F ); + ctx->digest[1] = _mm256_set1_epi32( 0x4914B2B9 ); + ctx->digest[2] = _mm256_set1_epi32( 0x172442D7 ); + ctx->digest[3] = _mm256_set1_epi32( 0xDA8A0600 ); + ctx->digest[4] = _mm256_set1_epi32( 0xA96F30BC ); + ctx->digest[5] = _mm256_set1_epi32( 0x163138AA ); + ctx->digest[6] = _mm256_set1_epi32( 0xE38DEE4D ); + ctx->digest[7] = _mm256_set1_epi32( 0xB0FB0E4E ); + 
ctx->nblocks = 0; + ctx->num = 0; +} +void sm3_8way_update( void *cc, const void *data, size_t len ) +{ + sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc; + __m256i *block = (__m256i*)ctx->block; + __m256i *vdata = (__m256i*)data; if ( ctx->num ) { unsigned int left = SM3_BLOCK_SIZE - ctx->num; if ( len < left ) { - memcpy_128( block + (ctx->num >> 2), vdata , len>>2 ); + memcpy_256( block + (ctx->num >> 2), vdata , len>>2 ); ctx->num += len; return; } else { - memcpy_128( block + (ctx->num >> 2), vdata , left>>2 ); - sm3_4way_compress( ctx->digest, block ); + memcpy_256( block + (ctx->num >> 2), vdata , left>>2 ); + sm3_8way_compress( ctx->digest, block ); ctx->nblocks++; vdata += left>>2; len -= left; @@ -92,49 +189,53 @@ void sm3_4way( void *cc, const void *data, size_t len ) } while ( len >= SM3_BLOCK_SIZE ) { - sm3_4way_compress( ctx->digest, vdata ); + sm3_8way_compress( ctx->digest, vdata ); ctx->nblocks++; vdata += SM3_BLOCK_SIZE>>2; len -= SM3_BLOCK_SIZE; } ctx->num = len; if ( len ) - memcpy_128( block, vdata, len>>2 ); + memcpy_256( block, vdata, len>>2 ); } -void sm3_4way_close( void *cc, void *dst ) +void sm3_8way_close( void *cc, void *dst ) { - sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc; - __m128i *hash = (__m128i*)dst; - __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) ); - __m128i *block = (__m128i*)ctx->block; + sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc; + __m256i *hash = (__m256i*)dst; + __m256i *count = (__m256i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) ); + __m256i *block = (__m256i*)ctx->block; int i; - block[ctx->num] = _mm_set1_epi32( 0x80 ); + block[ctx->num] = _mm256_set1_epi32( 0x80 ); if ( ctx->num + 8 <= SM3_BLOCK_SIZE ) { - memset_zero_128( block + (ctx->num >> 2) + 1, - ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 ); + memset_zero_256( block + (ctx->num >> 2) + 1, + ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 ); } else { - memset_zero_128( block + (ctx->num >> 2) + 1, + memset_zero_256( block + (ctx->num >> 2) + 1, ( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) ); - sm3_4way_compress( ctx->digest, block ); - memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); + sm3_8way_compress( ctx->digest, block ); + memset_zero_256( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); } - count[0] = mm128_bswap_32( - _mm_set1_epi32( ctx->nblocks >> 23 ) ); - count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + + count[0] = mm256_bswap_32( + _mm256_set1_epi32( ctx->nblocks >> 23 ) ); + count[1] = mm256_bswap_32( _mm256_set1_epi32( ( ctx->nblocks << 9 ) + ( ctx->num << 3 ) ) ); - sm3_4way_compress( ctx->digest, block ); + sm3_8way_compress( ctx->digest, block ); for ( i = 0; i < 8 ; i++ ) - hash[i] = mm128_bswap_32( ctx->digest[i] ); + hash[i] = mm256_bswap_32( ctx->digest[i] ); } +#endif + +#if defined(__SSE2__) + #define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \ mm128_rol_32( x, 17 ) ) ) #define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \ @@ -227,5 +328,88 @@ void sm3_4way_compress( __m128i *digest, __m128i *block ) digest[7] = _mm_xor_si128( digest[7], H ); } +void sm3_4way_init( sm3_4way_ctx_t *ctx ) +{ + ctx->digest[0] = _mm_set1_epi32( 0x7380166F ); + ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 ); + ctx->digest[2] = _mm_set1_epi32( 0x172442D7 ); + ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 ); + ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC ); + ctx->digest[5] = _mm_set1_epi32( 0x163138AA ); + ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D ); + ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E ); + ctx->nblocks = 0; + ctx->num = 0; +} 
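+// Note on the 4-way update below: len counts bytes per lane. With 4
+// interleaved lanes, every 4 lane-bytes fill one __m128i, which is why
+// vector copies and buffer indexing shift the byte counts right by 2.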
+ +void sm3_4way_update( void *cc, const void *data, size_t len ) +{ + sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc; + __m128i *block = (__m128i*)ctx->block; + __m128i *vdata = (__m128i*)data; + + if ( ctx->num ) + { + unsigned int left = SM3_BLOCK_SIZE - ctx->num; + if ( len < left ) + { + memcpy_128( block + (ctx->num >> 2), vdata , len>>2 ); + ctx->num += len; + return; + } + else + { + memcpy_128( block + (ctx->num >> 2), vdata , left>>2 ); + sm3_4way_compress( ctx->digest, block ); + ctx->nblocks++; + vdata += left>>2; + len -= left; + } + } + while ( len >= SM3_BLOCK_SIZE ) + { + sm3_4way_compress( ctx->digest, vdata ); + ctx->nblocks++; + vdata += SM3_BLOCK_SIZE>>2; + len -= SM3_BLOCK_SIZE; + } + ctx->num = len; + if ( len ) + memcpy_128( block, vdata, len>>2 ); +} + +void sm3_4way_close( void *cc, void *dst ) +{ + sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc; + __m128i *hash = (__m128i*)dst; + __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) ); + __m128i *block = (__m128i*)ctx->block; + int i; + + block[ctx->num] = _mm_set1_epi32( 0x80 ); + + if ( ctx->num + 8 <= SM3_BLOCK_SIZE ) + { + memset_zero_128( block + (ctx->num >> 2) + 1, + ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 ); + } + else + { + memset_zero_128( block + (ctx->num >> 2) + 1, + ( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) ); + sm3_4way_compress( ctx->digest, block ); + memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); + } + + count[0] = mm128_bswap_32( + _mm_set1_epi32( ctx->nblocks >> 23 ) ); + count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + + ( ctx->num << 3 ) ) ); + sm3_4way_compress( ctx->digest, block ); + + for ( i = 0; i < 8 ; i++ ) + hash[i] = mm128_bswap_32( ctx->digest[i] ); +} + #endif diff --git a/algo/sm3/sm3-hash-4way.h b/algo/sm3/sm3-hash-4way.h index 06159d8..abe1dfd 100644 --- a/algo/sm3/sm3-hash-4way.h +++ b/algo/sm3/sm3-hash-4way.h @@ -48,14 +48,13 @@ */ #ifndef SPH_SM3_HASH_4WAY_H -#define SPH_SM3_HASH_4WAY_H +#define SPH_SM3_HASH_4WAY_H 1 #define SM3_DIGEST_LENGTH 32 #define SM3_BLOCK_SIZE 64 #define SM3_CBLOCK (SM3_BLOCK_SIZE) #define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH) - #include #include #include @@ -65,7 +64,6 @@ extern "C" { #endif - typedef struct { __m128i block[16] __attribute__ ((aligned (64))); __m128i digest[8]; @@ -74,15 +72,24 @@ typedef struct { } sm3_4way_ctx_t; void sm3_4way_init( sm3_4way_ctx_t *ctx ); -//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data, -// size_t data_len ); -//void sm3_4way_final( sm3_4way_ctx_t *ctx, -// unsigned char digest[SM3_DIGEST_LENGTH] ); -void sm3_4way_compress( __m128i *digest, __m128i *block ); - -void sm3_4way(void *cc, const void *data, size_t len); +void sm3_4way_update(void *cc, const void *data, size_t len); void sm3_4way_close(void *cc, void *dst); +#if defined(__AVX2__) + +typedef struct { + __m256i block[16] __attribute__ ((aligned (64))); + __m256i digest[8]; + uint32_t nblocks; + uint32_t num; +} sm3_8way_ctx_t; + +void sm3_8way_init( sm3_8way_ctx_t *ctx ); +void sm3_8way_update(void *cc, const void *data, size_t len); +void sm3_8way_close(void *cc, void *dst); + +#endif + #ifdef __cplusplus } #endif diff --git a/algo/x11/c11-4way.c b/algo/x11/c11-4way.c index fcae00c..529bac4 100644 --- a/algo/x11/c11-4way.c +++ b/algo/x11/c11-4way.c @@ -14,21 +14,32 @@ #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + 
#include "algo/echo/echo-hash-4way.h" +#endif #if defined (C11_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; hashState_echo echo; +#endif } c11_8way_ctx_holder; c11_8way_ctx_holder c11_8way_ctx; @@ -37,22 +48,28 @@ void init_c11_8way_ctx() { blake512_8way_init( &c11_8way_ctx.blake ); bmw512_8way_init( &c11_8way_ctx.bmw ); - init_groestl( &c11_8way_ctx.groestl, 64 ); skein512_8way_init( &c11_8way_ctx.skein ); jh512_8way_init( &c11_8way_ctx.jh ); keccak512_8way_init( &c11_8way_ctx.keccak ); luffa_4way_init( &c11_8way_ctx.luffa, 512 ); cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &c11_8way_ctx.shavite ); simd_4way_init( &c11_8way_ctx.simd, 512 ); +#if defined(__VAES__) + groestl512_4way_init( &c11_8way_ctx.groestl, 64 ); + shavite512_4way_init( &c11_8way_ctx.shavite ); + echo_4way_init( &c11_8way_ctx.echo, 512 ); +#else + init_groestl( &c11_8way_ctx.groestl, 64 ); + sph_shavite512_init( &c11_8way_ctx.shavite ); init_echo( &c11_8way_ctx.echo, 512 ); +#endif } void c11_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -72,11 +89,21 @@ void c11_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); - // Serial +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - // 3 Groestl update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); @@ -93,10 +120,11 @@ void c11_8way_hash( void *state, const void *input ) memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - // 4way intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + // 4 JH jh512_8way_update( &ctx.jh, vhash, 64 ); jh512_8way_close( &ctx.jh, vhash ); @@ -109,20 +137,27 @@ void c11_8way_hash( void *state, const void *input ) skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( 
&ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - // 9 Shavite sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); memcpy( &ctx.shavite, &c11_8way_ctx.shavite, @@ -154,16 +189,29 @@ void c11_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - // 10 Simd - intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 ); - intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 512 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 512 ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); - // 11 Echo +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) ); @@ -188,6 +236,8 @@ void c11_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); +#endif + memcpy( state, hash0, 32 ); memcpy( state+ 32, hash1, 32 ); memcpy( state+ 64, hash2, 32 ); @@ -282,11 +332,11 @@ void c11_4way_hash( void *state, const void *input ) memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) ); // 1 Blake 4way - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -305,15 +355,15 @@ void c11_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 5 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // 6 Skein 
- skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // Serial diff --git a/algo/x11/c11-gate.c b/algo/x11/c11-gate.c index be0750e..f9d50ce 100644 --- a/algo/x11/c11-gate.c +++ b/algo/x11/c11-gate.c @@ -15,7 +15,7 @@ bool register_c11_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_c11; gate->hash = (void*)&c11_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c index d1f51c5..94d36f7 100644 --- a/algo/x11/timetravel-4way.c +++ b/algo/x11/timetravel-4way.c @@ -84,13 +84,13 @@ void timetravel_4way_hash(void *output, const void *input) switch ( permutation[i] ) { case 0: - blake512_4way( &ctx.blake, vhashA, dataLen ); + blake512_4way_update( &ctx.blake, vhashA, dataLen ); blake512_4way_close( &ctx.blake, vhashB ); if ( i == 7 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 1: - bmw512_4way( &ctx.bmw, vhashA, dataLen ); + bmw512_4way_update( &ctx.bmw, vhashA, dataLen ); bmw512_4way_close( &ctx.bmw, vhashB ); if ( i == 7 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); @@ -112,19 +112,19 @@ void timetravel_4way_hash(void *output, const void *input) intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); break; case 3: - skein512_4way( &ctx.skein, vhashA, dataLen ); + skein512_4way_update( &ctx.skein, vhashA, dataLen ); skein512_4way_close( &ctx.skein, vhashB ); if ( i == 7 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 4: - jh512_4way( &ctx.jh, vhashA, dataLen ); + jh512_4way_update( &ctx.jh, vhashA, dataLen ); jh512_4way_close( &ctx.jh, vhashB ); if ( i == 7 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 5: - keccak512_4way( &ctx.keccak, vhashA, dataLen ); + keccak512_4way_update( &ctx.keccak, vhashA, dataLen ); keccak512_4way_close( &ctx.keccak, vhashB ); if ( i == 7 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c index f4c016d..9353124 100644 --- a/algo/x11/timetravel10-4way.c +++ b/algo/x11/timetravel10-4way.c @@ -90,13 +90,13 @@ void timetravel10_4way_hash(void *output, const void *input) switch ( permutation[i] ) { case 0: - blake512_4way( &ctx.blake, vhashA, dataLen ); + blake512_4way_update( &ctx.blake, vhashA, dataLen ); blake512_4way_close( &ctx.blake, vhashB ); if ( i == 9 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 1: - bmw512_4way( &ctx.bmw, vhashA, dataLen ); + bmw512_4way_update( &ctx.bmw, vhashA, dataLen ); bmw512_4way_close( &ctx.bmw, vhashB ); if ( i == 9 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); @@ -118,19 +118,19 @@ void timetravel10_4way_hash(void *output, const void *input) intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); break; case 3: - skein512_4way( &ctx.skein, vhashA, dataLen ); + skein512_4way_update( &ctx.skein, vhashA, dataLen ); skein512_4way_close( &ctx.skein, vhashB ); if ( i == 9 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 4: - jh512_4way( &ctx.jh, vhashA, dataLen ); + jh512_4way_update( &ctx.jh, vhashA, dataLen ); jh512_4way_close( &ctx.jh, vhashB ); if ( i == 9 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 5: - keccak512_4way( &ctx.keccak, 
vhashA, dataLen ); + keccak512_4way_update( &ctx.keccak, vhashA, dataLen ); keccak512_4way_close( &ctx.keccak, vhashB ); if ( i == 9 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); diff --git a/algo/x11/tribus-4way.c b/algo/x11/tribus-4way.c index 4be0286..df49600 100644 --- a/algo/x11/tribus-4way.c +++ b/algo/x11/tribus-4way.c @@ -6,6 +6,9 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(TRIBUS_8WAY) @@ -14,6 +17,8 @@ static __thread jh512_8way_context ctx_mid; void tribus_hash_8way( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -24,7 +29,11 @@ void tribus_hash_8way( void *state, const void *input ) uint64_t hash7[8] __attribute__ ((aligned (64))); jh512_8way_context ctx_jh; keccak512_8way_context ctx_keccak; +#if defined(__VAES__) + echo_4way_context ctx_echo; +#else hashState_echo ctx_echo; +#endif memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) ); jh512_8way_update( &ctx_jh, input + (64<<3), 16 ); @@ -34,10 +43,23 @@ void tribus_hash_8way( void *state, const void *input ) keccak512_8way_update( &ctx_keccak, vhash, 64 ); keccak512_8way_close( &ctx_keccak, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + echo_4way_init( &ctx_echo, 512 ); + echo_4way_update_close( &ctx_echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx_echo, 512 ); + echo_4way_update_close( &ctx_echo, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + +#else + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); - // hash echo serially init_echo( &ctx_echo, 512 ); update_final_echo( &ctx_echo, (BitSequence *) hash0, (const BitSequence *) hash0, 512 ); @@ -63,6 +85,8 @@ void tribus_hash_8way( void *state, const void *input ) update_final_echo( &ctx_echo, (BitSequence *) hash7, (const BitSequence *) hash7, 512 ); +#endif + memcpy( state, hash0, 32 ); memcpy( state+32, hash1, 32 ); memcpy( state+64, hash2, 32 ); diff --git a/algo/x11/tribus-gate.c b/algo/x11/tribus-gate.c index 794ec31..3d8d171 100644 --- a/algo/x11/tribus-gate.c +++ b/algo/x11/tribus-gate.c @@ -2,7 +2,7 @@ bool register_tribus_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; #if defined (TRIBUS_8WAY) gate->scanhash = (void*)&scanhash_tribus_8way; gate->hash = (void*)&tribus_hash_8way; diff --git a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c index a30cbc0..2fe47a7 100644 --- a/algo/x11/x11-4way.c +++ b/algo/x11/x11-4way.c @@ -14,21 +14,32 @@ #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined (X11_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context 
keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; hashState_echo echo; +#endif } x11_8way_ctx_holder; x11_8way_ctx_holder x11_8way_ctx; @@ -37,22 +48,28 @@ void init_x11_8way_ctx() { blake512_8way_init( &x11_8way_ctx.blake ); bmw512_8way_init( &x11_8way_ctx.bmw ); - init_groestl( &x11_8way_ctx.groestl, 64 ); skein512_8way_init( &x11_8way_ctx.skein ); jh512_8way_init( &x11_8way_ctx.jh ); keccak512_8way_init( &x11_8way_ctx.keccak ); luffa_4way_init( &x11_8way_ctx.luffa, 512 ); cube_4way_init( &x11_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x11_8way_ctx.shavite ); simd_4way_init( &x11_8way_ctx.simd, 512 ); +#if defined(__VAES__) + groestl512_4way_init( &x11_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x11_8way_ctx.shavite ); + echo_4way_init( &x11_8way_ctx.echo, 512 ); +#else + init_groestl( &x11_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x11_8way_ctx.shavite ); init_echo( &x11_8way_ctx.echo, 512 ); +#endif } void x11_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -61,7 +78,6 @@ void x11_8way_hash( void *state, const void *input ) uint64_t hash5[8] __attribute__ ((aligned (64))); uint64_t hash6[8] __attribute__ ((aligned (64))); uint64_t hash7[8] __attribute__ ((aligned (64))); - x11_8way_ctx_holder ctx; memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) ); blake512_8way_update( &ctx.blake, input, 80 ); @@ -70,7 +86,18 @@ void x11_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); - // Serial +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -97,10 +124,11 @@ void x11_8way_hash( void *state, const void *input ) sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - // 4way intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -110,18 +138,26 @@ void x11_8way_hash( void *state, const void *input ) keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, 
vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -154,13 +190,28 @@ void x11_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); @@ -186,6 +237,8 @@ void x11_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); +#endif + memcpy( state, hash0, 32 ); memcpy( state+ 32, hash1, 32 ); memcpy( state+ 64, hash2, 32 ); @@ -282,11 +335,11 @@ void x11_4way_hash( void *state, const void *input ) memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) ); // 1 Blake 4way - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -305,15 +358,15 @@ void x11_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x11/x11-gate.c b/algo/x11/x11-gate.c index 132996a..50b5480 100644 --- a/algo/x11/x11-gate.c +++ b/algo/x11/x11-gate.c @@ -15,7 +15,7 @@ bool register_x11_algo( 
algo_gate_t *gate ) gate->scanhash = (void*)&scanhash_x11; gate->hash = (void*)&x11_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT ; return true; }; diff --git a/algo/x11/x11evo-4way.c b/algo/x11/x11evo-4way.c index 8fe1512..11e5366 100644 --- a/algo/x11/x11evo-4way.c +++ b/algo/x11/x11evo-4way.c @@ -85,12 +85,12 @@ void x11evo_4way_hash( void *state, const void *input ) switch ( idx ) { case 0: - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); break; case 1: - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); if ( i >= len-1 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); @@ -112,19 +112,19 @@ void x11evo_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); break; case 3: - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); if ( i >= len-1 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); break; case 4: - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); if ( i >= len-1 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); break; case 5: - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); if ( i >= len-1 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); diff --git a/algo/x11/x11gost-4way.c b/algo/x11/x11gost-4way.c index f3713d7..3cf5b67 100644 --- a/algo/x11/x11gost-4way.c +++ b/algo/x11/x11gost-4way.c @@ -15,22 +15,33 @@ #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined (X11GOST_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; sph_gost512_context gost; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; hashState_echo echo; +#endif } x11gost_8way_ctx_holder; x11gost_8way_ctx_holder x11gost_8way_ctx; @@ -39,21 +50,29 @@ void init_x11gost_8way_ctx() { blake512_8way_init( &x11gost_8way_ctx.blake ); bmw512_8way_init( &x11gost_8way_ctx.bmw ); - init_groestl( &x11gost_8way_ctx.groestl, 64 ); skein512_8way_init( &x11gost_8way_ctx.skein ); jh512_8way_init( &x11gost_8way_ctx.jh ); keccak512_8way_init( &x11gost_8way_ctx.keccak ); sph_gost512_init( &x11gost_8way_ctx.gost ); luffa_4way_init( &x11gost_8way_ctx.luffa, 512 ); cube_4way_init( &x11gost_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x11gost_8way_ctx.shavite ); simd_4way_init( &x11gost_8way_ctx.simd, 512 ); +#if defined(__VAES__) + groestl512_4way_init( &x11gost_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x11gost_8way_ctx.shavite ); + echo_4way_init( &x11gost_8way_ctx.echo, 512 ); 
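/*
 * Every converted algorithm swaps its AES-class members at compile
 * time and mirrors the same #if in its init function, as the two
 * branches around this point show. A minimal sketch of the holder
 * shape, using the member types named in this patch:
 */
#if 0
typedef struct {
#if defined(__VAES__)
   groestl512_4way_context groestl;   /* four lanes per call          */
   shavite512_4way_context shavite;
   echo_4way_context       echo;
#else
   hashState_groestl       groestl;   /* one lane, cycled eight times */
   sph_shavite512_context  shavite;
   hashState_echo          echo;
#endif
} aes_stage_contexts;
#endif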
+#else + init_groestl( &x11gost_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x11gost_8way_ctx.shavite ); init_echo( &x11gost_8way_ctx.echo, 512 ); +#endif } void x11gost_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -72,7 +91,18 @@ void x11gost_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); - // Serial +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -99,10 +129,11 @@ void x11gost_8way_hash( void *state, const void *input ) sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - // 4way intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -140,20 +171,28 @@ void x11gost_8way_hash( void *state, const void *input ) sph_gost512( &ctx.gost, hash7, 64 ); sph_gost512_close( &ctx.gost, hash7 ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); - // Luffa + Cube - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - sph_shavite512( &ctx.shavite, hash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite, sizeof(sph_shavite512_context) ); @@ -184,14 +223,29 @@ sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); - simd_4way_init( &ctx.simd, 512 ); -
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) ); @@ -216,6 +270,8 @@ void x11gost_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); +#endif + memcpy( state, hash0, 32 ); memcpy( state+ 32, hash1, 32 ); memcpy( state+ 64, hash2, 32 ); @@ -310,10 +366,10 @@ void x11gost_4way_hash( void *state, const void *input ) x11gost_4way_ctx_holder ctx; memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -333,13 +389,13 @@ void x11gost_4way_hash( void *state, const void *input ) // 4way intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // Serial diff --git a/algo/x12/x12-4way.c b/algo/x12/x12-4way.c index ed4d131..8ae7960 100644 --- a/algo/x12/x12-4way.c +++ b/algo/x12/x12-4way.c @@ -16,6 +16,11 @@ #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(X12_8WAY) @@ -23,16 +28,22 @@ typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } x12_8way_ctx_holder; x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64))); @@ -41,31 +52,29 @@ void init_x12_8way_ctx() { blake512_8way_init( &x12_8way_ctx.blake ); bmw512_8way_init( &x12_8way_ctx.bmw ); - init_groestl( &x12_8way_ctx.groestl, 64 ); skein512_8way_init( 
&x12_8way_ctx.skein ); jh512_8way_init( &x12_8way_ctx.jh ); keccak512_8way_init( &x12_8way_ctx.keccak ); luffa_4way_init( &x12_8way_ctx.luffa, 512 ); cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x12_8way_ctx.shavite ); simd_4way_init( &x12_8way_ctx.simd, 512 ); - init_echo( &x12_8way_ctx.echo, 512 ); hamsi512_8way_init( &x12_8way_ctx.hamsi ); +#if defined(__VAES__) + groestl512_4way_init( &x12_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x12_8way_ctx.shavite ); + echo_4way_init( &x12_8way_ctx.echo, 512 ); +#else + init_groestl( &x12_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x12_8way_ctx.shavite ); + init_echo( &x12_8way_ctx.echo, 512 ); +#endif }; void x12_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t hash4[8] __attribute__ ((aligned (64))); - uint64_t hash5[8] __attribute__ ((aligned (64))); - uint64_t hash6[8] __attribute__ ((aligned (64))); - uint64_t hash7[8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); x12_8way_ctx_holder ctx; memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) ); @@ -75,18 +84,36 @@ void x12_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -119,14 +146,35 @@ void x12_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - 
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, (const BitSequence *) hash1, 512 ); memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); @@ -174,6 +222,8 @@ void x12_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -272,10 +322,10 @@ void x12_4way_hash( void *state, const void *input ) x12_4way_ctx_holder ctx; memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -328,16 +378,16 @@ void x12_4way_hash( void *state, const void *input ) // Parallel 4way 64 bit intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 ); diff --git a/algo/x12/x12-gate.c b/algo/x12/x12-gate.c index f495747..706cf54 100644 --- a/algo/x12/x12-gate.c +++ b/algo/x12/x12-gate.c @@ -15,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x12; gate->hash = (void*)&x12hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x13/phi1612-4way.c b/algo/x13/phi1612-4way.c index 7750e75..eb143e8 100644 --- a/algo/x13/phi1612-4way.c +++ b/algo/x13/phi1612-4way.c @@ -10,6 +10,9 @@ #include "algo/fugue/sph_fugue.h" #include 
"algo/gost/sph_gost.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(PHI1612_8WAY) @@ -19,7 +22,11 @@ typedef struct { cube_4way_context cube; sph_fugue512_context fugue; sph_gost512_context gost; +#if defined(__VAES__) + echo_4way_context echo; +#else hashState_echo echo; +#endif } phi1612_8way_ctx_holder; phi1612_8way_ctx_holder phi1612_8way_ctx __attribute__ ((aligned (64))); @@ -31,7 +38,11 @@ void init_phi1612_8way_ctx() cube_4way_init( &phi1612_8way_ctx.cube, 512, 16, 32 ); sph_fugue512_init( &phi1612_8way_ctx.fugue ); sph_gost512_init( &phi1612_8way_ctx.gost ); +#if defined(__VAES__) + echo_4way_init( &phi1612_8way_ctx.echo, 512 ); +#else init_echo( &phi1612_8way_ctx.echo, 512 ); +#endif }; void phi1612_8way_hash( void *state, const void *input ) @@ -118,6 +129,19 @@ void phi1612_8way_hash( void *state, const void *input ) sph_gost512_close( &ctx.gost, hash7 ); // Echo + +#if defined(__VAES__) + + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + +#else + update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); init_echo( &ctx.echo, 512 ); @@ -142,6 +166,8 @@ void phi1612_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); +#endif + memcpy( state, hash0, 32 ); memcpy( state+ 32, hash1, 32 ); memcpy( state+ 64, hash2, 32 ); @@ -225,11 +251,11 @@ void phi1612_4way_hash( void *state, const void *input ) memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) ); // Skein parallel 4way - skein512_4way( &ctx.skein, input, 80 ); + skein512_4way_update( &ctx.skein, input, 80 ); skein512_4way_close( &ctx.skein, vhash ); // JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // Serial to the end diff --git a/algo/x13/phi1612-gate.c b/algo/x13/phi1612-gate.c index 1cfe3fa..ef3e772 100644 --- a/algo/x13/phi1612-gate.c +++ b/algo/x13/phi1612-gate.c @@ -15,7 +15,7 @@ bool register_phi1612_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_phi1612; gate->hash = (void*)&phi1612_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x13/skunk-4way.c b/algo/x13/skunk-4way.c index 81899d0..566f545 100644 --- a/algo/x13/skunk-4way.c +++ b/algo/x13/skunk-4way.c @@ -168,7 +168,7 @@ void skunk_4way_hash( void *output, const void *input ) skunk_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) ); - skein512_4way( &ctx.skein, input, 80 ); + skein512_4way_update( &ctx.skein, input, 80 ); skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x13/x13-4way.c b/algo/x13/x13-4way.c index 40b4b5b..2173f01 100644 --- a/algo/x13/x13-4way.c +++ b/algo/x13/x13-4way.c @@ -17,23 +17,34 @@ #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include 
"algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(X13_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } x13_8way_ctx_holder; x13_8way_ctx_holder x13_8way_ctx; @@ -42,24 +53,30 @@ void init_x13_8way_ctx() { blake512_8way_init( &x13_8way_ctx.blake ); bmw512_8way_init( &x13_8way_ctx.bmw ); - init_groestl( &x13_8way_ctx.groestl, 64 ); skein512_8way_init( &x13_8way_ctx.skein ); jh512_8way_init( &x13_8way_ctx.jh ); keccak512_8way_init( &x13_8way_ctx.keccak ); luffa_4way_init( &x13_8way_ctx.luffa, 512 ); cube_4way_init( &x13_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x13_8way_ctx.shavite ); simd_4way_init( &x13_8way_ctx.simd, 512 ); - init_echo( &x13_8way_ctx.echo, 512 ); hamsi512_8way_init( &x13_8way_ctx.hamsi ); sph_fugue512_init( &x13_8way_ctx.fugue ); +#if defined(__VAES__) + groestl512_4way_init( &x13_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x13_8way_ctx.shavite ); + echo_4way_init( &x13_8way_ctx.echo, 512 ); +#else + init_groestl( &x13_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x13_8way_ctx.shavite ); + init_echo( &x13_8way_ctx.echo, 512 ); +#endif } void x13_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -76,6 +93,19 @@ void x13_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -104,6 +134,9 @@ void x13_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -112,21 +145,27 @@ void x13_8way_hash( void *state, const void *input ) keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 
64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -159,13 +198,27 @@ void x13_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); @@ -193,6 +246,9 @@ void x13_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, @@ -321,11 +377,11 @@ void x13_4way_hash( void *state, const void *input ) memcpy( &ctx, &x13_4way_ctx, sizeof(x13_4way_ctx) ); // 1 Blake - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -344,15 +400,15 @@ void x13_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // Serial @@ -416,7 +472,7 @@ void x13_4way_hash( void *state, const void *input ) // 12 Hamsi parallel 4way 32 bit intrlv_4x64( vhash, 
hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x13/x13-gate.c b/algo/x13/x13-gate.c index 366185c..ad8abdb 100644 --- a/algo/x13/x13-gate.c +++ b/algo/x13/x13-gate.c @@ -15,7 +15,7 @@ bool register_x13_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x13; gate->hash = (void*)&x13hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x13/x13bcd-4way.c b/algo/x13/x13bcd-4way.c index 706ea6f..01fb8a6 100644 --- a/algo/x13/x13bcd-4way.c +++ b/algo/x13/x13bcd-4way.c @@ -1,7 +1,4 @@ #include "x13sm3-gate.h" - -#if defined(X13SM3_4WAY) - #include #include #include @@ -13,12 +10,328 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/sm3/sm3-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif + +#if defined(X13BCD_8WAY) + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + cube_4way_context cube; + simd_4way_context simd; + sm3_8way_ctx_t sm3; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif +} x13bcd_8way_ctx_holder; + +x13bcd_8way_ctx_holder x13bcd_8way_ctx __attribute__ ((aligned (64))); +static __thread blake512_8way_context x13bcd_8way_ctx_mid; + +void init_x13bcd_8way_ctx() +{ + blake512_8way_init( &x13bcd_8way_ctx.blake ); + bmw512_8way_init( &x13bcd_8way_ctx.bmw ); + skein512_8way_init( &x13bcd_8way_ctx.skein ); + jh512_8way_init( &x13bcd_8way_ctx.jh ); + keccak512_8way_init( &x13bcd_8way_ctx.keccak ); + cube_4way_init( &x13bcd_8way_ctx.cube, 512, 16, 32 ); + simd_4way_init( &x13bcd_8way_ctx.simd, 512 ); + sm3_8way_init( &x13bcd_8way_ctx.sm3 ); + hamsi512_8way_init( &x13bcd_8way_ctx.hamsi ); + sph_fugue512_init( &x13bcd_8way_ctx.fugue ); +#if defined(__VAES__) + groestl512_4way_init( &x13bcd_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x13bcd_8way_ctx.shavite ); + echo_4way_init( &x13bcd_8way_ctx.echo, 512 ); +#else + init_groestl( &x13bcd_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x13bcd_8way_ctx.shavite ); + init_echo( &x13bcd_8way_ctx.echo, 512 ); +#endif +}; + +void x13bcd_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhashA[8*8] __attribute__ ((aligned (64))); + uint64_t vhashB[8*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] 
__attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + x13bcd_8way_ctx_holder ctx; + memcpy( &ctx, &x13bcd_8way_ctx, sizeof(x13bcd_8way_ctx) ); + + // Blake + memcpy( &ctx.blake, &x13bcd_8way_ctx_mid, sizeof(x13bcd_8way_ctx_mid) ); + blake512_8way_update( &ctx.blake, input + (64<<3), 16 ); + blake512_8way_close( &ctx.blake, vhash ); + + // Bmw + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); + +#endif + + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + // JH + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + // Keccak + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + // SM3 parallel 32 bit + rintrlv_8x64_8x32( vhashA, vhash, 512 ); + memset( vhash, 0, sizeof vhash ); + sm3_8way_update( &ctx.sm3, vhashA, 64 ); + sm3_8way_close( &ctx.sm3, vhash ); + + rintrlv_8x32_4x128( vhashA, vhashB, vhash, 512 ); + + // Cube + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); 
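/*
 * The serial fallback resets the one-lane SHAvite context between
 * lanes by copying the pre-initialized global holder rather than
 * calling sph_shavite512_init() again -- presumably cheaper than a
 * full re-init. The same pattern, reduced to its generic shape with a
 * stand-in type:
 */
#if 0
typedef struct { unsigned char state[128]; } lane_ctx_t;  /* stand-in */
static lane_ctx_t template_ctx;           /* initialized once at start */

static inline void reset_lane_ctx( lane_ctx_t *c )
{
   memcpy( c, &template_ctx, sizeof *c );  /* restore a fresh state */
}
#endif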
+ memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); + +#endif + + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + // Fugue serial + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, state ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, state+32 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, state+64 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, state+96 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) 
); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, state+128 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, state+160 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, state+192 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, state+224 ); +} + +int scanhash_x13bcd_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; // thr_id arg is deprecated + const uint32_t Htarg = ptarget[7]; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + blake512_8way_init( &x13bcd_8way_ctx_mid ); + blake512_8way_update( &x13bcd_8way_ctx_mid, vdata, 64 ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x13bcd_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( (hash+(i<<3))[7] <= Htarg ) + if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce; + return 0; +} + + +#elif defined(X13BCD_4WAY) typedef struct { blake512_4way_context blake; @@ -68,11 +381,11 @@ void x13bcd_4way_hash( void *state, const void *input ) // Blake memcpy( &ctx.blake, &x13bcd_ctx_mid, sizeof(x13bcd_ctx_mid) ); - blake512_4way( &ctx.blake, input + (64<<2), 16 ); + blake512_4way_update( &ctx.blake, input + (64<<2), 16 ); blake512_4way_close( &ctx.blake, vhash ); // Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -91,15 +404,15 @@ void x13bcd_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // Skein - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -118,7 +431,7 @@ void x13bcd_4way_hash( void *state, const void *input ) uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); memset( sm3_hash3, 0, sizeof sm3_hash3 ); - sm3_4way( &ctx.sm3, vhash, 64 ); + sm3_4way_update( &ctx.sm3, vhash, 64 ); sm3_4way_close( &ctx.sm3, sm3_vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 ); @@ -171,20 +484,23 @@ void x13bcd_4way_hash( void *state, const void *input ) // Hamsi parallel 4x32x2 intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( 
&ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); - memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, + sizeof(sph_fugue512_context) ); sph_fugue512( &ctx.fugue, hash1, 64 ); sph_fugue512_close( &ctx.fugue, hash1 ); - memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, + sizeof(sph_fugue512_context) ); sph_fugue512( &ctx.fugue, hash2, 64 ); sph_fugue512_close( &ctx.fugue, hash2 ); - memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, + sizeof(sph_fugue512_context) ); sph_fugue512( &ctx.fugue, hash3, 64 ); sph_fugue512_close( &ctx.fugue, hash3 ); @@ -203,44 +519,33 @@ int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce, uint32_t *ptarget = work->target; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated + int thr_id = mythr->id; const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; mm256_bswap32_intrlv80_4x64( vdata, pdata ); blake512_4way_init( &x13bcd_ctx_mid ); blake512_4way( &x13bcd_ctx_mid, vdata, 64 ); + do + { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + x13bcd_4way_hash( hash, vdata ); + pdata[19] = n; - x13bcd_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; + for ( int i = 0; i < 4; i++ ) + if ( (hash+(i<<3))[7] <= Htarg ) + if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 4; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x13/x13sm3-4way.c b/algo/x13/x13sm3-4way.c index a107627..9cafa76 100644 --- a/algo/x13/x13sm3-4way.c +++ b/algo/x13/x13sm3-4way.c @@ -71,13 +71,11 @@ void x13sm3_4way_hash( void *state, const void *input ) // Blake memcpy( &ctx.blake, &x13sm3_ctx_mid, sizeof(x13sm3_ctx_mid) ); - blake512_4way( &ctx.blake, input + (64<<2), 16 ); - -// blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input + (64<<2), 16 ); blake512_4way_close( &ctx.blake, vhash ); // Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -96,15 +94,15 @@ void x13sm3_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // Skein - skein512_4way( &ctx.skein, vhash, 64 
); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // Serial to the end @@ -180,13 +178,13 @@ void x13sm3_4way_hash( void *state, const void *input ) uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); memset( sm3_hash3, 0, sizeof sm3_hash3 ); - sm3_4way( &ctx.sm3, vhash, 64 ); + sm3_4way_update( &ctx.sm3, vhash, 64 ); sm3_4way_close( &ctx.sm3, sm3_vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 ); // Hamsi parallel 4x32x2 intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x13/x13sm3-gate.c b/algo/x13/x13sm3-gate.c index bc0fb92..fc2f934 100644 --- a/algo/x13/x13sm3-gate.c +++ b/algo/x13/x13sm3-gate.c @@ -17,7 +17,11 @@ bool register_x13sm3_algo( algo_gate_t* gate ) bool register_x13bcd_algo( algo_gate_t* gate ) { -#if defined (X13SM3_4WAY) +#if defined (X13BCD_8WAY) + init_x13bcd_8way_ctx(); + gate->scanhash = (void*)&scanhash_x13bcd_8way; + gate->hash = (void*)&x13bcd_8way_hash; +#elif defined (X13BCD_4WAY) init_x13bcd_4way_ctx(); gate->scanhash = (void*)&scanhash_x13bcd_4way; gate->hash = (void*)&x13bcd_4way_hash; @@ -26,7 +30,7 @@ bool register_x13bcd_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x13bcd; gate->hash = (void*)&x13bcd_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x13/x13sm3-gate.h b/algo/x13/x13sm3-gate.h index f0047bf..fc6154a 100644 --- a/algo/x13/x13sm3-gate.h +++ b/algo/x13/x13sm3-gate.h @@ -5,13 +5,11 @@ #include #if defined(__AVX2__) && defined(__AES__) - #define X13SM3_4WAY + #define X13SM3_4WAY 1 #endif bool register_x13sm3_algo( algo_gate_t* gate ); -bool register_x13bcd_algo( algo_gate_t* gate ); - #if defined(X13SM3_4WAY) void x13sm3_4way_hash( void *state, const void *input ); @@ -19,18 +17,39 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void init_x13sm3_4way_ctx(); -void x13bcd_4way_hash( void *state, const void *input ); -int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_x13bcd_4way_ctx(); - -#endif +#else void x13sm3_hash( void *state, const void *input ); int scanhash_x13sm3( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void init_x13sm3_ctx(); +#endif + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X13BCD_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X13BCD_4WAY 1 +#endif + +bool register_x13bcd_algo( algo_gate_t* gate ); + +#if defined(X13BCD_8WAY) + +void x13bcd_8way_hash( void *state, const void *input ); +int scanhash_x13bcd_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_x13bcd_8way_ctx(); + +#elif defined(X13BCD_4WAY) + +void x13bcd_4way_hash( void *state, const void *input ); +int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce, + 
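The x13sm3 path above copies a cached Blake midstate (x13sm3_ctx_mid) instead of rehashing the constant first 64 bytes of the 80-byte header for every nonce. A toy model of that idiom, using a throwaway FNV-style incremental hash rather than the real blake512_4way API:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

typedef struct { uint64_t h; } toy_ctx;   /* stand-in for a hash state */

static void toy_init( toy_ctx *c ) { c->h = 0xcbf29ce484222325ull; }
static void toy_update( toy_ctx *c, const void *p, size_t n )
{
   const uint8_t *b = p;
   while ( n-- ) c->h = ( c->h ^ *b++ ) * 0x100000001b3ull;
}

static toy_ctx midstate;                  /* filled once per work unit */

void set_work( const uint8_t header[80] )
{
   toy_init( &midstate );
   toy_update( &midstate, header, 64 );   /* constant 64-byte prefix */
}

uint64_t hash_nonce( const uint8_t tail[16] )
{
   toy_ctx c = midstate;                  /* copy beats rehashing 64 bytes */
   toy_update( &c, tail, 16 );            /* the nonce lives in the tail */
   return c.h;
}

int main(void)
{
   uint8_t header[80] = {0};
   set_work( header );
   printf( "%016llx\n", (unsigned long long)hash_nonce( header + 64 ) );
}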
uint64_t *hashes_done, struct thr_info *mythr ); +void init_x13bcd_4way_ctx(); + +#else + void x13bcd_hash( void *state, const void *input ); int scanhash_x13bcd( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); @@ -38,3 +57,4 @@ void init_x13bcd_ctx(); #endif +#endif diff --git a/algo/x14/polytimos-4way.c b/algo/x14/polytimos-4way.c index 3e1cc69..09f99b1 100644 --- a/algo/x14/polytimos-4way.c +++ b/algo/x14/polytimos-4way.c @@ -34,14 +34,14 @@ void polytimos_4way_hash( void *output, const void *input ) poly_4way_context_overlay ctx; skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, input, 80 ); + skein512_4way_update( &ctx.skein, input, 80 ); skein512_4way_close( &ctx.skein, vhash ); // Need to convert from 64 bit interleaved to 32 bit interleaved. uint32_t vhash32[16*4]; rintrlv_4x64_4x32( vhash32, vhash, 512 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash32, 64 ); + shabal512_4way_update( &ctx.shabal, vhash32, 64 ); shabal512_4way_close( &ctx.shabal, vhash32 ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 ); diff --git a/algo/x14/veltor-4way.c b/algo/x14/veltor-4way.c index 4f35161..1f8ea39 100644 --- a/algo/x14/veltor-4way.c +++ b/algo/x14/veltor-4way.c @@ -38,7 +38,7 @@ void veltor_4way_hash( void *output, const void *input ) veltor_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) ); - skein512_4way( &ctx.skein, input, 80 ); + skein512_4way_update( &ctx.skein, input, 80 ); skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -55,7 +55,7 @@ void veltor_4way_hash( void *output, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c index 9de05d3..70d7c06 100644 --- a/algo/x14/x14-4way.c +++ b/algo/x14/x14-4way.c @@ -19,24 +19,35 @@ #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(X14_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } x14_8way_ctx_holder; x14_8way_ctx_holder x14_8way_ctx __attribute__ ((aligned (64))); @@ -45,25 +56,31 @@ void init_x14_8way_ctx() { blake512_8way_init( &x14_8way_ctx.blake ); bmw512_8way_init( &x14_8way_ctx.bmw ); - init_groestl( &x14_8way_ctx.groestl, 64 ); skein512_8way_init( &x14_8way_ctx.skein ); jh512_8way_init( &x14_8way_ctx.jh ); keccak512_8way_init( &x14_8way_ctx.keccak ); luffa_4way_init( &x14_8way_ctx.luffa, 512 ); cube_4way_init( 
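The header changes above pick the lane width at compile time (AVX-512 gives 8-way, AVX2 plus AES gives 4-way, otherwise serial) and the gate registers the matching entry points. A minimal model of that selection; scan8/scan4/scan1 and the cut-down gate struct are illustrative stand-ins, not the real algo_gate_t:

#include <stdio.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define LANES 8
#elif defined(__AVX2__) && defined(__AES__)
  #define LANES 4
#else
  #define LANES 1
#endif

typedef struct { int (*scanhash)( void ); } gate_sketch_t;

static int scan8( void ) { return 8; }
static int scan4( void ) { return 4; }
static int scan1( void ) { return 1; }

void register_sketch( gate_sketch_t *gate )
{
#if LANES == 8
   gate->scanhash = scan8;
#elif LANES == 4
   gate->scanhash = scan4;
#else
   gate->scanhash = scan1;
#endif
}

int main(void)
{
   gate_sketch_t g;
   register_sketch( &g );
   printf( "compiled lane width: %d\n", g.scanhash() );
}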
&x14_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x14_8way_ctx.shavite ); simd_4way_init( &x14_8way_ctx.simd, 512 ); - init_echo( &x14_8way_ctx.echo, 512 ); hamsi512_8way_init( &x14_8way_ctx.hamsi ); sph_fugue512_init( &x14_8way_ctx.fugue ); shabal512_8way_init( &x14_8way_ctx.shabal ); +#if defined(__VAES__) + groestl512_4way_init( &x14_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x14_8way_ctx.shavite ); + echo_4way_init( &x14_8way_ctx.echo, 512 ); +#else + init_groestl( &x14_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x14_8way_ctx.shavite ); + init_echo( &x14_8way_ctx.echo, 512 ); +#endif }; void x14_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -80,6 +97,19 @@ void x14_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -108,6 +138,9 @@ void x14_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -117,18 +150,26 @@ void x14_8way_hash( void *state, const void *input ) keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); - - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -161,14 +202,28 @@ void x14_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - simd_4way_update_close( &ctx.simd, vhash, 
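On the VAES path above, rintrlv_8x64_4x128() regroups eight 64-bit-interleaved lanes into two buffers of four 128-bit-interleaved lanes for the 4x128 Groestl kernel, and rintrlv_4x128_8x64() undoes it afterwards. A plain-C model of the forward conversion, assuming the usual word-interleaved layouts (word w of lane k at v[w*8 + k]; 128-bit chunk j of lane k at chunk index j*4 + k); the project's own version is SIMD-optimized:

#include <stdint.h>

#define WORDS 8   /* 64-bit words per lane of a 512-bit state */

void rintrlv_8x64_to_2x_4x128( uint64_t *A, uint64_t *B,
                               const uint64_t *v )
{
   for ( int j = 0; j < WORDS / 2; j++ )      /* j counts 128-bit chunks */
      for ( int k = 0; k < 4; k++ )
      {
         /* lanes 0..3 go to A, lanes 4..7 go to B */
         A[ (j*4 + k)*2     ] = v[ (2*j    )*8 + k     ];
         A[ (j*4 + k)*2 + 1 ] = v[ (2*j + 1)*8 + k     ];
         B[ (j*4 + k)*2     ] = v[ (2*j    )*8 + k + 4 ];
         B[ (j*4 + k)*2 + 1 ] = v[ (2*j + 1)*8 + k + 4 ];
      }
}

int main(void)
{
   uint64_t v[64], A[32], B[32];
   for ( int i = 0; i < 64; i++ ) v[i] = i;
   rintrlv_8x64_to_2x_4x128( A, B, v );
   /* lane 0 word 0 lands at A[0]; lane 4 word 0 lands at B[0] */
   return !( A[0] == 0 && B[0] == 4 );
}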
vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); @@ -195,6 +250,9 @@ void x14_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, @@ -325,11 +383,11 @@ void x14_4way_hash( void *state, const void *input ) memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) ); // 1 Blake - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -348,15 +406,15 @@ void x14_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // Serial @@ -420,7 +478,7 @@ void x14_4way_hash( void *state, const void *input ) // 12 Hamsi parallel 4way 32 bit intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x14/x14-gate.c b/algo/x14/x14-gate.c index 851b7c3..d454f79 100644 --- a/algo/x14/x14-gate.c +++ b/algo/x14/x14-gate.c @@ -15,7 +15,7 @@ bool register_x14_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x14; gate->hash = (void*)&x14hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c index a761af0..4af9499 100644 --- a/algo/x15/x15-4way.c +++ b/algo/x15/x15-4way.c @@ -20,26 +20,36 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include 
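The recurring #if defined(__VAES__) blocks keep the pipeline order fixed and swap only the stage kernel: one 4-lane vector call on interleaved data versus four serial calls on deinterleaved lanes. A skeleton of that dispatch with trivial stand-in kernels (the real vector path would use VAES intrinsics rather than a loop):

#include <stdint.h>

static void stage_serial( uint64_t h[8] )            /* one lane */
{
   for ( int i = 0; i < 8; i++ ) h[i] ^= 0x9e3779b97f4a7c15ull;
}

static void stage_vec( uint64_t v[4][8] )            /* four lanes at once */
{
   for ( int k = 0; k < 4; k++ ) stage_serial( v[k] );
}

void pipeline_stage( uint64_t lanes[4][8] )
{
#if defined(__VAES__)
   stage_vec( lanes );              /* one vector call covers all lanes */
#else
   for ( int k = 0; k < 4; k++ )    /* fallback: per-lane serial calls  */
      stage_serial( lanes[k] );
#endif
}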
"algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(X15_8WAY) - typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; sph_whirlpool_context whirlpool; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } x15_8way_ctx_holder; x15_8way_ctx_holder x15_8way_ctx __attribute__ ((aligned (64))); @@ -48,26 +58,32 @@ void init_x15_8way_ctx() { blake512_8way_init( &x15_8way_ctx.blake ); bmw512_8way_init( &x15_8way_ctx.bmw ); - init_groestl( &x15_8way_ctx.groestl, 64 ); skein512_8way_init( &x15_8way_ctx.skein ); jh512_8way_init( &x15_8way_ctx.jh ); keccak512_8way_init( &x15_8way_ctx.keccak ); luffa_4way_init( &x15_8way_ctx.luffa, 512 ); cube_4way_init( &x15_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x15_8way_ctx.shavite ); simd_4way_init( &x15_8way_ctx.simd, 512 ); - init_echo( &x15_8way_ctx.echo, 512 ); hamsi512_8way_init( &x15_8way_ctx.hamsi ); sph_fugue512_init( &x15_8way_ctx.fugue ); shabal512_8way_init( &x15_8way_ctx.shabal ); sph_whirlpool_init( &x15_8way_ctx.whirlpool ); +#if defined(__VAES__) + groestl512_4way_init( &x15_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x15_8way_ctx.shavite ); + echo_4way_init( &x15_8way_ctx.echo, 512 ); +#else + init_groestl( &x15_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x15_8way_ctx.shavite ); + init_echo( &x15_8way_ctx.echo, 512 ); +#endif }; void x15_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -86,10 +102,22 @@ void x15_8way_hash( void *state, const void *input ) // 2 Bmw bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - // 3 Groestl update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); @@ -108,6 +136,9 @@ void x15_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -118,21 +149,27 @@ void x15_8way_hash( void *state, const void *input ) // 6 Keccak keccak512_8way_update( &ctx.keccak, vhash, 64 
); keccak512_8way_close( &ctx.keccak, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); // 9 Shavite sph_shavite512( &ctx.shavite, hash0, 64 ); @@ -166,16 +203,28 @@ void x15_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - // 10 Simd - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); - // 11 Echo +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); @@ -200,10 +249,11 @@ void x15_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); - - // 12 Hamsi parallel 4way 64 bit intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, @@ -374,11 +424,11 @@ void x15_4way_hash( void *state, const void *input ) memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) ); // 1 Blake - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -397,15 
+447,15 @@ void x15_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // Serial to the end @@ -469,7 +519,7 @@ void x15_4way_hash( void *state, const void *input ) // 12 Hamsi parallel 4way 32 bit intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x15/x15-gate.c b/algo/x15/x15-gate.c index c148618..5083a36 100644 --- a/algo/x15/x15-gate.c +++ b/algo/x15/x15-gate.c @@ -15,7 +15,7 @@ bool register_x15_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x15; gate->hash = (void*)&x15hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index d724c78..c4c9dab 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -26,6 +26,11 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sha-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; @@ -36,20 +41,26 @@ union _x16r_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; sph_whirlpool_context whirlpool; sha512_8way_context sha512; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _x16r_8way_context_overlay x16r_8way_context_overlay; @@ -115,31 +126,42 @@ void x16r_8way_hash( void* output, const void* input ) hash7, vhash ); break; case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (const char*)in4, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - 
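For context on the x16r changes that follow: each character of the per-ntime hashOrder string selects the next stage, so the hash function is one big switch executed sixteen times. A toy version with three stages instead of sixteen, and trivial mixing functions in place of the real algorithms:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

enum { ALG_A, ALG_B, ALG_C };

static uint32_t stage_a( uint32_t h ) { return h * 2654435761u; }
static uint32_t stage_b( uint32_t h ) { return h ^ ( h >> 13 ); }
static uint32_t stage_c( uint32_t h ) { return h + 0x9e3779b9u; }

uint32_t chain_hash( uint32_t seed, const char *order )
{
   uint32_t h = seed;
   for ( size_t i = 0; order[i]; i++ )
   {
      switch ( order[i] - '0' )     /* digit selects the stage, as in */
      {                             /* the hashOrder string above     */
         case ALG_A: h = stage_a( h ); break;
         case ALG_B: h = stage_b( h ); break;
         case ALG_C: h = stage_c( h ); break;
      }
   }
   return h;
}

int main(void) { printf( "%08x\n", chain_hash( 1, "0211020" ) ); }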
(const char*)in5, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (const char*)in6, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (const char*)in7, size<<3 ); - break; +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (const char*)in0, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (const char*)in1, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (const char*)in2, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (const char*)in3, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (const char*)in4, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (const char*)in5, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (const char*)in6, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (const char*)in7, size<<3 ); +#endif + break; case SKEIN: skein512_8way_init( &ctx.skein ); if ( i == 0 ) @@ -203,6 +225,16 @@ void x16r_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case SHAVITE: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in0, size ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -227,7 +259,8 @@ void x16r_8way_hash( void* output, const void* input ) sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); - break; +#endif + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); simd_4way_init( &ctx.simd, 512 ); @@ -239,31 +272,42 @@ void x16r_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - 
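The size<<3 seen throughout these cases converts the byte count the scan loop tracks into the bit count the Groestl and Echo APIs expect; the first stage in the chain sees the 80-byte header while later stages see a 64-byte digest, which is also why every call re-inits its context instead of relying on a fixed midstate. Trivially:

#include <stddef.h>

/* size is tracked in bytes; Groestl/Echo take their length in bits */
static inline size_t bytes_to_bits( size_t nbytes ) { return nbytes << 3; }

int main(void)
{
   /* first chain position: 80-byte header = 640 bits;
      every later position: 64-byte digest = 512 bits */
   return !( bytes_to_bits(80) == 640 && bytes_to_bits(64) == 512 );
}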
init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash4, - (const BitSequence*)in4, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash5, - (const BitSequence*)in5, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash6, - (const BitSequence*)in6, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash7, - (const BitSequence*)in7, size<<3 ); - break; +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash4, + (const BitSequence*)in4, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash5, + (const BitSequence*)in5, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash6, + (const BitSequence*)in6, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash7, + (const BitSequence*)in7, size<<3 ); +#endif + break; case HAMSI: intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); @@ -463,11 +507,11 @@ void x16r_4way_hash( void* output, const void* input ) case BLAKE: blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way( &ctx.blake, input, size ); + blake512_4way_update( &ctx.blake, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way( &ctx.blake, vhash, size ); + blake512_4way_update( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -475,11 +519,11 @@ void x16r_4way_hash( void* output, const void* input ) case BMW: bmw512_4way_init( &ctx.bmw ); if ( i == 0 ) - bmw512_4way( &ctx.bmw, input, size ); + bmw512_4way_update( &ctx.bmw, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way( &ctx.bmw, vhash, size ); + bmw512_4way_update( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -501,11 +545,11 @@ void x16r_4way_hash( void* output, const void* input ) case SKEIN: skein512_4way_init( &ctx.skein ); if ( i == 0 ) - skein512_4way( &ctx.skein, input, size ); + skein512_4way_update( &ctx.skein, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way( &ctx.skein, vhash, size ); + skein512_4way_update( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -513,11 +557,11 @@ void x16r_4way_hash( void* output, const void* input ) case JH: 
jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way( &ctx.jh, vhash, size ); + jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -525,11 +569,11 @@ void x16r_4way_hash( void* output, const void* input ) case KECCAK: keccak512_4way_init( &ctx.keccak ); if ( i == 0 ) - keccak512_4way( &ctx.keccak, input, size ); + keccak512_4way_update( &ctx.keccak, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - keccak512_4way( &ctx.keccak, vhash, size ); + keccak512_4way_update( &ctx.keccak, vhash, size ); } keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -599,7 +643,7 @@ void x16r_4way_hash( void* output, const void* input ) case HAMSI: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, size ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; @@ -620,7 +664,7 @@ void x16r_4way_hash( void* output, const void* input ) case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, size ); + shabal512_4way_update( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); break; @@ -641,7 +685,7 @@ void x16r_4way_hash( void* output, const void* input ) case SHA_512: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, size ); + sha512_4way_update( &ctx.sha512, vhash, size ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 031379a..6323589 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -44,7 +44,7 @@ bool register_x16r_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -62,7 +62,7 @@ bool register_x16rv2_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rv2; gate->hash = (void*)&x16rv2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -80,7 +80,7 @@ bool register_x16s_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; @@ -215,7 +215,7 @@ bool register_x16rt_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16rt_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | 
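gate->optimizations is a capability bitmask; this release ORs VAES_OPT into it everywhere a VAES path was added. A sketch with hypothetical flag values (the real constants are defined in the algo-gate headers, not here):

#include <stdio.h>

#define SSE2_OPT   (1u << 0)   /* illustrative values only */
#define AES_OPT    (1u << 1)
#define AVX2_OPT   (1u << 2)
#define AVX512_OPT (1u << 3)
#define VAES_OPT   (1u << 4)   /* new in this release */

int main(void)
{
   unsigned opts = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
   if ( opts & VAES_OPT )
      puts( "algo advertises a VAES-accelerated path" );
   return 0;
}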
VAES_OPT; opt_target_factor = 256.0; return true; }; @@ -232,7 +232,7 @@ bool register_x16rt_veil_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16rt_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->build_extraheader = (void*)&veil_build_extraheader; opt_target_factor = 256.0; return true; @@ -262,17 +262,20 @@ bool register_x21s_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x21s_8way; gate->hash = (void*)&x21s_8way_hash; gate->miner_thread_init = (void*)&x21s_8way_thread_init; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT + | VAES_OPT; #elif defined (X21S_4WAY) gate->scanhash = (void*)&scanhash_x21s_4way; gate->hash = (void*)&x21s_4way_hash; gate->miner_thread_init = (void*)&x21s_4way_thread_init; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #else gate->scanhash = (void*)&scanhash_x21s; gate->hash = (void*)&x21s_hash; gate->miner_thread_init = (void*)&x21s_thread_init; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #endif // gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c index 663f61e..56c7b69 100644 --- a/algo/x16/x16rt-4way.c +++ b/algo/x16/x16rt-4way.c @@ -20,6 +20,11 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sha-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; @@ -30,20 +35,26 @@ union _x16rt_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; sph_whirlpool_context whirlpool; sha512_8way_context sha512; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay; @@ -109,6 +120,16 @@ void x16rt_8way_hash( void* output, const void* input ) hash7, vhash ); break; case GROESTL: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash 
); +#else init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (const char*)in0, size<<3 ); @@ -133,7 +154,8 @@ void x16rt_8way_hash( void* output, const void* input ) init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (const char*)in7, size<<3 ); - break; +#endif + break; case SKEIN: skein512_8way_init( &ctx.skein ); if ( i == 0 ) @@ -197,6 +219,16 @@ void x16rt_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case SHAVITE: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in0, size ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -221,7 +253,8 @@ void x16rt_8way_hash( void* output, const void* input ) sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); - break; +#endif + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); simd_4way_init( &ctx.simd, 512 ); @@ -233,6 +266,16 @@ void x16rt_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else init_echo( &ctx.echo, 512 ); update_final_echo ( &ctx.echo, (BitSequence *)hash0, (const BitSequence*)in0, size<<3 ); @@ -257,7 +300,8 @@ void x16rt_8way_hash( void* output, const void* input ) init_echo( &ctx.echo, 512 ); update_final_echo ( &ctx.echo, (BitSequence *)hash7, (const BitSequence*)in7, size<<3 ); - break; +#endif + break; case HAMSI: intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); @@ -458,11 +502,11 @@ void x16rt_4way_hash( void* output, const void* input ) case BLAKE: blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way( &ctx.blake, input, size ); + blake512_4way_update( &ctx.blake, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way( &ctx.blake, vhash, size ); + blake512_4way_update( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -470,11 +514,11 @@ void x16rt_4way_hash( void* output, const void* input ) case BMW: bmw512_4way_init( &ctx.bmw ); if ( i == 0 ) - bmw512_4way( &ctx.bmw, input, size ); + bmw512_4way_update( &ctx.bmw, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way( &ctx.bmw, vhash, size ); + bmw512_4way_update( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -496,11 +540,11 @@ void x16rt_4way_hash( void* output, const void* input ) case SKEIN: skein512_4way_init( 
&ctx.skein ); if ( i == 0 ) - skein512_4way( &ctx.skein, input, size ); + skein512_4way_update( &ctx.skein, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way( &ctx.skein, vhash, size ); + skein512_4way_update( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -508,11 +552,11 @@ void x16rt_4way_hash( void* output, const void* input ) case JH: jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way( &ctx.jh, vhash, size ); + jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -520,11 +564,11 @@ void x16rt_4way_hash( void* output, const void* input ) case KECCAK: keccak512_4way_init( &ctx.keccak ); if ( i == 0 ) - keccak512_4way( &ctx.keccak, input, size ); + keccak512_4way_update( &ctx.keccak, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - keccak512_4way( &ctx.keccak, vhash, size ); + keccak512_4way_update( &ctx.keccak, vhash, size ); } keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -594,7 +638,7 @@ void x16rt_4way_hash( void* output, const void* input ) case HAMSI: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, size ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -615,7 +659,7 @@ void x16rt_4way_hash( void* output, const void* input ) case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, size ); + shabal512_4way_update( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -636,7 +680,7 @@ void x16rt_4way_hash( void* output, const void* input ) case SHA_512: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, size ); + sha512_4way_update( &ctx.sha512, vhash, size ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index 7406138..f945133 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -27,6 +27,11 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sha-hash-4way.h" #include "algo/tiger/sph_tiger.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; @@ -37,21 +42,30 @@ union _x16rv2_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; +// hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; +// sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; +// hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; 
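The _x16rv2_8way_context_overlay above is a union on purpose: x16r-family stages run strictly one at a time, so every stage context can share the same storage, and the commented-out duplicate members cost nothing. A minimal demonstration of the space saving, with made-up context sizes:

#include <stdio.h>

typedef struct { unsigned long long s[25]; } keccak_like;  /* large  */
typedef struct { unsigned int s[16];       } shabal_like;  /* small  */

union ctx_overlay
{
   keccak_like keccak;
   shabal_like shabal;   /* shares storage with keccak */
};

int main(void)
{
   printf( "separate: %zu bytes, overlaid: %zu bytes\n",
           sizeof(keccak_like) + sizeof(shabal_like),
           sizeof(union ctx_overlay) );
   return 0;
}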
sph_whirlpool_context whirlpool; sha512_8way_context sha512; sph_tiger_context tiger; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay; @@ -117,6 +131,16 @@ void x16rv2_8way_hash( void* output, const void* input ) hash7, vhash ); break; case GROESTL: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (const char*)in0, size<<3 ); @@ -141,7 +165,8 @@ void x16rv2_8way_hash( void* output, const void* input ) init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (const char*)in7, size<<3 ); - break; +#endif + break; case SKEIN: skein512_8way_init( &ctx.skein ); if ( i == 0 ) @@ -258,6 +283,16 @@ void x16rv2_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case SHAVITE: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in0, size ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -282,7 +317,8 @@ void x16rv2_8way_hash( void* output, const void* input ) sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); - break; +#endif + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); simd_4way_init( &ctx.simd, 512 ); @@ -294,6 +330,16 @@ void x16rv2_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else init_echo( &ctx.echo, 512 ); update_final_echo ( &ctx.echo, (BitSequence *)hash0, (const BitSequence*)in0, size<<3 ); @@ -318,7 +364,8 @@ void x16rv2_8way_hash( void* output, const void* input ) init_echo( &ctx.echo, 512 ); update_final_echo ( &ctx.echo, (BitSequence *)hash7, (const BitSequence*)in7, size<<3 ); - break; +#endif + break; case HAMSI: intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); @@ -553,11 +600,11 @@ void x16rv2_4way_hash( void* output, 
const void* input ) case BLAKE: blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way( &ctx.blake, input, size ); + blake512_4way_update( &ctx.blake, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way( &ctx.blake, vhash, size ); + blake512_4way_update( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -565,11 +612,11 @@ void x16rv2_4way_hash( void* output, const void* input ) case BMW: bmw512_4way_init( &ctx.bmw ); if ( i == 0 ) - bmw512_4way( &ctx.bmw, input, size ); + bmw512_4way_update( &ctx.bmw, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way( &ctx.bmw, vhash, size ); + bmw512_4way_update( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -591,11 +638,11 @@ void x16rv2_4way_hash( void* output, const void* input ) case SKEIN: skein512_4way_init( &ctx.skein ); if ( i == 0 ) - skein512_4way( &ctx.skein, input, size ); + skein512_4way_update( &ctx.skein, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way( &ctx.skein, vhash, size ); + skein512_4way_update( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -603,11 +650,11 @@ void x16rv2_4way_hash( void* output, const void* input ) case JH: jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way( &ctx.jh, vhash, size ); + jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -631,7 +678,7 @@ void x16rv2_4way_hash( void* output, const void* input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -721,7 +768,7 @@ void x16rv2_4way_hash( void* output, const void* input ) case HAMSI: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, size ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -742,7 +789,7 @@ void x16rv2_4way_hash( void* output, const void* input ) case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, size ); + shabal512_4way_update( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -779,7 +826,7 @@ void x16rv2_4way_hash( void* output, const void* input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 7d98a00..69773d7 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -30,6 +30,11 @@ #include "algo/tiger/sph_tiger.h" #include "algo/gost/sph_gost.h" #include 
"algo/lyra2/lyra2.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(__SHA__) #include #endif @@ -45,15 +50,12 @@ union _x21s_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; @@ -63,6 +65,15 @@ union _x21s_8way_context_overlay sph_tiger_context tiger; sph_gost512_context gost; sha256_8way_context sha256; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _x21s_8way_context_overlay x21s_8way_context_overlay; @@ -128,31 +139,42 @@ void x21s_8way_hash( void* output, const void* input ) hash7, vhash ); break; case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (const char*)in4, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - (const char*)in5, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (const char*)in6, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (const char*)in7, size<<3 ); - break; +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (const char*)in0, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (const char*)in1, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (const char*)in2, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (const char*)in3, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (const char*)in4, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (const char*)in5, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (const 
char*)in6, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (const char*)in7, size<<3 ); +#endif + break; case SKEIN: skein512_8way_init( &ctx.skein ); if ( i == 0 ) @@ -216,6 +238,16 @@ void x21s_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case SHAVITE: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in0, size ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -240,7 +272,8 @@ void x21s_8way_hash( void* output, const void* input ) sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); - break; +#endif + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); simd_4way_init( &ctx.simd, 512 ); @@ -252,31 +285,43 @@ void x21s_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash4, - (const BitSequence*)in4, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash5, - (const BitSequence*)in5, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash6, - (const BitSequence*)in6, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash7, - (const BitSequence*)in7, size<<3 ); - break; + +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash4, + (const BitSequence*)in4, size<<3 ); + init_echo( &ctx.echo, 512 ); + 
update_final_echo ( &ctx.echo, (BitSequence *)hash5, + (const BitSequence*)in5, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash6, + (const BitSequence*)in6, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash7, + (const BitSequence*)in7, size<<3 ); +#endif + break; case HAMSI: intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); @@ -578,11 +623,11 @@ void x21s_4way_hash( void* output, const void* input ) case BLAKE: blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way( &ctx.blake, input, size ); + blake512_4way_update( &ctx.blake, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way( &ctx.blake, vhash, size ); + blake512_4way_update( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -590,11 +635,11 @@ void x21s_4way_hash( void* output, const void* input ) case BMW: bmw512_4way_init( &ctx.bmw ); if ( i == 0 ) - bmw512_4way( &ctx.bmw, input, size ); + bmw512_4way_update( &ctx.bmw, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way( &ctx.bmw, vhash, size ); + bmw512_4way_update( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -616,11 +661,11 @@ void x21s_4way_hash( void* output, const void* input ) case SKEIN: skein512_4way_init( &ctx.skein ); if ( i == 0 ) - skein512_4way( &ctx.skein, input, size ); + skein512_4way_update( &ctx.skein, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way( &ctx.skein, vhash, size ); + skein512_4way_update( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -628,11 +673,11 @@ void x21s_4way_hash( void* output, const void* input ) case JH: jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way( &ctx.jh, vhash, size ); + jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -640,11 +685,11 @@ void x21s_4way_hash( void* output, const void* input ) case KECCAK: keccak512_4way_init( &ctx.keccak ); if ( i == 0 ) - keccak512_4way( &ctx.keccak, input, size ); + keccak512_4way_update( &ctx.keccak, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - keccak512_4way( &ctx.keccak, vhash, size ); + keccak512_4way_update( &ctx.keccak, vhash, size ); } keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -714,7 +759,7 @@ void x21s_4way_hash( void* output, const void* input ) case HAMSI: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, size ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -735,7 +780,7 @@ void x21s_4way_hash( void* output, const void* input ) case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, size ); + shabal512_4way_update( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 
); break; @@ -756,7 +801,7 @@ void x21s_4way_hash( void* output, const void* input ) case SHA_512: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, size ); + sha512_4way_update( &ctx.sha512, vhash, size ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -767,7 +812,7 @@ void x21s_4way_hash( void* output, const void* input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhash, 64 ); + haval256_5_4way_update( &ctx.haval, vhash, 64 ); haval256_5_4way_close( &ctx.haval, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -831,7 +876,7 @@ void x21s_4way_hash( void* output, const void* input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); sha256_4way_init( &ctx.sha256 ); - sha256_4way( &ctx.sha256, vhash, 64 ); + sha256_4way_update( &ctx.sha256, vhash, 64 ); sha256_4way_close( &ctx.sha256, vhash ); dintrlv_4x32( output, output+32, output+64,output+96, vhash, 256 ); diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index 3a0b248..f28da43 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -21,6 +21,11 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" #include "algo/sha/sha-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(SONOA_8WAY) @@ -28,21 +33,27 @@ union _sonoa_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; sph_whirlpool_context whirlpool; sha512_8way_context sha512; haval256_5_8way_context haval; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _sonoa_8way_context_overlay sonoa_8way_context_overlay; @@ -72,6 +83,19 @@ void sonoa_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -95,6 +119,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -119,6 +145,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( 
&ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -150,11 +185,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -182,16 +230,31 @@ void sonoa_8way_hash( void *state, const void *input ) init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); - -// 2 intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + +// 2 + bmw512_8way_init( &ctx.bmw ); bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -215,6 +278,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -239,6 +304,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -270,11 +344,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, 
hash7, vhashB ); @@ -306,6 +393,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -316,6 +405,19 @@ void sonoa_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -339,6 +441,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -363,6 +467,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -394,11 +507,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -430,6 +556,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -471,6 +599,19 @@ void sonoa_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -494,6 +635,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, 
hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -518,6 +661,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -549,11 +701,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -585,6 +750,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -630,6 +797,17 @@ void sonoa_8way_hash( void *state, const void *input ) hamsi512_8way_update( &ctx.hamsi, vhashA, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -658,6 +836,18 @@ void sonoa_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); +#endif + +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, hash0, 64 ); @@ -684,11 +874,13 @@ void sonoa_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); -// 5 - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + +// 5 + bmw512_8way_init( &ctx.bmw ); bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); @@ -699,6 +891,19 @@ void sonoa_8way_hash( void *state, const void *input ) shabal512_8way_update( &ctx.shabal, vhashA, 64 ); shabal512_8way_close( &ctx.shabal, vhash ); +#if defined(__VAES__) + + rintrlv_8x32_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + 
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -722,6 +927,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -746,6 +953,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -777,14 +993,27 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); @@ -813,6 +1042,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -889,6 +1120,19 @@ void sonoa_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -912,6 +1156,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -936,6 +1182,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 
64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -967,11 +1222,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -1003,6 +1271,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -1114,6 +1384,19 @@ void sonoa_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -1137,6 +1420,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -1161,6 +1446,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -1192,11 +1486,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -1228,6 
+1535,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -1319,7 +1628,7 @@ int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce, uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 8; + const uint32_t last_nonce = max_nonce - 8; __m512i *noncev = (__m512i*)vdata + 9; // aligned uint32_t n = first_nonce; const int thr_id = mythr->id; @@ -1350,8 +1659,6 @@ int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce, return 0; } - - #elif defined(SONOA_4WAY) union _sonoa_4way_context_overlay @@ -1391,11 +1698,11 @@ void sonoa_4way_hash( void *state, const void *input ) // 1 blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1412,15 +1719,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1466,7 +1773,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1483,15 +1790,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1535,13 +1842,13 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); // 3 bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1558,15 +1865,15 @@ void sonoa_4way_hash( void *state, const void *input ) 
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1610,7 +1917,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1632,7 +1939,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1649,15 +1956,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1701,7 +2008,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1722,13 +2029,13 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); rintrlv_4x32_4x64( vhashB, vhash, 512 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhashB, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhashB, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1758,13 +2065,13 @@ void sonoa_4way_hash( void *state, const void *input ) rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); rintrlv_4x64_4x32( vhashB, vhash, 512 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhashB, 64 ); + shabal512_4way_update( &ctx.shabal, vhashB, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -1781,15 +2088,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, 
hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1833,7 +2140,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1854,7 +2161,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -1877,7 +2184,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1894,15 +2201,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1946,7 +2253,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1967,7 +2274,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -1988,7 +2295,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -2011,7 +2318,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( 
&ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -2028,15 +2335,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -2080,7 +2387,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -2101,7 +2408,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -2122,13 +2429,13 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); rintrlv_4x64_4x32( vhashB, vhash, 512 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashB, 64 ); + haval256_5_4way_update( &ctx.haval, vhashB, 64 ); haval256_5_4way_close( &ctx.haval, state ); } diff --git a/algo/x17/sonoa-gate.c b/algo/x17/sonoa-gate.c index 3687733..7dce68f 100644 --- a/algo/x17/sonoa-gate.c +++ b/algo/x17/sonoa-gate.c @@ -13,7 +13,7 @@ bool register_sonoa_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_sonoa; gate->hash = (void*)&sonoa_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 18eed41..83d4712 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -13,6 +13,11 @@ #include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/shavite/shavite-hash-2way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -28,15 +33,21 @@ union _x17_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; sph_shavite512_context shavite; - simd_4way_context simd; hashState_echo echo; +#endif + 
simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; @@ -49,8 +60,8 @@ typedef union _x17_8way_context_overlay x17_8way_context_overlay; void x17_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[8*8] __attribute__ ((aligned (64))); - uint64_t vhash1[8*8] __attribute__ ((aligned (64))); + uint64_t vhashA[8*8] __attribute__ ((aligned (64))); + uint64_t vhashB[8*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -61,7 +72,7 @@ void x17_8way_hash( void *state, const void *input ) uint64_t hash7[8] __attribute__ ((aligned (64))); x17_8way_context_overlay ctx; - // 1 Blake parallel 4 way 64 bit + // 1 Blake blake512_8way_init( &ctx.blake ); blake512_8way_update( &ctx.blake, input, 80 ); blake512_8way_close( &ctx.blake, vhash ); @@ -71,11 +82,24 @@ void x17_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); - // Serialize + // 3 Groestl + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - // 3 Groestl init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); init_groestl( &ctx.groestl, 64 ); @@ -93,10 +117,11 @@ void x17_8way_hash( void *state, const void *input ) init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - // Parallellize intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + // 4 Skein parallel 4 way 64 bit skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); @@ -112,24 +137,34 @@ void x17_8way_hash( void *state, const void *input ) keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); // 7 Luffa luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); // 8 Cubehash cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); - - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); // 9 Shavite + +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, 
hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -155,18 +190,33 @@ void x17_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + +#endif + // 10 Simd - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + // 11 Echo + +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - // 11 Echo serial init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); @@ -192,9 +242,11 @@ void x17_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); - // 12 Hamsi parallel 4 way 64 bit intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + + // 12 Hamsi hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); @@ -229,7 +281,7 @@ void x17_8way_hash( void *state, const void *input ) sph_fugue512( &ctx.fugue, hash7, 64 ); sph_fugue512_close( &ctx.fugue, hash7 ); - // 14 Shabal, parallel 4 way 32 bit + // 14 Shabal, parallel 8 way 32 bit intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -275,10 +327,10 @@ void x17_8way_hash( void *state, const void *input ) sha512_8way_close( &ctx.sha512, vhash ); // 17 Haval parallel 32 bit - rintrlv_8x64_8x32( vhash0, vhash, 512 ); + rintrlv_8x64_8x32( vhashA, vhash, 512 ); haval256_5_8way_init( &ctx.haval ); - haval256_5_8way_update( &ctx.haval, vhash0, 64 ); + haval256_5_8way_update( &ctx.haval, vhashA, 64 ); haval256_5_8way_close( &ctx.haval, state ); } @@ -292,7 +344,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce, uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 8; + const uint32_t last_nonce = max_nonce - 8; __m512i *noncev = (__m512i*)vdata + 9; // aligned uint32_t n = first_nonce; const int thr_id = mythr->id; @@ -349,23 +401,23 @@ typedef union _x17_4way_context_overlay x17_4way_context_overlay; void x17_4way_hash( void *state, const void *input ) { + uint64_t vhash[8*4] __attribute__ ((aligned (128))); + uint64_t vhashA[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*4] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned 
(64))); uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhashA[8*4] __attribute__ ((aligned (64))); - uint64_t vhashB[8*4] __attribute__ ((aligned (64))); x17_4way_context_overlay ctx; // 1 Blake parallel 4 way 64 bit blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serialize @@ -386,17 +438,17 @@ void x17_4way_hash( void *state, const void *input ) // 4 Skein parallel 4 way 64 bit skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // 5 JH jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 6 Keccak keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // 7 Luffa parallel 2 way 128 bit @@ -428,7 +480,6 @@ void x17_4way_hash( void *state, const void *input ) dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); - // 11 Echo serial init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -447,7 +498,7 @@ void x17_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -470,7 +521,7 @@ void x17_4way_hash( void *state, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -493,27 +544,28 @@ void x17_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); // 17 Haval parallel 32 bit rintrlv_4x64_4x32( vhashB, vhash, 512 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashB, 64 ); + haval256_5_4way_update( &ctx.haval, vhashB, 64 ); haval256_5_4way_close( &ctx.haval, state ); } int scanhash_x17_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*16] __attribute__ ((aligned (64))); + uint32_t hash[16*4] __attribute__ ((aligned (128))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t *hash7 = &(hash[7<<2]); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce -4; __m256i *noncev = (__m256i*)vdata + 9; // aligned uint32_t n = first_nonce; const int thr_id = mythr->id; @@ -537,9 +589,9 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce, } } n += 4; - } 
while ( likely( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ) ); + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); - *hashes_done = n - first_nonce + 1; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c index 73ce607..0bad7a2 100644 --- a/algo/x17/x17-gate.c +++ b/algo/x17/x17-gate.c @@ -12,7 +12,7 @@ bool register_x17_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x17; gate->hash = (void*)&x17_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 28bc1c2..ced4a31 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -22,6 +22,11 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sha-hash-4way.h" #include "algo/haval/haval-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(XEVAN_8WAY) @@ -29,13 +34,11 @@ union _xevan_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; hashState_echo echo; hamsi512_8way_context hamsi; @@ -44,6 +47,15 @@ union _xevan_8way_context_overlay sph_whirlpool_context whirlpool; sha512_8way_context sha512; haval256_5_8way_context haval; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; +// echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; +// hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _xevan_8way_context_overlay xevan_8way_context_overlay; @@ -72,6 +84,19 @@ void xevan_8way_hash( void *output, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, dataLen ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, dataLen<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, dataLen<<3 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); @@ -103,6 +128,8 @@ void xevan_8way_hash( void *output, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, dataLen ); skein512_8way_close( &ctx.skein, vhash ); @@ -127,6 +154,15 @@ void xevan_8way_hash( void *output, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, dataLen ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, dataLen ); + +#else + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); @@ -158,11 +194,26 @@ void 
xevan_8way_hash( void *output, const void *input ) intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); +/* +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, dataLen<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, dataLen<<3 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else +*/ + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); @@ -194,6 +245,8 @@ void xevan_8way_hash( void *output, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); +//#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -286,6 +339,19 @@ void xevan_8way_hash( void *output, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, dataLen ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, dataLen<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, dataLen<<3 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); @@ -317,6 +383,8 @@ void xevan_8way_hash( void *output, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, dataLen ); skein512_8way_close( &ctx.skein, vhash ); @@ -341,6 +409,15 @@ void xevan_8way_hash( void *output, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, dataLen ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, dataLen ); + +#else + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); @@ -372,11 +449,27 @@ void xevan_8way_hash( void *output, const void *input ) intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); + +/* +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, dataLen<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, dataLen<<3 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else +*/ + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); @@ -408,6 +501,8 @@ void 
xevan_8way_hash( void *output, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); +//#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -569,12 +664,12 @@ void xevan_4way_hash( void *output, const void *input ) // parallel 4 way blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close(&ctx.blake, vhash); memset( &vhash[8<<2], 0, 64<<2 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, dataLen ); + bmw512_4way_update( &ctx.bmw, vhash, dataLen ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -597,15 +692,15 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, dataLen ); + skein512_4way_update( &ctx.skein, vhash, dataLen ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, dataLen ); + jh512_4way_update( &ctx.jh, vhash, dataLen ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, dataLen ); + keccak512_4way_update( &ctx.keccak, vhash, dataLen ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); @@ -649,7 +744,7 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, dataLen ); + hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -671,7 +766,7 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, dataLen ); + shabal512_4way_update( &ctx.shabal, vhash, dataLen ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -693,13 +788,13 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, dataLen ); + sha512_4way_update( &ctx.sha512, vhash, dataLen ); sha512_4way_close( &ctx.sha512, vhash ); rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashA, dataLen ); + haval256_5_4way_update( &ctx.haval, vhashA, dataLen ); haval256_5_4way_close( &ctx.haval, vhashA ); rintrlv_4x32_4x64( vhash, vhashA, dataLen<<3 ); @@ -707,11 +802,11 @@ void xevan_4way_hash( void *output, const void *input ) memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 ); blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, dataLen ); + blake512_4way_update( &ctx.blake, vhash, dataLen ); blake512_4way_close(&ctx.blake, vhash); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, dataLen ); + bmw512_4way_update( &ctx.bmw, vhash, dataLen ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -732,15 +827,15 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 
dataLen ); + skein512_4way_update( &ctx.skein, vhash, dataLen ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, dataLen ); + jh512_4way_update( &ctx.jh, vhash, dataLen ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, dataLen ); + keccak512_4way_update( &ctx.keccak, vhash, dataLen ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); @@ -784,7 +879,7 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, dataLen ); + hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -805,7 +900,7 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, dataLen ); + shabal512_4way_update( &ctx.shabal, vhash, dataLen ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -826,13 +921,13 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, dataLen ); + sha512_4way_update( &ctx.sha512, vhash, dataLen ); sha512_4way_close( &ctx.sha512, vhash ); rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashA, dataLen ); + haval256_5_4way_update( &ctx.haval, vhashA, dataLen ); haval256_5_4way_close( &ctx.haval, output ); } diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c index 8cb86a4..d3e65f7 100644 --- a/algo/x17/xevan-gate.c +++ b/algo/x17/xevan-gate.c @@ -13,7 +13,7 @@ bool register_xevan_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_xevan; gate->hash = (void*)&xevan_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; opt_target_factor = 256.0; return true; }; diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c index 0d28285..5d912b0 100644 --- a/algo/x22/x22i-4way.c +++ b/algo/x22/x22i-4way.c @@ -22,6 +22,11 @@ #include "algo/lyra2/lyra2.h" #include "algo/gost/sph_gost.h" #include "algo/swifftx/swifftx.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(X22I_8WAY) @@ -30,14 +35,11 @@ union _x22i_8way_ctx_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; - hashState_echo echo; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; @@ -48,6 +50,15 @@ union _x22i_8way_ctx_overlay sph_tiger_context tiger; sph_gost512_context gost; sha256_8way_context sha256; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif }; typedef union _x22i_8way_ctx_overlay x22i_8way_ctx_overlay; @@ -84,6 
+95,19 @@ void x22i_8way_hash( void *output, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -115,6 +139,8 @@ void x22i_8way_hash( void *output, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -139,6 +165,15 @@ void x22i_8way_hash( void *output, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -170,11 +205,24 @@ void x22i_8way_hash( void *output, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -206,6 +254,8 @@ void x22i_8way_hash( void *output, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -439,10 +489,8 @@ int scanhash_x22i_8way( struct work* work, uint32_t max_nonce, return 0; } - #elif defined(X22I_4WAY) - union _x22i_4way_ctx_overlay { blake512_4way_context blake; @@ -477,8 +525,6 @@ void x22i_4way_hash( void *output, const void *input ) uint64_t vhash[8*4] __attribute__ ((aligned (64))); uint64_t vhashA[8*4] __attribute__ ((aligned (64))); uint64_t vhashB[8*4] __attribute__ ((aligned (64))); - -// unsigned char hash[64 * 4] __attribute__((aligned(64))) = {0}; unsigned char hashA0[64] __attribute__((aligned(64))) = {0}; unsigned char hashA1[64] __attribute__((aligned(32))) = {0}; unsigned char hashA2[64] __attribute__((aligned(32))) = {0}; @@ -486,13 +532,12 @@ void x22i_4way_hash( void *output, const void *input ) x22i_ctx_overlay ctx; blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); 
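// A note on the pervasive renames in this patch: the bare context-named
// update call gains an explicit _update suffix. A minimal sketch of the
// resulting init/update/close convention, with the sizes used here:
//
//    bmw512_4way_context c;
//    bmw512_4way_init( &c );                 // reset state
//    bmw512_4way_update( &c, vhash, 64 );    // absorb 64 bytes per lane
//    bmw512_4way_close( &c, vhash );         // finalize interleaved digest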
bmw512_4way_close( &ctx.bmw, vhash ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); init_groestl( &ctx.groestl, 64 ); @@ -511,15 +556,15 @@ void x22i_4way_hash( void *output, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -560,13 +605,11 @@ void x22i_4way_hash( void *output, const void *input ) update_final_echo ( &ctx.echo, (BitSequence*)hash3, (const BitSequence*)hash3, 512 ); - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); sph_fugue512_init( &ctx.fugue ); @@ -585,9 +628,8 @@ void x22i_4way_hash( void *output, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8], vhash ); sph_whirlpool_init( &ctx.whirlpool ); @@ -606,12 +648,10 @@ void x22i_4way_hash( void *output, const void *input ) intrlv_4x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16] ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); - dintrlv_4x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24], vhash ); -// InitializeSWIFFTX(); ComputeSingleSWIFFTX((unsigned char*)hash0, (unsigned char*)hashA0); ComputeSingleSWIFFTX((unsigned char*)hash1, (unsigned char*)hashA1); ComputeSingleSWIFFTX((unsigned char*)hash2, (unsigned char*)hashA2); @@ -622,9 +662,8 @@ void x22i_4way_hash( void *output, const void *input ) memset( vhash, 0, 64*4 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashA, 64 ); + haval256_5_4way_update( &ctx.haval, vhashA, 64 ); haval256_5_4way_close( &ctx.haval, vhash ); - dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); memset( hashA0, 0, 64 ); @@ -675,10 +714,8 @@ void x22i_4way_hash( void *output, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); sha256_4way_init( &ctx.sha256 ); - sha256_4way( &ctx.sha256, vhash, 64 ); + sha256_4way_update( &ctx.sha256, vhash, 64 ); sha256_4way_close( &ctx.sha256, output ); - -// memcpy(output, hash, 32); } diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c index 893a0e3..78f23b4 100644 --- a/algo/x22/x22i-gate.c +++ b/algo/x22/x22i-gate.c @@ -1,19 +1,26 @@ #include "x22i-gate.h" +// Ryzen has poor AVX2 performance so use SHA over AVX2. +// Intel has AVX512 so use AVX512 over SHA. +// When Ryzen AVX2 improves use AVX2 over SHA. 
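// As a sketch of how the comment above plays out at compile time: the
// X22I_8WAY/X22I_4WAY macros come from x22i-gate.h, and the predicate is
// assumed here to mirror the X25X gating changed later in this patch:
//
//    #if defined(__AVX512F__) && defined(__AVX512VL__) && \
//        defined(__AVX512DQ__) && defined(__AVX512BW__)
//       #define X22I_8WAY 1   // Intel AVX512 path: SHA_OPT not advertised
//    #elif defined(__AVX2__) && defined(__AES__)
//       #define X22I_4WAY 1   // Ryzen AVX2 path: SHA_OPT retained
//    #endif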
+ bool register_x22i_algo( algo_gate_t* gate ) { #if defined (X22I_8WAY) gate->scanhash = (void*)&scanhash_x22i_8way; gate->hash = (void*)&x22i_8way_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT + | AVX512_OPT | VAES_OPT; #elif defined (X22I_4WAY) gate->scanhash = (void*)&scanhash_x22i_4way; gate->hash = (void*)&x22i_4way_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #else gate->scanhash = (void*)&scanhash_x22i; gate->hash = (void*)&x22i_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #endif return true; }; @@ -23,17 +30,20 @@ bool register_x25x_algo( algo_gate_t* gate ) #if defined (X25X_8WAY) gate->scanhash = (void*)&scanhash_x25x_8way; gate->hash = (void*)&x25x_8way_hash; -// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT + | AVX512_OPT | VAES_OPT; #elif defined (X25X_4WAY) gate->scanhash = (void*)&scanhash_x25x_4way; gate->hash = (void*)&x25x_4way_hash; -// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #else gate->scanhash = (void*)&scanhash_x25x; gate->hash = (void*)&x25x_hash; -// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT; +// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT; return true; }; diff --git a/algo/x22/x22i-gate.h b/algo/x22/x22i-gate.h index 1dbb305..a03079f 100644 --- a/algo/x22/x22i-gate.h +++ b/algo/x22/x22i-gate.h @@ -34,13 +34,9 @@ int scanhash_x22i( struct work *work, uint32_t max_nonce, #endif - -// Big problems with x25x 8 way. It blows up just by increasing the -// buffer sizes and nothing else. It may have to do with accessing 2 dim arrays. 
- -//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -// #define X25X_8WAY 1 -#if defined(__AVX2__) && defined(__AES__) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X25X_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) #define X25X_4WAY 1 #endif diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index df8f312..fbbb1fd 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -1,7 +1,4 @@ #include "x22i-gate.h" - -#if defined(X25X_4WAY) - #include "algo/blake/blake-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/skein/skein-hash-4way.h" @@ -16,8 +13,11 @@ #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/fugue/sph_fugue.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/tiger/sph_tiger.h" @@ -26,33 +26,11 @@ #include "algo/swifftx/swifftx.h" #include "algo/panama/sph_panama.h" #include "algo/lanehash/lane.h" - -union _x25x_4way_ctx_overlay -{ - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - hashState_echo echo; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; - shabal512_4way_context shabal; - sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; - sph_tiger_context tiger; - sph_gost512_context gost; - sha256_4way_context sha256; - sph_panama_context panama; - blake2s_4way_state blake2s; -}; -typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay; +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif void x25x_shuffle( void *hash ) { @@ -81,28 +59,587 @@ void x25x_shuffle( void *hash ) #undef X25X_SHUFFLE_ROUNDS } -void x25x_4way_hash( void *output, const void *input ) +#if defined(X25X_8WAY) + +union _x25x_8way_ctx_overlay { + blake512_8way_context blake; + bmw512_8way_context bmw; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + simd_4way_context simd; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + haval256_5_8way_context haval; + sph_tiger_context tiger; + sph_gost512_context gost; + sha256_8way_context sha256; + sph_panama_context panama; + blake2s_8way_state blake2s; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif +}; +typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay; + +void x25x_8way_hash( void *output, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + unsigned char hash0[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash1[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash2[25][64] __attribute__((aligned(64))) = {0}; + 
unsigned char hash3[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash4[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash5[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash6[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash7[25][64] __attribute__((aligned(64))) = {0}; + unsigned char vhashX[24][64*8] __attribute__ ((aligned (64))); + uint64_t vhashA[8*8] __attribute__ ((aligned (64))); + uint64_t vhashB[8*8] __attribute__ ((aligned (64))); + x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64))); + + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0], + hash4[0], hash5[0], hash6[0], hash7[0], vhash ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0[1], hash1[1], hash2[1], hash3[1], + hash4[1], hash5[1], hash6[1], hash7[1], vhash ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + dintrlv_4x128_512( hash0[2], hash1[2], hash2[2], hash3[2], vhashA ); + dintrlv_4x128_512( hash4[2], hash5[2], hash6[2], hash7[2], vhashB ); + + intrlv_8x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2], + hash4[2], hash5[2], hash6[2], hash7[2] ); + +#else + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0[2], + (const char*)hash0[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1[2], + (const char*)hash1[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2[2], + (const char*)hash2[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3[2], + (const char*)hash3[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4[2], + (const char*)hash4[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5[2], + (const char*)hash5[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6[2], + (const char*)hash6[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7[2], + (const char*)hash7[1], 512 ); + + intrlv_8x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2], + hash4[2], hash5[2], hash6[2], hash7[2] ); + +#endif + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0[3], hash1[3], hash2[3], hash3[3], + hash4[3], hash5[3], hash6[3], hash7[3], vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + dintrlv_8x64_512( hash0[4], hash1[4], hash2[4], hash3[4], + hash4[4], hash5[4], hash6[4], hash7[4], vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0[5], hash1[5], hash2[5], hash3[5], + hash4[5], hash5[5], hash6[5], hash7[5], vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + 
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + dintrlv_4x128_512( hash0[6], hash1[6], hash2[6], hash3[6], vhashA ); + dintrlv_4x128_512( hash4[6], hash5[6], hash6[6], hash7[6], vhashB ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + dintrlv_4x128_512( hash0[7], hash1[7], hash2[7], hash3[7], vhashA ); + dintrlv_4x128_512( hash4[7], hash5[7], hash6[7], hash7[7], vhashB ); + +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + dintrlv_4x128_512( hash0[8], hash1[8], hash2[8], hash3[8], vhashA ); + dintrlv_4x128_512( hash4[8], hash5[8], hash6[8], hash7[8], vhashB ); + +#else + + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64); + sph_shavite512_close(&ctx.shavite, hash0[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64); + sph_shavite512_close(&ctx.shavite, hash1[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash2[7], 64); + sph_shavite512_close(&ctx.shavite, hash2[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash3[7], 64); + sph_shavite512_close(&ctx.shavite, hash3[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash4[7], 64); + sph_shavite512_close(&ctx.shavite, hash4[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash5[7], 64); + sph_shavite512_close(&ctx.shavite, hash5[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash6[7], 64); + sph_shavite512_close(&ctx.shavite, hash6[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash7[7], 64); + sph_shavite512_close(&ctx.shavite, hash7[8]); + intrlv_4x128_512( vhashA, hash0[8], hash1[8], hash2[8], hash3[8] ); + intrlv_4x128_512( vhashB, hash4[8], hash5[8], hash6[8], hash7[8] ); + +#endif + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + dintrlv_4x128_512( hash0[9], hash1[9], hash2[9], hash3[9], vhashA ); + dintrlv_4x128_512( hash4[9], hash5[9], hash6[9], hash7[9], vhashB ); + +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + dintrlv_4x128_512( hash0[10], hash1[10], hash2[10], hash3[10], vhashA ); + dintrlv_4x128_512( hash4[10], hash5[10], hash6[10], hash7[10], vhashB ); + + intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10], + hash4[10], hash5[10], hash6[10], hash7[10] ); + +#else + + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash0[10], + (const BitSequence*)hash0[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash1[10], + (const BitSequence*)hash1[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash2[10], + (const 
BitSequence*)hash2[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash3[10], + (const BitSequence*)hash3[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash4[10], + (const BitSequence*)hash4[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash5[10], + (const BitSequence*)hash5[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash6[10], + (const BitSequence*)hash6[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash7[10], + (const BitSequence*)hash7[9], 512 ); + intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10], + hash4[10], hash5[10], hash6[10], hash7[10] ); + +#endif + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0[11], hash1[11], hash2[11], hash3[11], + hash4[11], hash5[11], hash6[11], hash7[11], vhash ); + + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64); + sph_fugue512_close(&ctx.fugue, hash0[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash1[11], 64); + sph_fugue512_close(&ctx.fugue, hash1[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash2[11], 64); + sph_fugue512_close(&ctx.fugue, hash2[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash3[11], 64); + sph_fugue512_close(&ctx.fugue, hash3[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash4[11], 64); + sph_fugue512_close(&ctx.fugue, hash4[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash5[11], 64); + sph_fugue512_close(&ctx.fugue, hash5[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash6[11], 64); + sph_fugue512_close(&ctx.fugue, hash6[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash7[11], 64); + sph_fugue512_close(&ctx.fugue, hash7[12]); + intrlv_8x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12], + hash4[12], hash5[12], hash6[12], hash7[12] ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0[13], hash1[13], hash2[13], hash3[13], + hash4[13], hash5[13], hash6[13], hash7[13], vhash ); + + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash0[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash1[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash1[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash2[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash2[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash3[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash3[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash4[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash4[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash5[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash5[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash6[13], 64); + sph_whirlpool_close(&ctx.whirlpool, 
hash6[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash7[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash7[14]); + intrlv_8x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14], + hash4[14], hash5[14], hash6[14], hash7[14] ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + dintrlv_8x64_512( hash0[15], hash1[15], hash2[15], hash3[15], + hash4[15], hash5[15], hash6[15], hash7[15], vhash ); + + ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]); + ComputeSingleSWIFFTX((unsigned char*)hash1[12], (unsigned char*)hash1[16]); + ComputeSingleSWIFFTX((unsigned char*)hash2[12], (unsigned char*)hash2[16]); + ComputeSingleSWIFFTX((unsigned char*)hash3[12], (unsigned char*)hash3[16]); + ComputeSingleSWIFFTX((unsigned char*)hash4[12], (unsigned char*)hash4[16]); + ComputeSingleSWIFFTX((unsigned char*)hash5[12], (unsigned char*)hash5[16]); + ComputeSingleSWIFFTX((unsigned char*)hash6[12], (unsigned char*)hash6[16]); + ComputeSingleSWIFFTX((unsigned char*)hash7[12], (unsigned char*)hash7[16]); + intrlv_8x32_512( vhashA, hash0[16], hash1[16], hash2[16], hash3[16], + hash4[16], hash5[16], hash6[16], hash7[16] ); + memset( vhash, 0, 64*8 ); + + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhashA, 64 ); + haval256_5_8way_close( &ctx.haval, vhash ); + dintrlv_8x32_512( hash0[17], hash1[17], hash2[17], hash3[17], + hash4[17], hash5[17], hash6[17], hash7[17], vhash ); + + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash0[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash0[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash1[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash1[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash2[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash2[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash3[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash3[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash4[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash4[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash5[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash5[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash6[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash6[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash7[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash7[18]); + + intrlv_2x256( vhash, hash0[18], hash1[18], 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash0[19], hash1[19], vhash, 256 ); + intrlv_2x256( vhash, hash2[18], hash3[18], 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash2[19], hash3[19], vhash, 256 ); + intrlv_2x256( vhash, hash4[18], hash5[18], 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash4[19], hash5[19], vhash, 256 ); + intrlv_2x256( vhash, hash6[18], hash7[18], 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash6[19], hash7[19], vhash, 256 ); + + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash0[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash0[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash1[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash1[20]); + 
sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash2[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash2[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash3[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash3[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash4[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash4[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash5[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash5[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash6[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash6[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash7[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash7[20]); + intrlv_8x32_512( vhashA, hash0[20], hash1[20], hash2[20], hash3[20], + hash4[20], hash5[20], hash6[20], hash7[20] ); + + sha256_8way_init( &ctx.sha256 ); + sha256_8way_update( &ctx.sha256, vhashA, 64 ); + sha256_8way_close( &ctx.sha256, vhash ); + dintrlv_8x32_512( hash0[21], hash1[21], hash2[21], hash3[21], + hash4[21], hash5[21], hash6[21], hash7[21], vhash ); + + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash0[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash0[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash1[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash1[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash2[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash2[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash3[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash3[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash4[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash4[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash5[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash5[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash6[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash6[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash7[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash7[22]); + + laneHash(512, (const BitSequence*)hash0[22], 512, (BitSequence*)hash0[23]); + laneHash(512, (const BitSequence*)hash1[22], 512, (BitSequence*)hash1[23]); + laneHash(512, (const BitSequence*)hash2[22], 512, (BitSequence*)hash2[23]); + laneHash(512, (const BitSequence*)hash3[22], 512, (BitSequence*)hash3[23]); + laneHash(512, (const BitSequence*)hash4[22], 512, (BitSequence*)hash4[23]); + laneHash(512, (const BitSequence*)hash5[22], 512, (BitSequence*)hash5[23]); + laneHash(512, (const BitSequence*)hash6[22], 512, (BitSequence*)hash6[23]); + laneHash(512, (const BitSequence*)hash7[22], 512, (BitSequence*)hash7[23]); + + x25x_shuffle( hash0 ); + x25x_shuffle( hash1 ); + x25x_shuffle( hash2 ); + x25x_shuffle( hash3 ); + x25x_shuffle( hash4 ); + x25x_shuffle( hash5 ); + x25x_shuffle( hash6 ); + x25x_shuffle( hash7 ); + + intrlv_8x32_512( vhashX[ 0], hash0[ 0], hash1[ 0], hash2[ 0], hash3[ 0], + hash4[ 0], hash5[ 0], hash6[ 0], hash7[ 0] ); + intrlv_8x32_512( vhashX[ 1], hash0[ 1], hash1[ 1], hash2[ 1], hash3[ 1], + hash4[ 1], hash5[ 1], hash6[ 1], hash7[ 1] ); + intrlv_8x32_512( vhashX[ 2], hash0[ 2], hash1[ 2], hash2[ 2], hash3[ 2], + hash4[ 2], hash5[ 2], hash6[ 2], hash7[ 2] ); + intrlv_8x32_512( 
vhashX[ 3], hash0[ 3], hash1[ 3], hash2[ 3], hash3[ 3], + hash4[ 3], hash5[ 3], hash6[ 3], hash7[ 3] ); + intrlv_8x32_512( vhashX[ 4], hash0[ 4], hash1[ 4], hash2[ 4], hash3[ 4], + hash4[ 4], hash5[ 4], hash6[ 4], hash7[ 4] ); + intrlv_8x32_512( vhashX[ 5], hash0[ 5], hash1[ 5], hash2[ 5], hash3[ 5], + hash4[ 5], hash5[ 5], hash6[ 5], hash7[ 5] ); + intrlv_8x32_512( vhashX[ 6], hash0[ 6], hash1[ 6], hash2[ 6], hash3[ 6], + hash4[ 6], hash5[ 6], hash6[ 6], hash7[ 6] ); + intrlv_8x32_512( vhashX[ 7], hash0[ 7], hash1[ 7], hash2[ 7], hash3[ 7], + hash4[ 7], hash5[ 7], hash6[ 7], hash7[ 7] ); + intrlv_8x32_512( vhashX[ 8], hash0[ 8], hash1[ 8], hash2[ 8], hash3[ 8], + hash4[ 8], hash5[ 8], hash6[ 8], hash7[ 8] ); + intrlv_8x32_512( vhashX[ 9], hash0[ 9], hash1[ 9], hash2[ 9], hash3[ 9], + hash4[ 9], hash5[ 9], hash6[ 9], hash7[ 9] ); + intrlv_8x32_512( vhashX[10], hash0[10], hash1[10], hash2[10], hash3[10], + hash4[10], hash5[10], hash6[10], hash7[10] ); + intrlv_8x32_512( vhashX[11], hash0[11], hash1[11], hash2[11], hash3[11], + hash4[11], hash5[11], hash6[11], hash7[11] ); + intrlv_8x32_512( vhashX[12], hash0[12], hash1[12], hash2[12], hash3[12], + hash4[12], hash5[12], hash6[12], hash7[12] ); + intrlv_8x32_512( vhashX[13], hash0[13], hash1[13], hash2[13], hash3[13], + hash4[13], hash5[13], hash6[13], hash7[13] ); + intrlv_8x32_512( vhashX[14], hash0[14], hash1[14], hash2[14], hash3[14], + hash4[14], hash5[14], hash6[14], hash7[14] ); + intrlv_8x32_512( vhashX[15], hash0[15], hash1[15], hash2[15], hash3[15], + hash4[15], hash5[15], hash6[15], hash7[15] ); + intrlv_8x32_512( vhashX[16], hash0[16], hash1[16], hash2[16], hash3[16], + hash4[16], hash5[16], hash6[16], hash7[16] ); + intrlv_8x32_512( vhashX[17], hash0[17], hash1[17], hash2[17], hash3[17], + hash4[17], hash5[17], hash6[17], hash7[17] ); + intrlv_8x32_512( vhashX[18], hash0[18], hash1[18], hash2[18], hash3[18], + hash4[18], hash5[18], hash6[18], hash7[18] ); + intrlv_8x32_512( vhashX[19], hash0[19], hash1[19], hash2[19], hash3[19], + hash4[19], hash5[19], hash6[19], hash7[19] ); + intrlv_8x32_512( vhashX[20], hash0[20], hash1[20], hash2[20], hash3[20], + hash4[20], hash5[20], hash6[20], hash7[20] ); + intrlv_8x32_512( vhashX[21], hash0[21], hash1[21], hash2[21], hash3[21], + hash4[21], hash5[21], hash6[21], hash7[21] ); + intrlv_8x32_512( vhashX[22], hash0[22], hash1[22], hash2[22], hash3[22], + hash4[22], hash5[22], hash6[22], hash7[22] ); + intrlv_8x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23], + hash4[23], hash5[23], hash6[23], hash7[23] ); + + blake2s_8way_init( &ctx.blake2s, 32 ); + blake2s_8way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 ); +} + +int scanhash_x25x_8way( struct work* work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 4; + const int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x08ff; + + InitializeSWIFFTX(); + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 
0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + x25x_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(X25X_4WAY) + +union _x25x_4way_ctx_overlay +{ + blake512_4way_context blake; + bmw512_4way_context bmw; + hashState_groestl groestl; + hashState_echo echo; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hamsi512_4way_context hamsi; + sph_fugue512_context fugue; + shabal512_4way_context shabal; + sph_whirlpool_context whirlpool; + sha512_4way_context sha512; + haval256_5_4way_context haval; + sph_tiger_context tiger; + sph_gost512_context gost; + sha256_4way_context sha256; + sph_panama_context panama; + blake2s_4way_state blake2s; +}; +typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay; + +void x25x_4way_hash( void *output, const void *input ) +{ + uint64_t vhash[8*4] __attribute__ ((aligned (128))); unsigned char hash0[25][64] __attribute__((aligned(64))) = {0}; unsigned char hash1[25][64] __attribute__((aligned(64))) = {0}; unsigned char hash2[25][64] __attribute__((aligned(64))) = {0}; unsigned char hash3[25][64] __attribute__((aligned(64))) = {0}; - uint64_t vhash[8*4] __attribute__ ((aligned (64))); -// Doubling the size of vhashX breaks everything. It may have something -// to do with accessing arrays: vhashX vs vhashX[0] vs &vhash[0]. -// Changing notation did seem to allow the larger buffer but still resulted -// in problems further along. 
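// (For reference on the notation mentioned above: given
//     unsigned char vhashX[24][64*4];
// all three expressions name the same address but have different types,
// so pointer arithmetic scales differently:
//     vhashX      decays to unsigned char (*)[256]; +1 advances 256 bytes
//     vhashX[0]   decays to unsigned char *;        +1 advances 1 byte
//     &vhashX[0]  is unsigned char (*)[256], equal in value to vhashX
// Any of them converts silently through a void* parameter, which can hide
// a wrong stride from the compiler.)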
-// unsigned char vhashX[24][64*8] __attribute__ ((aligned (64))); unsigned char vhashX[24][64*4] __attribute__ ((aligned (64))); x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64))); blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash ); @@ -118,24 +655,24 @@ void x25x_4way_hash( void *output, const void *input ) init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash3[2], (const char*)hash3[1], 512 ); - + intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); dintrlv_4x64_512( hash0[4], hash1[4], hash2[4], hash3[4], vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash ); - + init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0[6], (const BitSequence*)hash0[5], 64 ); @@ -162,9 +699,9 @@ void x25x_4way_hash( void *output, const void *input ) cubehashUpdateDigest( &ctx.cube, (byte*) hash3[7], (const byte*)hash3[6], 64 ); - sph_shavite512_init(&ctx.shavite); - sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64); - sph_shavite512_close(&ctx.shavite, hash0[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64); + sph_shavite512_close(&ctx.shavite, hash0[8]); sph_shavite512_init(&ctx.shavite); sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64); sph_shavite512_close(&ctx.shavite, hash1[8]); @@ -204,13 +741,13 @@ void x25x_4way_hash( void *output, const void *input ) intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0[11], hash1[11], hash2[11], hash3[11], vhash ); - sph_fugue512_init(&ctx.fugue); - sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64); - sph_fugue512_close(&ctx.fugue, hash0[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64); + sph_fugue512_close(&ctx.fugue, hash0[12]); sph_fugue512_init(&ctx.fugue); sph_fugue512(&ctx.fugue, (const void*) hash1[11], 64); sph_fugue512_close(&ctx.fugue, hash1[12]); @@ -224,13 +761,13 @@ void x25x_4way_hash( void *output, const void *input ) intrlv_4x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12] ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0[13], hash1[13], hash2[13], hash3[13], vhash ); - sph_whirlpool_init(&ctx.whirlpool); - sph_whirlpool 
(&ctx.whirlpool, (const void*) hash0[13], 64); - sph_whirlpool_close(&ctx.whirlpool, hash0[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash0[14]); sph_whirlpool_init(&ctx.whirlpool); sph_whirlpool (&ctx.whirlpool, (const void*) hash1[13], 64); sph_whirlpool_close(&ctx.whirlpool, hash1[14]); @@ -244,11 +781,10 @@ void x25x_4way_hash( void *output, const void *input ) intrlv_4x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14] ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64_512( hash0[15], hash1[15], hash2[15], hash3[15], vhash ); - ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]); ComputeSingleSWIFFTX((unsigned char*)hash1[12], (unsigned char*)hash1[16]); ComputeSingleSWIFFTX((unsigned char*)hash2[12], (unsigned char*)hash2[16]); @@ -257,15 +793,15 @@ void x25x_4way_hash( void *output, const void *input ) intrlv_4x32_512( vhashX[0], hash0[16], hash1[16], hash2[16], hash3[16] ); memset( vhash, 0, 64*4 ); - + haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashX[0], 64 ); + haval256_5_4way_update( &ctx.haval, vhashX[0], 64 ); haval256_5_4way_close( &ctx.haval, vhash ); dintrlv_4x32_512( hash0[17], hash1[17], hash2[17], hash3[17], vhash ); - sph_tiger_init(&ctx.tiger); - sph_tiger (&ctx.tiger, (const void*) hash0[17], 64); - sph_tiger_close(&ctx.tiger, (void*) hash0[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash0[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash0[18]); sph_tiger_init(&ctx.tiger); sph_tiger (&ctx.tiger, (const void*) hash1[17], 64); sph_tiger_close(&ctx.tiger, (void*) hash1[18]); @@ -276,7 +812,7 @@ void x25x_4way_hash( void *output, const void *input ) sph_tiger (&ctx.tiger, (const void*) hash3[17], 64); sph_tiger_close(&ctx.tiger, (void*) hash3[18]); - LYRA2RE( (void*)hash0[19], 32, (const void*)hash0[18], 32, + LYRA2RE( (void*)hash0[19], 32, (const void*)hash0[18], 32, (const void*)hash0[18], 32, 1, 4, 4 ); LYRA2RE( (void*)hash1[19], 32, (const void*)hash1[18], 32, (const void*)hash1[18], 32, 1, 4, 4 ); @@ -285,9 +821,9 @@ void x25x_4way_hash( void *output, const void *input ) LYRA2RE( (void*)hash3[19], 32, (const void*)hash3[18], 32, (const void*)hash3[18], 32, 1, 4, 4 ); - sph_gost512_init(&ctx.gost); - sph_gost512 (&ctx.gost, (const void*) hash0[19], 64); - sph_gost512_close(&ctx.gost, (void*) hash0[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash0[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash0[20]); sph_gost512_init(&ctx.gost); sph_gost512 (&ctx.gost, (const void*) hash1[19], 64); sph_gost512_close(&ctx.gost, (void*) hash1[20]); @@ -302,7 +838,7 @@ void x25x_4way_hash( void *output, const void *input ) memset( vhash, 0, 64*4 ); sha256_4way_init( &ctx.sha256 ); - sha256_4way( &ctx.sha256, vhashX[0], 64 ); + sha256_4way_update( &ctx.sha256, vhashX[0], 64 ); sha256_4way_close( &ctx.sha256, vhash ); dintrlv_4x32_512( hash0[21], hash1[21], hash2[21], hash3[21], vhash ); @@ -356,20 +892,12 @@ void x25x_4way_hash( void *output, const void *input ) blake2s_4way_init( &ctx.blake2s, 32 ); blake2s_4way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 ); -/* - dintrlv_4x32( hash0[24], hash1[24], hash2[24], hash3[24], vhash, 256 ); - - memcpy(output, hash0[24], 32); - memcpy(output+32, hash1[24], 32); - memcpy(output+64, 
hash2[24], 32);
-   memcpy(output+96, hash3[24], 32);
-*/
 }

 int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[4*16] __attribute__ ((aligned (64)));
+   uint32_t hash[16*4] __attribute__ ((aligned (128)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
    uint32_t *hash7 = &(hash[7<<2]);
@@ -401,17 +929,8 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
       {
          pdata[19] = n + lane;
          submit_lane_solution( work, lane_hash, mythr, lane );
-      }
+      }
    }
-/*
-   for ( int i = 0; i < 4; i++ )
-   if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
-   if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
-   {
-      pdata[19] = n+i;
-      submit_lane_solution( work, hash+(i<<3), mythr, i );
-   }
-*/
    n += 4;
 } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
diff --git a/build-allarch.sh b/build-allarch.sh
index 6e8fd89..ea69c63 100755
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -4,7 +4,7 @@
 # during development. However the information contained may provide compilation
 # tips to users.

-rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen
+rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen 2> /dev/null

 make distclean || echo clean
 rm -f config.status
diff --git a/build-avx2.sh b/build-avx2.sh
new file mode 100755
index 0000000..7a12473
--- /dev/null
+++ b/build-avx2.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+#if [ "$OS" = "Windows_NT" ]; then
+#   ./mingw64.sh
+#   exit 0
+#fi
+
+# Linux build
+
+make distclean || echo clean
+
+rm -f config.status
+./autogen.sh || echo done
+
+# Ubuntu 10.04 (gcc 4.4)
+# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
+
+# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
+#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
+
+#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
+CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
+#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
+
+make -j 4
+
+strip -s cpuminer
diff --git a/clean-all.sh b/clean-all.sh
new file mode 100755
index 0000000..48a233e
--- /dev/null
+++ b/clean-all.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+#
+# make distclean and rm all the targeted executables,
+# suppressing errors for any that don't exist.
+
+rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen 2> /dev/null
+
+rm cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-aes-avx.exe cpuminer-aes-sse42.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-sse2.exe cpuminer-zen.exe 2> /dev/null
+
+make distclean > /dev/null
diff --git a/configure b/configure
index 3a5454b..76f55d4 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.6.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.0.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=

 # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.10.6' -PACKAGE_STRING='cpuminer-opt 3.10.6' +PACKAGE_VERSION='3.11.0' +PACKAGE_STRING='cpuminer-opt 3.11.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.10.6 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.11.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.10.6:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.11.0:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.10.6 +cpuminer-opt configure 3.11.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.10.6, which was +It was created by cpuminer-opt $as_me 3.11.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.10.6' + VERSION='3.11.0' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.10.6, which was +This file was extended by cpuminer-opt $as_me 3.11.0, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.10.6 +cpuminer-opt config.status 3.11.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 613de42..c633926 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.10.6]) +AC_INIT([cpuminer-opt], [3.11.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index d1fb2d6..764b928 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1028,7 +1028,7 @@ static int share_result( int result, struct work *null_work, { // empty queue, it must have overflowed and stats were lost for a share. pthread_mutex_unlock( &stats_lock ); - applog(LOG_WARNING,"Pending shares overflow, stats for share are lost."); + applog(LOG_WARNING,"Share stats not available."); } // calculate latency and share time. 
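The intrlv.h hunk below adds an 8x32 -> 4x128 re-interleave helper. As a
usage sketch only — the helper, context, and function names are taken from
this patch, but this exact call sequence is illustrative and not part of
it — it bridges an 8-lane 32-bit-interleaved stage into the two 4-lane
128-bit-interleaved buffers the VAES 4-way stages consume:

   uint32_t vhash[16*8]  __attribute__ ((aligned (64)));  // 8 lanes, 32-bit interleaved
   uint64_t vhashA[8*4]  __attribute__ ((aligned (64)));  // lanes 0-3, 128-bit interleaved
   uint64_t vhashB[8*4]  __attribute__ ((aligned (64)));  // lanes 4-7, 128-bit interleaved

   shabal512_8way_close( &ctx.shabal, vhash );        // 8x32 output, 512 bits per lane
   rintrlv_8x32_4x128( vhashA, vhashB, vhash, 512 );
   shavite512_4way_init( &ctx.shavite );
   shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
   shavite512_4way_init( &ctx.shavite );
   shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );

Converting the interleaved data directly avoids a full deinterleave into
eight scalar buffers followed by a reinterleave, which is exactly the
pattern the VAES branches earlier in this patch eliminate.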
diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h
index 961c57d..64b8d7b 100644
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -2283,7 +2283,46 @@ static inline void rintrlv_8x32_8x64( void *dst,
    d[63] = _mm_unpackhi_epi32( s[61], s[63] );
 }

+// 8x32 -> 4x128
+// 16 bytes per lane
+#define RLEAVE_8X32_4X128( i ) \
+do { \
+  uint32_t *d0 = (uint32_t*)dst0 + (i); \
+  uint32_t *d1 = (uint32_t*)dst1 + (i); \
+  const uint32_t *s = (const uint32_t*)src + ((i)<<1); \
+   d0[ 0] = s[ 0];   d1[ 0] = s[ 4]; \
+   d0[ 1] = s[ 8];   d1[ 1] = s[12]; \
+   d0[ 2] = s[16];   d1[ 2] = s[20]; \
+   d0[ 3] = s[24];   d1[ 3] = s[28]; \
+\
+   d0[ 4] = s[ 1];   d1[ 4] = s[ 5]; \
+   d0[ 5] = s[ 9];   d1[ 5] = s[13]; \
+   d0[ 6] = s[17];   d1[ 6] = s[21]; \
+   d0[ 7] = s[25];   d1[ 7] = s[29]; \
+\
+   d0[ 8] = s[ 2];   d1[ 8] = s[ 6]; \
+   d0[ 9] = s[10];   d1[ 9] = s[14]; \
+   d0[10] = s[18];   d1[10] = s[22]; \
+   d0[11] = s[26];   d1[11] = s[30]; \
+\
+   d0[12] = s[ 3];   d1[12] = s[ 7]; \
+   d0[13] = s[11];   d1[13] = s[15]; \
+   d0[14] = s[19];   d1[14] = s[23]; \
+   d0[15] = s[27];   d1[15] = s[31]; \
+} while(0)
+
+static inline void rintrlv_8x32_4x128( void *dst0, void *dst1,
+                                       const void *src, const int bit_len )
+{
+   RLEAVE_8X32_4X128(  0 );   RLEAVE_8X32_4X128( 16 );
+   if ( bit_len <= 256 ) return;
+   RLEAVE_8X32_4X128( 32 );   RLEAVE_8X32_4X128( 48 );
+   if ( bit_len <= 512 ) return;
+   RLEAVE_8X32_4X128( 64 );   RLEAVE_8X32_4X128( 80 );
+   RLEAVE_8X32_4X128( 96 );   RLEAVE_8X32_4X128( 112 );
+}
+#undef RLEAVE_8X32_4X128

 /*
 #define RLEAVE_4x32_4x64(i) do \
diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h
index 3bdde9b..ce9218c 100644
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -42,17 +42,18 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
    return mm256_concat_128( hi, lo );
 }

-// Broadcast 128 bits in pairs of 64 bit integer constants {i1. i0} to all
-// 128 bit lanes.
-#define m256_const2_64( i1, i0 ) \
-   _mm256_permute4x64_epi64( _mm256_castsi128_si256( \
-                             m128_const_64( i1, i0 ) ), 0x44 )
-
 // Equivalent of set1, broadcast integer constant to all elements.
-#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
-#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
-#define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
-#define m256_const1_8 ( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
+#define m256_const1_128( v ) _mm256_broadcastsi128_si256( v )
+#define m256_const1_64( i )  _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
+#define m256_const1_32( i )  _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
+#define m256_const1_16( i )  _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
+#define m256_const1_8( i )   _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
+
+#define m256_const2_64( i1, i0 ) \
+   m256_const1_128( m128_const_64( i1, i0 ) )
+
+#define m256_const2_32( i1, i0 ) \
+   m256_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) )

 //
diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h
index 5a87979..03118fa 100644
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -38,6 +38,36 @@
 // shuffle_epi8 shuffles across entire 512 bits. Shuffle usually
 // doesn't cross 128 bit lane boundaries but is consistent with AVX2
 // where shuffle_epi8 spans the entire vector.
+//
+// There are 2 areas where overhead is a concern: constants and
+// permutations.
+//
+// Constants need to be composed at run time by assembling individual
+// elements, which is very expensive. The cost is proportional to the
+// number of elements, therefore use the largest element size possible,
+// even by merging smaller values.
+//
+// Constants with repeating patterns can be optimized, with smaller, more
+// frequently repeated patterns being more efficient.
+//
+// Some specific constants can be very efficient. Zero is very efficient,
+// 1 and -1 slightly less so.
+//
+// If an expensive constant is to be reused in the same function it should
+// be declared as a local variable, defined once and reused.
+//
+// Permutations can be very expensive if they use a vector control index,
+// even if the permutation itself is quite efficient.
+// The index is essentially a constant with all the baggage that brings.
+// The same rules apply: if an index is to be reused it should be defined
+// as a local. This applies specifically to bswap operations.
+//
+// Additionally, permutations using smaller vectors can be more efficient
+// if the permutation doesn't cross lane boundaries, typically 128 bits,
+// and the smaller vector can use an imm control.
+//
+// If the permutation doesn't cross lane boundaries a shuffle instruction
+// can be used with an imm control instead of a permute.

 //////////////////////////////////////////////////////////////
 //
@@ -106,12 +136,14 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
 #define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
 #define m512_const1_8( i )  _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )

+#define m512_const2_128( v1, v0 ) \
+   m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) )
+
 #define m512_const2_64( i1, i0 ) \
    m512_const1_128( m128_const_64( i1, i0 ) )

 #define m512_const2_32( i1, i0 ) \
-   m512_const1_64( ( ( ( (uint64_t)(i1) << 32 ) ) \
-                   | ( (uint64_t)(i0) & 0xffffffff ) ) )
+   m512_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) )

 // { m128_1, m128_1, m128_0, m128_0 }
 #define m512_const_2x128( v1, v0 ) \
diff --git a/winbuild-cross.sh b/winbuild-cross.sh
index f7f8968..c2d7720 100755
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -27,6 +27,9 @@ ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
 #sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac

 # make release directory and copy selected DLLs.
+
+rm -rf release > /dev/null
+
 mkdir release
 cp README.txt release/
 cp README.md release/
@@ -35,10 +38,6 @@ cp $MINGW_LIB/zlib1.dll release/
 cp $MINGW_LIB/libwinpthread-1.dll release/
 cp $GCC_MINGW_LIB/libstdc++-6.dll release/
 cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
-#cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
-#cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
-#cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
-#cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
 cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
 cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
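Finally, a minimal sketch of the "compose expensive constants once" advice
from the simd-512.h comments above. m512_const_64 and the byte-swap index
pattern are from this tree; the function itself is hypothetical:

   static inline void bswap32_512( __m512i *dst, const __m512i *src, int n )
   {
      // Build the vpshufb index once per call; composing it inside the loop
      // would repeat the element-assembly cost on every iteration.
      const __m512i bswap_idx = m512_const_64(
                        0x3c3d3e3f38393a3b, 0x3435363730313233,
                        0x2c2d2e2f28292a2b, 0x2425262720212223,
                        0x1c1d1e1f18191a1b, 0x1415161710111213,
                        0x0c0d0e0f08090a0b, 0x0405060700010203 );
      for ( int i = 0; i < n; i++ )
         dst[i] = _mm512_shuffle_epi8( src[i], bswap_idx );  // 32-bit bswap
   }

Hoisting matters because the composed index is itself a run-time constant:
the shuffle is one instruction, but rebuilding its control vector per
iteration would dominate it.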