From 1a7a57367543f645da655e67ca53f0b94c68b60f Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Sat, 18 Jan 2020 15:14:27 -0500 Subject: [PATCH] v3.11.5 --- Makefile.am | 1 + RELEASE_NOTES | 61 +++ algo/blake/blake-4way.c | 4 +- algo/blake/blake-hash-4way.h | 45 +- algo/blake/blake2b-4way.c | 4 +- algo/blake/blake2b.c | 2 +- algo/blake/blake2s.c | 2 +- algo/blake/blake512-hash-4way.c | 463 +++++++++++++++---- algo/blake/blakecoin-4way.c | 8 +- algo/blake/decred-4way.c | 4 +- algo/blake/pentablake-4way.c | 10 +- algo/bmw/bmw512-4way.c | 5 +- algo/cubehash/cube-hash-2way.c | 118 +++++ algo/cubehash/cube-hash-2way.h | 14 +- algo/echo/aes_ni/hash.c | 168 ++++++- algo/echo/aes_ni/hash.c.test | 620 -------------------------- algo/echo/aes_ni/hash_api.h | 4 +- algo/echo/echo-hash-4way.c | 88 ++++ algo/echo/echo-hash-4way.h | 3 + algo/groestl/aes_ni/groestl-version.h | 7 - algo/groestl/aes_ni/hash-groestl.c | 83 +++- algo/groestl/aes_ni/hash-groestl.h | 1 + algo/groestl/aes_ni/hash-groestl256.c | 9 +- algo/groestl/aes_ni/hash-groestl256.h | 3 - algo/groestl/groestl-4way.c | 2 +- algo/groestl/groestl.c | 35 +- algo/groestl/groestl256-hash-4way.c | 267 ++--------- algo/groestl/groestl256-hash-4way.h | 102 ++--- algo/groestl/groestl256-intr-4way.h | 434 +++++++++--------- algo/groestl/groestl512-hash-4way.c | 91 ++-- algo/groestl/groestl512-hash-4way.h | 44 +- algo/groestl/myr-groestl.c | 28 +- algo/heavy/bastion.c | 49 +- algo/hodl/hodl-gate.c | 2 +- algo/jh/jh-hash-4way.c | 241 +++++----- algo/jh/jh-hash-4way.h | 1 - algo/jh/jha-4way.c | 2 +- algo/jh/jha.c | 71 ++- algo/keccak/keccak-4way.c | 4 +- algo/keccak/keccak-hash-4way.c | 4 +- algo/keccak/keccak-hash-4way.h | 8 - algo/luffa/luffa-hash-2way.c | 147 ++++++ algo/luffa/luffa-hash-2way.h | 20 +- algo/lyra2/allium-4way.c | 476 +++++++++++++------- algo/lyra2/lyra2-gate.c | 22 +- algo/lyra2/lyra2-gate.h | 20 +- algo/lyra2/lyra2-hash-2way.c | 134 ++++++ algo/lyra2/lyra2.h | 3 + algo/lyra2/lyra2h-4way.c | 2 +- algo/lyra2/sponge-2way.c | 412 +++++++++++++++-- algo/lyra2/sponge.h | 25 +- algo/m7m.c | 55 +-- algo/nist5/nist5-4way.c | 4 +- algo/quark/hmq1725-4way.c | 4 +- algo/quark/quark-4way.c | 31 +- algo/qubit/deep.c | 22 +- algo/qubit/qubit-2way.c | 51 +-- algo/qubit/qubit.c | 22 +- algo/shavite/shavite-hash-2way.c | 98 +++- algo/shavite/shavite-hash-2way.h | 2 + algo/shavite/shavite-hash-4way.c | 92 ++++ algo/shavite/shavite-hash-4way.h | 2 + algo/simd/simd-hash-2way.c | 171 +++++++ algo/simd/simd-hash-2way.h | 5 + algo/skein/skein-hash-4way.c | 36 +- algo/skein/skein-hash-4way.h | 24 +- algo/x11/c11-4way.c | 43 +- algo/x11/c11.c | 10 +- algo/x11/timetravel.c | 42 +- algo/x11/timetravel10.c | 43 +- algo/x11/tribus-4way.c | 4 +- algo/x11/tribus.c | 26 +- algo/x11/x11-4way.c | 2 +- algo/x11/x11evo.c | 43 +- algo/x11/x11gost-4way.c | 43 +- algo/x11/x11gost.c | 8 +- algo/x12/x12-4way.c | 7 +- algo/x13/phi1612.c | 28 +- algo/x13/x13-4way.c | 2 +- algo/x13/x13bcd-4way.c | 2 +- algo/x13/x13sm3-4way.c | 2 +- algo/x13/x13sm3.c | 8 +- algo/x14/polytimos.c | 36 +- algo/x14/x14-4way.c | 4 +- algo/x14/x14.c | 10 +- algo/x15/x15-4way.c | 4 +- algo/x16/x16r-4way.c | 198 +++----- algo/x16/x21s-4way.c | 4 + algo/x17/x17-4way.c | 209 +++------ algo/x17/xevan-4way.c | 332 +++++--------- algo/x22/x22i-4way.c | 26 +- algo/x22/x25x-4way.c | 8 +- algo/yescrypt/yescrypt.c | 2 +- algo/yespower/yespower-blake2b.c | 2 +- algo/yespower/yespower-gate.c | 2 +- algo/yespower/yespower-opt.c | 2 +- build-allarch.sh | 32 +- clean-all.sh | 4 +- configure | 20 +- configure.ac | 2 +- 
 cpu-miner.c                           |  173 ++---
 miner.h                               |   12 -
 simd-utils/intrlv.h                   |    2 +
 simd-utils/simd-512.h                 |    6 +-
 sysinfos.c                            |   17 +-
 105 files changed, 3579 insertions(+), 2833 deletions(-)
 delete mode 100644 algo/echo/aes_ni/hash.c.test

diff --git a/Makefile.am b/Makefile.am
index 14154ed..c9a861e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -89,6 +89,7 @@ cpuminer_SOURCES = \
   algo/gost/sph_gost.c \
   algo/groestl/groestl-gate.c \
   algo/groestl/groestl512-hash-4way.c \
+  algo/groestl/groestl256-hash-4way.c \
   algo/groestl/sph_groestl.c \
   algo/groestl/groestl.c \
   algo/groestl/groestl-4way.c \
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index a82147f..60d2aed 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -33,9 +33,70 @@ supported.
 64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
 are not supported. FreeBSD YMMV.
 
+Reporting bugs
+--------------
+
+Bugs can be reported by sending an email to JayDDee246@gmail.com or opening
+an issue on GitHub: https://github.com/JayDDee/cpuminer-opt/issues
+
+Please include the following information:
+
+1. CPU model, operating system, cpuminer-opt version (must be latest),
+   binary file for Windows, changes to the default build procedure for Linux.
+
+2. Exact command line (except user and pw) and initial output showing
+   the above requested info.
+
+3. Additional program output showing any error messages or other
+   pertinent data.
+
+4. A clear description of the problem including history, scope,
+   persistence or intermittence, and reproducibility.
+
+In simpler terms:
+
+What is it doing?
+What should it be doing instead?
+Did it work in a previous release?
+Does it happen for all algos? All pools? All options? Solo?
+Does it happen all the time?
+If not, what makes it happen or not happen?
+
 Change Log
 ----------
 
+v3.11.5
+
+Fixed AVX512 detection that could cause compilation errors on CPUs
+without AVX512.
+
+Fixed "BLOCK SOLVED" log incorrectly displaying "Accepted" when a block
+is solved.
+Added share counter to share submitted & accepted logs.
+Added job id to share submitted log.
+Share submitted log is no longer highlighted blue; there was too much blue.
+
+Another CPU temperature fix for Linux.
+
+Added bug reporting tips to RELEASE_NOTES.
+
+v3.11.4
+
+Fixed scrypt segfault since v3.9.9.1.
+
+Stale shares are counted and reported separately from other rejected shares.
+
+Display of counters for solved blocks, rejects, and stale shares is
+suppressed in the periodic summary when zero.
+
+v3.11.3
+
+Fixed x12 AVX2 again.
+
+More speed for allium: AVX2 +4%, AVX512 +6%, VAES +14%.
+
+Restored lost speed for x22i & x25x.
+
 v3.11.2
 
 Fixed x11gost (sib) AVX2 invalid shares.
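A note on the "<" to "<=" changes against Htarg in the blake2b, blake2s and
bmw512 scanhash hunks below: hash7 holds only the high 32 bits of a lane's
hash and Htarg the high 32 bits of the target, so a lane with
hash7 == Htarg may still be a valid share, and the strict test discarded
such boundary shares before fulltest() could examine all 256 bits. A minimal
sketch of the corrected pattern, using names from the 8-way hunks below; the
nonce bookkeeping inside the braces is elided in the hunks and omitted here:

    for ( int lane = 0; lane < 8; lane++ )
    if ( hash7[ lane<<1 ] <= Htarg )      // was '<': dropped hash7 == Htarg
    {
       // Only the full 256-bit comparison decides whether to submit.
       extr_lane_8x64( lane_hash, hash, lane, 256 );
       if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          submit_solution( work, lane_hash, mythr );
    }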
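This patch also adds fused one-shot entry points that combine init, update
and close in a single call: blake512_4way_full, blake512_8way_full,
cube_4way_full, cube_2way_full, echo_full and echo_4way_full. A hypothetical
caller sketch for the Blake-512 4-way variant; the function and its
signature are taken from this patch, while the wrapper name and the 80-byte
header use case are illustrative only:

    #include "algo/blake/blake-hash-4way.h" // declares blake512_4way_full (new in v3.11.5)

    // Hash four interleaved 80-byte block headers in one call,
    // replacing the blake512_4way_init / _update / _close sequence.
    static void blake512_4way_oneshot( void *out, const void *in )
    {
       blake512_4way_context ctx;               // no separate init call needed
       blake512_4way_full( &ctx, out, in, 80 ); // len in bytes, at most 128 (one block)
    }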
diff --git a/algo/blake/blake-4way.c b/algo/blake/blake-4way.c index 8d1372f..c397a40 100644 --- a/algo/blake/blake-4way.c +++ b/algo/blake/blake-4way.c @@ -13,7 +13,7 @@ void blakehash_4way(void *state, const void *input) uint32_t vhash[8*4] __attribute__ ((aligned (64))); blake256r14_4way_context ctx; memcpy( &ctx, &blake_4w_ctx, sizeof ctx ); - blake256r14_4way( &ctx, input + (64<<2), 16 ); + blake256r14_4way_update( &ctx, input + (64<<2), 16 ); blake256r14_4way_close( &ctx, vhash ); dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 ); } @@ -36,7 +36,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce, mm128_bswap32_intrlv80_4x32( vdata, pdata ); blake256r14_4way_init( &blake_4w_ctx ); - blake256r14_4way( &blake_4w_ctx, vdata, 64 ); + blake256r14_4way_update( &blake_4w_ctx, vdata, 64 ); do { *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index 091a537..fc64583 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -37,8 +37,6 @@ #ifndef __BLAKE_HASH_4WAY__ #define __BLAKE_HASH_4WAY__ 1 -//#ifdef __SSE4_2__ - #ifdef __cplusplus extern "C"{ #endif @@ -51,46 +49,41 @@ extern "C"{ #define SPH_SIZE_blake512 512 -// With SSE4.2 only Blake-256 4 way is available. -// With AVX2 Blake-256 8way & Blake-512 4 way are also available. - -// Blake-256 4 way +////////////////////////// +// +// Blake-256 4 way SSE2 typedef struct { unsigned char buf[64<<2]; uint32_t H[8<<2]; -// __m128i buf[16] __attribute__ ((aligned (64))); -// __m128i H[8]; -// __m128i S[4]; size_t ptr; uint32_t T0, T1; int rounds; // 14 for blake, 8 for blakecoin & vanilla } blake_4way_small_context __attribute__ ((aligned (64))); -// Default 14 rounds +// Default, 14 rounds, blake, decred typedef blake_4way_small_context blake256_4way_context; void blake256_4way_init(void *ctx); void blake256_4way_update(void *ctx, const void *data, size_t len); -#define blake256_4way blake256_4way_update void blake256_4way_close(void *ctx, void *dst); // 14 rounds, blake, decred typedef blake_4way_small_context blake256r14_4way_context; void blake256r14_4way_init(void *cc); void blake256r14_4way_update(void *cc, const void *data, size_t len); -#define blake256r14_4way blake256r14_4way_update void blake256r14_4way_close(void *cc, void *dst); // 8 rounds, blakecoin, vanilla typedef blake_4way_small_context blake256r8_4way_context; void blake256r8_4way_init(void *cc); void blake256r8_4way_update(void *cc, const void *data, size_t len); -#define blake256r8_4way blake256r8_4way_update void blake256r8_4way_close(void *cc, void *dst); #ifdef __AVX2__ -// Blake-256 8 way +////////////////////////// +// +// Blake-256 8 way AVX2 typedef struct { __m256i buf[16] __attribute__ ((aligned (64))); @@ -104,7 +97,6 @@ typedef struct { typedef blake_8way_small_context blake256_8way_context; void blake256_8way_init(void *cc); void blake256_8way_update(void *cc, const void *data, size_t len); -//#define blake256_8way blake256_8way_update void blake256_8way_close(void *cc, void *dst); // 14 rounds, blake, decred @@ -117,10 +109,9 @@ void blake256r14_8way_close(void *cc, void *dst); typedef blake_8way_small_context blake256r8_8way_context; void blake256r8_8way_init(void *cc); void blake256r8_8way_update(void *cc, const void *data, size_t len); -#define blake256r8_8way blake256r8_8way_update void blake256r8_8way_close(void *cc, void *dst); -// Blake-512 4 way +// Blake-512 4 way AVX2 typedef struct { __m256i buf[16]; @@ -134,14 
+125,15 @@ typedef blake_4way_big_context blake512_4way_context; void blake512_4way_init( blake_4way_big_context *sc ); void blake512_4way_update( void *cc, const void *data, size_t len ); -#define blake512_4way blake512_4way_update void blake512_4way_close( void *cc, void *dst ); -void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ); +void blake512_4way_full( blake_4way_big_context *sc, void * dst, + const void *data, size_t len ); #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -//Blake-256 16 way +//////////////////////////// +// +// Blake-256 16 way AVX512 typedef struct { __m512i buf[16]; @@ -169,8 +161,9 @@ void blake256r8_16way_init(void *cc); void blake256r8_16way_update(void *cc, const void *data, size_t len); void blake256r8_16way_close(void *cc, void *dst); - -// Blake-512 8 way +//////////////////////////// +// +//// Blake-512 8 way AVX512 typedef struct { __m512i buf[16]; @@ -185,12 +178,10 @@ typedef blake_8way_big_context blake512_8way_context; void blake512_8way_init( blake_8way_big_context *sc ); void blake512_8way_update( void *cc, const void *data, size_t len ); void blake512_8way_close( void *cc, void *dst ); -void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ); +void blake512_8way_full( blake_8way_big_context *sc, void * dst, + const void *data, size_t len ); #endif // AVX512 - - #endif // AVX2 #ifdef __cplusplus diff --git a/algo/blake/blake2b-4way.c b/algo/blake/blake2b-4way.c index 9983cc3..a9ed0a7 100644 --- a/algo/blake/blake2b-4way.c +++ b/algo/blake/blake2b-4way.c @@ -39,7 +39,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce, blake2b_8way_final( &ctx, hash ); for ( int lane = 0; lane < 8; lane++ ) - if ( hash7[ lane<<1 ] < Htarg ) + if ( hash7[ lane<<1 ] <= Htarg ) { extr_lane_8x64( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) @@ -94,7 +94,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce, blake2b_4way_final( &ctx, hash ); for ( int lane = 0; lane < 4; lane++ ) - if ( hash7[ lane<<1 ] < Htarg ) + if ( hash7[ lane<<1 ] <= Htarg ) { extr_lane_4x64( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) diff --git a/algo/blake/blake2b.c b/algo/blake/blake2b.c index 1d8974f..63ee8eb 100644 --- a/algo/blake/blake2b.c +++ b/algo/blake/blake2b.c @@ -45,7 +45,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce, be32enc(&endiandata[19], n); blake2b_hash(vhashcpu, endiandata); - if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) + if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { pdata[19] = n; submit_solution( work, vhashcpu, mythr ); diff --git a/algo/blake/blake2s.c b/algo/blake/blake2s.c index ec5b46f..442859c 100644 --- a/algo/blake/blake2s.c +++ b/algo/blake/blake2s.c @@ -56,7 +56,7 @@ int scanhash_blake2s( struct work *work, do { be32enc(&endiandata[19], n); blake2s_hash( hash64, endiandata ); - if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { + if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) { *hashes_done = n - first_nonce + 1; pdata[19] = n; return true; diff --git a/algo/blake/blake512-hash-4way.c b/algo/blake/blake512-hash-4way.c index d64fd9b..a5d5394 100644 --- a/algo/blake/blake512-hash-4way.c +++ b/algo/blake/blake512-hash-4way.c @@ -267,22 +267,22 @@ static const sph_u64 CB[16] = { #define CBx_(n) CBx__(n) #define CBx__(n) CB ## n -#define CB0 SPH_C64(0x243F6A8885A308D3) -#define CB1 
SPH_C64(0x13198A2E03707344) -#define CB2 SPH_C64(0xA4093822299F31D0) -#define CB3 SPH_C64(0x082EFA98EC4E6C89) -#define CB4 SPH_C64(0x452821E638D01377) -#define CB5 SPH_C64(0xBE5466CF34E90C6C) -#define CB6 SPH_C64(0xC0AC29B7C97C50DD) -#define CB7 SPH_C64(0x3F84D5B5B5470917) -#define CB8 SPH_C64(0x9216D5D98979FB1B) -#define CB9 SPH_C64(0xD1310BA698DFB5AC) -#define CBA SPH_C64(0x2FFD72DBD01ADFB7) -#define CBB SPH_C64(0xB8E1AFED6A267E96) -#define CBC SPH_C64(0xBA7C9045F12C7F99) -#define CBD SPH_C64(0x24A19947B3916CF7) -#define CBE SPH_C64(0x0801F2E2858EFC16) -#define CBF SPH_C64(0x636920D871574E69) +#define CB0 0x243F6A8885A308D3 +#define CB1 0x13198A2E03707344 +#define CB2 0xA4093822299F31D0 +#define CB3 0x082EFA98EC4E6C89 +#define CB4 0x452821E638D01377 +#define CB5 0xBE5466CF34E90C6C +#define CB6 0xC0AC29B7C97C50DD +#define CB7 0x3F84D5B5B5470917 +#define CB8 0x9216D5D98979FB1B +#define CB9 0xD1310BA698DFB5AC +#define CBA 0x2FFD72DBD01ADFB7 +#define CBB 0xB8E1AFED6A267E96 +#define CBC 0xBA7C9045F12C7F99 +#define CBD 0x24A19947B3916CF7 +#define CBE 0x0801F2E2858EFC16 +#define CBF 0x636920D871574E69 #define READ_STATE64(state) do { \ H0 = (state)->H[0]; \ @@ -349,9 +349,9 @@ static const sph_u64 CB[16] = { #define DECL_STATE64_8WAY \ __m512i H0, H1, H2, H3, H4, H5, H6, H7; \ __m512i S0, S1, S2, S3; \ - sph_u64 T0, T1; + uint64_t T0, T1; -#define COMPRESS64_8WAY do \ +#define COMPRESS64_8WAY( buf ) do \ { \ __m512i M0, M1, M2, M3, M4, M5, M6, M7; \ __m512i M8, M9, MA, MB, MC, MD, ME, MF; \ @@ -424,6 +424,84 @@ static const sph_u64 CB[16] = { H7 = mm512_xor4( VF, V7, S3, H7 ); \ } while (0) +void blake512_8way_compress( blake_8way_big_context *sc ) +{ + __m512i M0, M1, M2, M3, M4, M5, M6, M7; + __m512i M8, M9, MA, MB, MC, MD, ME, MF; + __m512i V0, V1, V2, V3, V4, V5, V6, V7; + __m512i V8, V9, VA, VB, VC, VD, VE, VF; + __m512i shuf_bswap64; + + V0 = sc->H[0]; + V1 = sc->H[1]; + V2 = sc->H[2]; + V3 = sc->H[3]; + V4 = sc->H[4]; + V5 = sc->H[5]; + V6 = sc->H[6]; + V7 = sc->H[7]; + V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) ); + V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) ); + VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) ); + VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) ); + VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ), + m512_const1_64( CB4 ) ); + VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ), + m512_const1_64( CB5 ) ); + VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ), + m512_const1_64( CB6 ) ); + VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ), + m512_const1_64( CB7 ) ); + + shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, + 0x28292a2b2c2d2e2f, 0x2021222324252627, + 0x18191a1b1c1d1e1f, 0x1011121314151617, + 0x08090a0b0c0d0e0f, 0x0001020304050607 ); + + M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 ); + M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 ); + M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 ); + M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 ); + M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 ); + M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 ); + M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 ); + M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 ); + M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 ); + M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 ); + MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 ); + MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 ); + MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 ); + MD = _mm512_shuffle_epi8( sc->buf[13], 
shuf_bswap64 ); + ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 ); + MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 ); + + ROUND_B_8WAY(0); + ROUND_B_8WAY(1); + ROUND_B_8WAY(2); + ROUND_B_8WAY(3); + ROUND_B_8WAY(4); + ROUND_B_8WAY(5); + ROUND_B_8WAY(6); + ROUND_B_8WAY(7); + ROUND_B_8WAY(8); + ROUND_B_8WAY(9); + ROUND_B_8WAY(0); + ROUND_B_8WAY(1); + ROUND_B_8WAY(2); + ROUND_B_8WAY(3); + ROUND_B_8WAY(4); + ROUND_B_8WAY(5); + + sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] ); + sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] ); + sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] ); + sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] ); + sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] ); + sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] ); + sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] ); + sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] ); +} + void blake512_8way_init( blake_8way_big_context *sc ) { __m512i zero = m512_zero; @@ -455,39 +533,43 @@ blake64_8way( blake_8way_big_context *sc, const void *data, size_t len ) const int buf_size = 128; // sizeof/8 +// 64, 80 bytes: 1st pass copy data. 2nd pass copy padding and compress. +// 128 bytes: 1st pass copy data, compress. 2nd pass copy padding, compress. + buf = sc->buf; ptr = sc->ptr; if ( len < (buf_size - ptr) ) { - memcpy_512( buf + (ptr>>3), vdata, len>>3 ); - ptr += len; - sc->ptr = ptr; - return; + memcpy_512( buf + (ptr>>3), vdata, len>>3 ); + ptr += len; + sc->ptr = ptr; + return; } READ_STATE64(sc); while ( len > 0 ) { - size_t clen; + size_t clen; - clen = buf_size - ptr; - if ( clen > len ) + clen = buf_size - ptr; + if ( clen > len ) clen = len; - memcpy_512( buf + (ptr>>3), vdata, clen>>3 ); - ptr += clen; - vdata = vdata + (clen>>3); - len -= clen; - if ( ptr == buf_size ) - { - if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 ) - T1 = SPH_T64(T1 + 1); - COMPRESS64_8WAY; - ptr = 0; - } + memcpy_512( buf + (ptr>>3), vdata, clen>>3 ); + ptr += clen; + vdata = vdata + (clen>>3); + len -= clen; + if ( ptr == buf_size ) + { + if ( ( T0 = T0 + 1024 ) < 1024 ) + T1 = T1 + 1; + COMPRESS64_8WAY( buf ); + ptr = 0; + } } WRITE_STATE64(sc); sc->ptr = ptr; -} + + } static void blake64_8way_close( blake_8way_big_context *sc, void *dst ) @@ -495,26 +577,22 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst ) __m512i buf[16]; size_t ptr; unsigned bit_len; -// uint64_t z, zz; - sph_u64 th, tl; + uint64_t th, tl; ptr = sc->ptr; bit_len = ((unsigned)ptr << 3); -// z = 0x80 >> n; -// zz = ((ub & -z) | z) & 0xFF; -// buf[ptr>>3] = _mm512_set1_epi64( zz ); buf[ptr>>3] = m512_const1_64( 0x80 ); tl = sc->T0 + bit_len; th = sc->T1; if (ptr == 0 ) { - sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL); - sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL); + sc->T0 = 0xFFFFFFFFFFFFFC00ULL; + sc->T1 = 0xFFFFFFFFFFFFFFFFULL; } else if ( sc->T0 == 0 ) { - sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len; - sc->T1 = SPH_T64(sc->T1 - 1); + sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len; + sc->T1 = sc->T1 - 1; } else { @@ -535,8 +613,8 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst ) memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 ); blake64_8way( sc, buf + (ptr>>3), 128 - ptr ); - sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL); - sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL); + sc->T0 = 0xFFFFFFFFFFFFFC00ULL; + sc->T1 = 0xFFFFFFFFFFFFFFFFULL; memset_zero_512( buf, 112>>3 ); buf[104>>3] = m512_const1_64( 0x0100000000000000ULL ); buf[112>>3] = m512_const1_64( bswap_64( th ) ); @@ -547,6 +625,79 @@ blake64_8way_close( blake_8way_big_context *sc, void 
*dst ) mm512_block_bswap_64( (__m512i*)dst, sc->H ); } +// init, update & close +void blake512_8way_full( blake_8way_big_context *sc, void * dst, + const void *data, size_t len ) +{ + +// init + + casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 ); + casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B ); + casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B ); + casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 ); + casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 ); + casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F ); + casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B ); + casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 ); + + casti_m512i( sc->S, 0 ) = m512_zero; + casti_m512i( sc->S, 1 ) = m512_zero; + casti_m512i( sc->S, 2 ) = m512_zero; + casti_m512i( sc->S, 3 ) = m512_zero; + + sc->T0 = sc->T1 = 0; + sc->ptr = 0; + +// update + + memcpy_512( sc->buf, (__m512i*)data, len>>3 ); + sc->ptr = len; + if ( len == 128 ) + { + if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) + sc->T1 = sc->T1 + 1; + blake512_8way_compress( sc ); + sc->ptr = 0; + } + +// close + + size_t ptr64 = sc->ptr >> 3; + unsigned bit_len; + uint64_t th, tl; + + bit_len = sc->ptr << 3; + sc->buf[ptr64] = m512_const1_64( 0x80 ); + tl = sc->T0 + bit_len; + th = sc->T1; + + if ( ptr64 == 0 ) + { + sc->T0 = 0xFFFFFFFFFFFFFC00ULL; + sc->T1 = 0xFFFFFFFFFFFFFFFFULL; + } + else if ( sc->T0 == 0 ) + { + sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len; + sc->T1 = sc->T1 - 1; + } + else + sc->T0 -= 1024 - bit_len; + + memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 ); + sc->buf[13] = m512_const1_64( 0x0100000000000000ULL ); + sc->buf[14] = m512_const1_64( bswap_64( th ) ); + sc->buf[15] = m512_const1_64( bswap_64( tl ) ); + + if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) + sc->T1 = sc->T1 + 1; + + blake512_8way_compress( sc ); + + mm512_block_bswap_64( (__m512i*)dst, sc->H ); +} + void blake512_8way_update(void *cc, const void *data, size_t len) { @@ -555,12 +706,6 @@ blake512_8way_update(void *cc, const void *data, size_t len) void blake512_8way_close(void *cc, void *dst) -{ - blake512_8way_addbits_and_close(cc, 0, 0, dst); -} - -void -blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) { blake64_8way_close(cc, dst); } @@ -596,7 +741,7 @@ blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #define DECL_STATE64_4WAY \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ __m256i S0, S1, S2, S3; \ - sph_u64 T0, T1; + uint64_t T0, T1; #define COMPRESS64_4WAY do \ { \ @@ -670,6 +815,81 @@ blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) } while (0) +void blake512_4way_compress( blake_4way_big_context *sc ) +{ + __m256i M0, M1, M2, M3, M4, M5, M6, M7; + __m256i M8, M9, MA, MB, MC, MD, ME, MF; + __m256i V0, V1, V2, V3, V4, V5, V6, V7; + __m256i V8, V9, VA, VB, VC, VD, VE, VF; + __m256i shuf_bswap64; + + V0 = sc->H[0]; + V1 = sc->H[1]; + V2 = sc->H[2]; + V3 = sc->H[3]; + V4 = sc->H[4]; + V5 = sc->H[5]; + V6 = sc->H[6]; + V7 = sc->H[7]; + V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) ); + V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) ); + VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) ); + VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) ); + VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ), + m256_const1_64( CB4 ) ); + VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ), + m256_const1_64( CB5 ) ); + VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ), + 
m256_const1_64( CB6 ) ); + VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ), + m256_const1_64( CB7 ) ); + shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, + 0x08090a0b0c0d0e0f, 0x0001020304050607 ); + + M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 ); + M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 ); + M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 ); + M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 ); + M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 ); + M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 ); + M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 ); + M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 ); + M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 ); + M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 ); + MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 ); + MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 ); + MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 ); + MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 ); + ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 ); + MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 ); + + ROUND_B_4WAY(0); + ROUND_B_4WAY(1); + ROUND_B_4WAY(2); + ROUND_B_4WAY(3); + ROUND_B_4WAY(4); + ROUND_B_4WAY(5); + ROUND_B_4WAY(6); + ROUND_B_4WAY(7); + ROUND_B_4WAY(8); + ROUND_B_4WAY(9); + ROUND_B_4WAY(0); + ROUND_B_4WAY(1); + ROUND_B_4WAY(2); + ROUND_B_4WAY(3); + ROUND_B_4WAY(4); + ROUND_B_4WAY(5); + + sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] ); + sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] ); + sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] ); + sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] ); + sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] ); + sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] ); + sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] ); + sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] ); +} + void blake512_4way_init( blake_4way_big_context *sc ) { __m256i zero = m256_zero; @@ -681,10 +901,12 @@ void blake512_4way_init( blake_4way_big_context *sc ) casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F ); casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B ); casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 ); + casti_m256i( sc->S, 0 ) = zero; casti_m256i( sc->S, 1 ) = zero; casti_m256i( sc->S, 2 ) = zero; casti_m256i( sc->S, 3 ) = zero; + sc->T0 = sc->T1 = 0; sc->ptr = 0; } @@ -703,31 +925,31 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len) ptr = sc->ptr; if ( len < (buf_size - ptr) ) { - memcpy_256( buf + (ptr>>3), vdata, len>>3 ); - ptr += len; - sc->ptr = ptr; - return; + memcpy_256( buf + (ptr>>3), vdata, len>>3 ); + ptr += len; + sc->ptr = ptr; + return; } READ_STATE64(sc); while ( len > 0 ) { - size_t clen; + size_t clen; - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( buf + (ptr>>3), vdata, clen>>3 ); - ptr += clen; - vdata = vdata + (clen>>3); - len -= clen; - if (ptr == buf_size ) - { - if ((T0 = SPH_T64(T0 + 1024)) < 1024) - T1 = SPH_T64(T1 + 1); - COMPRESS64_4WAY; - ptr = 0; - } + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_256( buf + (ptr>>3), vdata, clen>>3 ); + ptr += clen; + vdata = vdata + (clen>>3); + len -= clen; + if ( ptr == buf_size ) + { + if ( (T0 = T0 + 1024 ) < 1024 ) + T1 = SPH_T64(T1 + 1); + COMPRESS64_4WAY; + ptr = 0; + } } WRITE_STATE64(sc); sc->ptr = ptr; @@ -739,7 +961,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst ) __m256i buf[16]; size_t ptr; unsigned bit_len; - sph_u64 th, tl; 
+ uint64_t th, tl; ptr = sc->ptr; bit_len = ((unsigned)ptr << 3); @@ -748,13 +970,13 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst ) th = sc->T1; if (ptr == 0 ) { - sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL); - sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL); + sc->T0 = 0xFFFFFFFFFFFFFC00ULL; + sc->T1 = 0xFFFFFFFFFFFFFFFFULL; } else if ( sc->T0 == 0 ) { - sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len; - sc->T1 = SPH_T64(sc->T1 - 1); + sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len; + sc->T1 = sc->T1 - 1; } else { @@ -788,13 +1010,77 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst ) mm256_block_bswap_64( (__m256i*)dst, sc->H ); } -/* -void -blake512_4way_init(void *cc) +// init, update & close +void blake512_4way_full( blake_4way_big_context *sc, void * dst, + const void *data, size_t len ) { - blake64_4way_init(cc, IV512, salt_zero_big); + +// init + + casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 ); + casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B ); + casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B ); + casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 ); + casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 ); + casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F ); + casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B ); + casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 ); + + casti_m256i( sc->S, 0 ) = m256_zero; + casti_m256i( sc->S, 1 ) = m256_zero; + casti_m256i( sc->S, 2 ) = m256_zero; + casti_m256i( sc->S, 3 ) = m256_zero; + + sc->T0 = sc->T1 = 0; + sc->ptr = 0; + +// update + + memcpy_256( sc->buf, (__m256i*)data, len>>3 ); + sc->ptr += len; + if ( len == 128 ) + { + if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) + sc->T1 = sc->T1 + 1; + blake512_4way_compress( sc ); + sc->ptr = 0; + } + +// close + + size_t ptr64 = sc->ptr >> 3; + unsigned bit_len; + uint64_t th, tl; + + bit_len = sc->ptr << 3; + sc->buf[ptr64] = m256_const1_64( 0x80 ); + tl = sc->T0 + bit_len; + th = sc->T1; + if ( sc->ptr == 0 ) + { + sc->T0 = 0xFFFFFFFFFFFFFC00ULL; + sc->T1 = 0xFFFFFFFFFFFFFFFFULL; + } + else if ( sc->T0 == 0 ) + { + sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len; + sc->T1 = sc->T1 - 1; + } + else + sc->T0 -= 1024 - bit_len; + + memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 ); + sc->buf[13] = m256_const1_64( 0x0100000000000000ULL ); + sc->buf[14] = m256_const1_64( bswap_64( th ) ); + sc->buf[15] = m256_const1_64( bswap_64( tl ) ); + + if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) + sc->T1 = sc->T1 + 1; + + blake512_4way_compress( sc ); + + mm256_block_bswap_64( (__m256i*)dst, sc->H ); } -*/ void blake512_4way_update(void *cc, const void *data, size_t len) @@ -806,17 +1092,8 @@ void blake512_4way_close(void *cc, void *dst) { blake64_4way_close( cc, dst ); - -// blake512_4way_addbits_and_close(cc, dst); } -/* -void -blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - blake64_4way_close(cc, ub, n, dst, 8); -} -*/ #ifdef __cplusplus } #endif diff --git a/algo/blake/blakecoin-4way.c b/algo/blake/blakecoin-4way.c index 898cbe3..de0e073 100644 --- a/algo/blake/blakecoin-4way.c +++ b/algo/blake/blakecoin-4way.c @@ -14,7 +14,7 @@ void blakecoin_4way_hash(void *state, const void *input) blake256r8_4way_context ctx; memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx ); - blake256r8_4way( &ctx, input + (64<<2), 16 ); + blake256r8_4way_update( &ctx, input + (64<<2), 16 ); blake256r8_4way_close( &ctx, vhash ); dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 ); @@ -37,7 +37,7 
@@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce, mm128_bswap32_intrlv80_4x32( vdata, pdata ); blake256r8_4way_init( &blakecoin_4w_ctx ); - blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 ); + blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 ); do { *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); @@ -71,7 +71,7 @@ void blakecoin_8way_hash( void *state, const void *input ) blake256r8_8way_context ctx; memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx ); - blake256r8_8way( &ctx, input + (64<<3), 16 ); + blake256r8_8way_update( &ctx, input + (64<<3), 16 ); blake256r8_8way_close( &ctx, vhash ); dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128, @@ -95,7 +95,7 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce, mm256_bswap32_intrlv80_8x32( vdata, pdata ); blake256r8_8way_init( &blakecoin_8w_ctx ); - blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 ); + blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 ); do { *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, diff --git a/algo/blake/decred-4way.c b/algo/blake/decred-4way.c index 1cbf157..27d9c18 100644 --- a/algo/blake/decred-4way.c +++ b/algo/blake/decred-4way.c @@ -21,7 +21,7 @@ void decred_hash_4way( void *state, const void *input ) blake256_4way_context ctx __attribute__ ((aligned (64))); memcpy( &ctx, &blake_mid, sizeof(blake_mid) ); - blake256_4way( &ctx, tail, tail_len ); + blake256_4way_update( &ctx, tail, tail_len ); blake256_4way_close( &ctx, vhash ); dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 ); } @@ -46,7 +46,7 @@ int scanhash_decred_4way( struct work *work, uint32_t max_nonce, mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 ); blake256_4way_init( &blake_mid ); - blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN ); + blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN ); uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4; do { diff --git a/algo/blake/pentablake-4way.c b/algo/blake/pentablake-4way.c index 1b45afa..5683174 100644 --- a/algo/blake/pentablake-4way.c +++ b/algo/blake/pentablake-4way.c @@ -22,23 +22,23 @@ extern void pentablakehash_4way( void *output, const void *input ) blake512_4way_init( &ctx ); - blake512_4way( &ctx, input, 80 ); + blake512_4way_update( &ctx, input, 80 ); blake512_4way_close( &ctx, vhash ); blake512_4way_init( &ctx ); - blake512_4way( &ctx, vhash, 64 ); + blake512_4way_update( &ctx, vhash, 64 ); blake512_4way_close( &ctx, vhash ); blake512_4way_init( &ctx ); - blake512_4way( &ctx, vhash, 64 ); + blake512_4way_update( &ctx, vhash, 64 ); blake512_4way_close( &ctx, vhash ); blake512_4way_init( &ctx ); - blake512_4way( &ctx, vhash, 64 ); + blake512_4way_update( &ctx, vhash, 64 ); blake512_4way_close( &ctx, vhash ); blake512_4way_init( &ctx ); - blake512_4way( &ctx, vhash, 64 ); + blake512_4way_update( &ctx, vhash, 64 ); blake512_4way_close( &ctx, vhash ); memcpy( output, hash0, 32 ); diff --git a/algo/bmw/bmw512-4way.c b/algo/bmw/bmw512-4way.c index 795be11..ff59611 100644 --- a/algo/bmw/bmw512-4way.c +++ b/algo/bmw/bmw512-4way.c @@ -40,7 +40,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce, bmw512hash_8way( hash, vdata ); for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( hash7[ lane<<1 ] < Htarg ) ) + if ( unlikely( hash7[ lane<<1 ] <= Htarg ) ) { extr_lane_8x64( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) ) @@ -93,8 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce, bmw512hash_4way( hash, vdata ); for ( int lane = 
0; lane < 4; lane++ ) - if ( unlikely( hash7[ lane<<1 ] < Htarg ) ) -// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) ) + if ( unlikely( hash7[ lane<<1 ] <= Htarg ) ) { extr_lane_4x64( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) ) diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index 5a4af53..9a9dfc8 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -168,6 +168,66 @@ int cube_4way_close( cube_4way_context *sp, void *output ) return 0; } +int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, + const void *data, size_t size ) +{ + __m512i *h = (__m512i*)sp->h; + __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512 + : (__m128i*)IV256 ); + sp->hashlen = hashbitlen/128; + sp->blocksize = 32/16; + sp->rounds = 16; + sp->pos = 0; + + h[ 0] = m512_const1_128( iv[0] ); + h[ 1] = m512_const1_128( iv[1] ); + h[ 2] = m512_const1_128( iv[2] ); + h[ 3] = m512_const1_128( iv[3] ); + h[ 4] = m512_const1_128( iv[4] ); + h[ 5] = m512_const1_128( iv[5] ); + h[ 6] = m512_const1_128( iv[6] ); + h[ 7] = m512_const1_128( iv[7] ); + h[ 0] = m512_const1_128( iv[0] ); + h[ 1] = m512_const1_128( iv[1] ); + h[ 2] = m512_const1_128( iv[2] ); + h[ 3] = m512_const1_128( iv[3] ); + h[ 4] = m512_const1_128( iv[4] ); + h[ 5] = m512_const1_128( iv[5] ); + h[ 6] = m512_const1_128( iv[6] ); + h[ 7] = m512_const1_128( iv[7] ); + + const int len = size >> 4; + const __m512i *in = (__m512i*)data; + __m512i *hash = (__m512i*)output; + int i; + + for ( i = 0; i < len; i++ ) + { + sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] ); + sp->pos++; + if ( sp->pos == sp->blocksize ) + { + transform_4way( sp ); + sp->pos = 0; + } + } + + // pos is zero for 64 byte data, 1 for 80 byte data. + sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], + m512_const2_64( 0, 0x0000000000000080 ) ); + transform_4way( sp ); + + sp->h[7] = _mm512_xor_si512( sp->h[7], + m512_const2_64( 0x0000000100000000, 0 ) ); + + for ( i = 0; i < 10; ++i ) + transform_4way( sp ); + + memcpy( hash, sp->h, sp->hashlen<<6); + return 0; +} + + int cube_4way_update_close( cube_4way_context *sp, void *output, const void *data, size_t size ) { @@ -376,4 +436,62 @@ int cube_2way_update_close( cube_2way_context *sp, void *output, return 0; } +int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen, + const void *data, size_t size ) +{ + __m256i *h = (__m256i*)sp->h; + __m128i *iv = (__m128i*)( hashbitlen == 512 ? 
(__m128i*)IV512 + : (__m128i*)IV256 ); + sp->hashlen = hashbitlen/128; + sp->blocksize = 32/16; + sp->rounds = 16; + sp->pos = 0; + + h[ 0] = m256_const1_128( iv[0] ); + h[ 1] = m256_const1_128( iv[1] ); + h[ 2] = m256_const1_128( iv[2] ); + h[ 3] = m256_const1_128( iv[3] ); + h[ 4] = m256_const1_128( iv[4] ); + h[ 5] = m256_const1_128( iv[5] ); + h[ 6] = m256_const1_128( iv[6] ); + h[ 7] = m256_const1_128( iv[7] ); + h[ 0] = m256_const1_128( iv[0] ); + h[ 1] = m256_const1_128( iv[1] ); + h[ 2] = m256_const1_128( iv[2] ); + h[ 3] = m256_const1_128( iv[3] ); + h[ 4] = m256_const1_128( iv[4] ); + h[ 5] = m256_const1_128( iv[5] ); + h[ 6] = m256_const1_128( iv[6] ); + h[ 7] = m256_const1_128( iv[7] ); + + const int len = size >> 4; + const __m256i *in = (__m256i*)data; + __m256i *hash = (__m256i*)output; + int i; + + for ( i = 0; i < len; i++ ) + { + sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] ); + sp->pos++; + if ( sp->pos == sp->blocksize ) + { + transform_2way( sp ); + sp->pos = 0; + } + } + + // pos is zero for 64 byte data, 1 for 80 byte data. + sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], + m256_const2_64( 0, 0x0000000000000080 ) ); + transform_2way( sp ); + + sp->h[7] = _mm256_xor_si256( sp->h[7], + m256_const2_64( 0x0000000100000000, 0 ) ); + + for ( i = 0; i < 10; ++i ) transform_2way( sp ); + + memcpy( hash, sp->h, sp->hashlen<<5 ); + return 0; +} + #endif diff --git a/algo/cubehash/cube-hash-2way.h b/algo/cubehash/cube-hash-2way.h index d99d926..eddd813 100644 --- a/algo/cubehash/cube-hash-2way.h +++ b/algo/cubehash/cube-hash-2way.h @@ -21,15 +21,12 @@ typedef struct _cube_4way_context cube_4way_context; int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds, int blockbytes ); -// reinitialize context with same parameters, much faster. -int cube_4way_reinit( cube_4way_context *sp ); - int cube_4way_update( cube_4way_context *sp, const void *data, size_t size ); - int cube_4way_close( cube_4way_context *sp, void *output ); - int cube_4way_update_close( cube_4way_context *sp, void *output, const void *data, size_t size ); +int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen, + const void *data, size_t size ); #endif @@ -48,15 +45,12 @@ typedef struct _cube_2way_context cube_2way_context; int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds, int blockbytes ); -// reinitialize context with same parameters, much faster. 
-int cube_2way_reinit( cube_2way_context *sp ); - int cube_2way_update( cube_2way_context *sp, const void *data, size_t size ); - int cube_2way_close( cube_2way_context *sp, void *output ); - int cube_2way_update_close( cube_2way_context *sp, void *output, const void *data, size_t size ); +int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen, + const void *data, size_t size ); #endif diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index 41b5c20..55b27c6 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -7,7 +7,6 @@ * - implements NIST hash api * - assumes that message lenght is multiple of 8-bits * - _ECHO_VPERM_ must be defined if compiling with ../main.c - * - define NO_AES_NI for aes_ni version * * Cagdas Calik * ccalik@metu.edu.tr @@ -21,13 +20,7 @@ #include "hash_api.h" //#include "vperm.h" #include -/* -#ifndef NO_AES_NI -#include -#else -#include -#endif -*/ +#include "simd-utils.h" MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F}; MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC}; @@ -525,6 +518,165 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, return SUCCESS; } +HashReturn echo_full( hashState_echo *state, BitSequence *hashval, + int nHashSize, const BitSequence *data, DataLength datalen ) +{ + int i, j; + + state->k = m128_zero; + state->processed_bits = 0; + state->uBufferBytes = 0; + + switch( nHashSize ) + { + case 256: + state->uHashSize = 256; + state->uBlockLength = 192; + state->uRounds = 8; + state->hashsize = m128_const_64( 0, 0x100 ); + state->const1536 = m128_const_64( 0, 0x600 ); + break; + + case 512: + state->uHashSize = 512; + state->uBlockLength = 128; + state->uRounds = 10; + state->hashsize = m128_const_64( 0, 0x200 ); + state->const1536 = m128_const_64( 0, 0x400 ); + break; + + default: + return BAD_HASHBITLEN; + } + + for(i = 0; i < 4; i++) + for(j = 0; j < nHashSize / 256; j++) + state->state[i][j] = state->hashsize; + + for(i = 0; i < 4; i++) + for(j = nHashSize / 256; j < 4; j++) + state->state[i][j] = m128_zero; + + + unsigned int uBlockCount, uRemainingBytes; + + if( (state->uBufferBytes + datalen) >= state->uBlockLength ) + { + if( state->uBufferBytes != 0 ) + { + // Fill the buffer + memcpy( state->buffer + state->uBufferBytes, + (void*)data, state->uBlockLength - state->uBufferBytes ); + + // Process buffer + Compress( state, state->buffer, 1 ); + state->processed_bits += state->uBlockLength * 8; + + data += state->uBlockLength - state->uBufferBytes; + datalen -= state->uBlockLength - state->uBufferBytes; + } + + // buffer now does not contain any unprocessed bytes + + uBlockCount = datalen / state->uBlockLength; + uRemainingBytes = datalen % state->uBlockLength; + + if( uBlockCount > 0 ) + { + Compress( state, data, uBlockCount ); + state->processed_bits += uBlockCount * state->uBlockLength * 8; + data += uBlockCount * state->uBlockLength; + } + + if( uRemainingBytes > 0 ) + memcpy(state->buffer, (void*)data, uRemainingBytes); + + state->uBufferBytes = uRemainingBytes; + } + else + { + memcpy( state->buffer + state->uBufferBytes, (void*)data, datalen ); + state->uBufferBytes += datalen; + } + + __m128i remainingbits; + + // Add remaining bytes in the buffer + state->processed_bits += state->uBufferBytes * 8; + + remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 ); + + // Pad with 0x80 + state->buffer[state->uBufferBytes++] = 0x80; + 
// Enough buffer space for padding in this block? + if( (state->uBlockLength - state->uBufferBytes) >= 18 ) + { + // Pad with zeros + memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) ); + + // Hash size + *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize; + + // Processed bits + *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = + state->processed_bits; + *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; + + // Last block contains message bits? + if( state->uBufferBytes == 1 ) + { + state->k = _mm_xor_si128( state->k, state->k ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + } + else + { + state->k = _mm_add_epi64( state->k, remainingbits ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + } + + // Compress + Compress( state, state->buffer, 1 ); + } + else + { + // Fill with zero and compress + memset( state->buffer + state->uBufferBytes, 0, + state->uBlockLength - state->uBufferBytes ); + state->k = _mm_add_epi64( state->k, remainingbits ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + Compress( state, state->buffer, 1 ); + + // Last block + memset( state->buffer, 0, state->uBlockLength - 18 ); + + // Hash size + *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = + state->uHashSize; + + // Processed bits + *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = + state->processed_bits; + *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; + // Compress the last block + state->k = _mm_xor_si128( state->k, state->k ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + Compress( state, state->buffer, 1) ; + } + + // Store the hash value + _mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] ); + _mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] ); + + if( state->uHashSize == 512 ) + { + _mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] ); + _mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] ); + + } + return SUCCESS; +} + + HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval) { diff --git a/algo/echo/aes_ni/hash.c.test b/algo/echo/aes_ni/hash.c.test deleted file mode 100644 index 41d5a57..0000000 --- a/algo/echo/aes_ni/hash.c.test +++ /dev/null @@ -1,620 +0,0 @@ -/* - * file : echo_vperm.c - * version : 1.0.208 - * date : 14.12.2010 - * - * - vperm and aes_ni implementations of hash function ECHO - * - implements NIST hash api - * - assumes that message lenght is multiple of 8-bits - * - _ECHO_VPERM_ must be defined if compiling with ../main.c - * - define NO_AES_NI for aes_ni version - * - * Cagdas Calik - * ccalik@metu.edu.tr - * Institute of Applied Mathematics, Middle East Technical University, Turkey. 
- * - */ -#if defined(__AES__) - -#include -#include "miner.h" -#include "hash_api.h" -//#include "vperm.h" -#include -/* -#ifndef NO_AES_NI -#include -#else -#include -#endif -*/ - -MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F}; -MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC}; -MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1}; -MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C}; -MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1}; -MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8}; -MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09}; -MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79}; -MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8}; -MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170}; -MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1}; -MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363}; -MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6}; -MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b}; -MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e}; -MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e}; -MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515}; -MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c}; -MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601}; -MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06}; -MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b}; - - -MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000}; -MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000}; -MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101}; -MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c}; -MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000}; -MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234}; - - -#define ECHO_SUBBYTES(state, i, j) \ - state[i][j] = _mm_aesenc_si128(state[i][j], k1);\ - state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\ - k1 = _mm_add_epi32(k1, M128(const1)) - -#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \ - s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\ - t1 = _mm_srli_epi16(state1[0][j], 7);\ - t1 = _mm_and_si128(t1, M128(lsbmask));\ - t2 = 
_mm_shuffle_epi8(M128(mul2mask), t1);\ - s2 = _mm_xor_si128(s2, t2);\ - state2[0][j] = s2;\ - state2[1][j] = state1[0][j];\ - state2[2][j] = state1[0][j];\ - state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\ - s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\ - t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\ - t1 = _mm_and_si128(t1, M128(lsbmask));\ - t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ - s2 = _mm_xor_si128(s2, t2);\ - state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\ - state2[1][j] = _mm_xor_si128(state2[1][j], s2);\ - state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\ - state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\ - s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\ - t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\ - t1 = _mm_and_si128(t1, M128(lsbmask));\ - t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ - s2 = _mm_xor_si128(s2, t2);\ - state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\ - state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\ - state2[2][j] = _mm_xor_si128(state2[2][j], s2);\ - state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\ - s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\ - t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\ - t1 = _mm_and_si128(t1, M128(lsbmask));\ - t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ - s2 = _mm_xor_si128(s2, t2);\ - state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\ - state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\ - state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\ - state2[3][j] = _mm_xor_si128(state2[3][j], s2) - - -#define ECHO_ROUND_UNROLL2 \ - ECHO_SUBBYTES(_state, 0, 0);\ - ECHO_SUBBYTES(_state, 1, 0);\ - ECHO_SUBBYTES(_state, 2, 0);\ - ECHO_SUBBYTES(_state, 3, 0);\ - ECHO_SUBBYTES(_state, 0, 1);\ - ECHO_SUBBYTES(_state, 1, 1);\ - ECHO_SUBBYTES(_state, 2, 1);\ - ECHO_SUBBYTES(_state, 3, 1);\ - ECHO_SUBBYTES(_state, 0, 2);\ - ECHO_SUBBYTES(_state, 1, 2);\ - ECHO_SUBBYTES(_state, 2, 2);\ - ECHO_SUBBYTES(_state, 3, 2);\ - ECHO_SUBBYTES(_state, 0, 3);\ - ECHO_SUBBYTES(_state, 1, 3);\ - ECHO_SUBBYTES(_state, 2, 3);\ - ECHO_SUBBYTES(_state, 3, 3);\ - ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ - ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ - ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ - ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ - ECHO_SUBBYTES(_state2, 0, 0);\ - ECHO_SUBBYTES(_state2, 1, 0);\ - ECHO_SUBBYTES(_state2, 2, 0);\ - ECHO_SUBBYTES(_state2, 3, 0);\ - ECHO_SUBBYTES(_state2, 0, 1);\ - ECHO_SUBBYTES(_state2, 1, 1);\ - ECHO_SUBBYTES(_state2, 2, 1);\ - ECHO_SUBBYTES(_state2, 3, 1);\ - ECHO_SUBBYTES(_state2, 0, 2);\ - ECHO_SUBBYTES(_state2, 1, 2);\ - ECHO_SUBBYTES(_state2, 2, 2);\ - ECHO_SUBBYTES(_state2, 3, 2);\ - ECHO_SUBBYTES(_state2, 0, 3);\ - ECHO_SUBBYTES(_state2, 1, 3);\ - ECHO_SUBBYTES(_state2, 2, 3);\ - ECHO_SUBBYTES(_state2, 3, 3);\ - ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ - ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ - ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ - ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) - - - -#define SAVESTATE(dst, src)\ - dst[0][0] = src[0][0];\ - dst[0][1] = src[0][1];\ - dst[0][2] = src[0][2];\ - dst[0][3] = src[0][3];\ - dst[1][0] = src[1][0];\ - dst[1][1] = src[1][1];\ - dst[1][2] = src[1][2];\ - dst[1][3] = src[1][3];\ - dst[2][0] = src[2][0];\ - dst[2][1] = src[2][1];\ - dst[2][2] 
= src[2][2];\ - dst[2][3] = src[2][3];\ - dst[3][0] = src[3][0];\ - dst[3][1] = src[3][1];\ - dst[3][2] = src[3][2];\ - dst[3][3] = src[3][3] - - -void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount) -{ - unsigned int r, b, i, j; - __m128i t1, t2, s2, k1; - __m128i _state[4][4], _state2[4][4], _statebackup[4][4]; - - for(i = 0; i < 4; i++) - for(j = 0; j < ctx->uHashSize / 256; j++) - _state[i][j] = ctx->state[i][j]; - - for(b = 0; b < uBlockCount; b++) - { - ctx->k = _mm_add_epi64(ctx->k, ctx->const1536); - - // load message - for(j = ctx->uHashSize / 256; j < 4; j++) - { - for(i = 0; i < 4; i++) - { - _state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); - } - } - -uint64_t *b = (uint64_t*)_state; -//printf("Ss3: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); - - // save state - SAVESTATE(_statebackup, _state); - - k1 = ctx->k; - - for(r = 0; r < ctx->uRounds / 2; r++) - { - ECHO_ROUND_UNROLL2; - } - -//printf("Ss4: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); - - - if(ctx->uHashSize == 256) - { - for(i = 0; i < 4; i++) - { - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]); - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]); - } - } - else - { - for(i = 0; i < 4; i++) - { - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); - _state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); - _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]); - _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]); - } - } - pmsg += ctx->uBlockLength; - } - SAVESTATE(ctx->state, _state); - -} - - - -HashReturn init_echo(hashState_echo *ctx, int nHashSize) -{ - int i, j; - - ctx->k = _mm_setzero_si128(); - ctx->processed_bits = 0; - ctx->uBufferBytes = 0; - - switch(nHashSize) - { - case 256: - ctx->uHashSize = 256; - ctx->uBlockLength = 192; - ctx->uRounds = 8; - ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100); - ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600); - break; - - case 512: - ctx->uHashSize = 512; - ctx->uBlockLength = 128; - ctx->uRounds = 10; - ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200); - ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400); - break; - - default: - return BAD_HASHBITLEN; - } - - - for(i = 0; i < 4; i++) - for(j = 0; j < nHashSize / 256; j++) - ctx->state[i][j] = ctx->hashsize; - - for(i = 0; i < 4; i++) - for(j = nHashSize / 256; j < 4; j++) - ctx->state[i][j] = _mm_set_epi32(0, 0, 0, 0); - - return SUCCESS; -} - -HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen) -{ - unsigned int uByteLength, uBlockCount, uRemainingBytes; - - uByteLength = (unsigned int)(databitlen / 8); - - if((state->uBufferBytes + uByteLength) >= state->uBlockLength) - { - if(state->uBufferBytes != 0) - { - // Fill the buffer - memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes); - - // Process buffer - Compress(state, state->buffer, 1); - state->processed_bits += 
state->uBlockLength * 8; - - data += state->uBlockLength - state->uBufferBytes; - uByteLength -= state->uBlockLength - state->uBufferBytes; - } - - // buffer now does not contain any unprocessed bytes - - uBlockCount = uByteLength / state->uBlockLength; - uRemainingBytes = uByteLength % state->uBlockLength; - - if(uBlockCount > 0) - { - Compress(state, data, uBlockCount); - - state->processed_bits += uBlockCount * state->uBlockLength * 8; - data += uBlockCount * state->uBlockLength; - } - - if(uRemainingBytes > 0) - { - memcpy(state->buffer, (void*)data, uRemainingBytes); - } - - state->uBufferBytes = uRemainingBytes; - } - else - { - memcpy(state->buffer + state->uBufferBytes, (void*)data, uByteLength); - state->uBufferBytes += uByteLength; - } - - return SUCCESS; -} - -HashReturn final_echo(hashState_echo *state, BitSequence *hashval) -{ - __m128i remainingbits; - - // Add remaining bytes in the buffer - state->processed_bits += state->uBufferBytes * 8; - - remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8); - - // Pad with 0x80 - state->buffer[state->uBufferBytes++] = 0x80; - - // Enough buffer space for padding in this block? - if((state->uBlockLength - state->uBufferBytes) >= 18) - { - // Pad with zeros - memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18)); - - // Hash size - *((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize; - - // Processed bits - *((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits; - *((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0; - - // Last block contains message bits? - if(state->uBufferBytes == 1) - { - state->k = _mm_xor_si128(state->k, state->k); - state->k = _mm_sub_epi64(state->k, state->const1536); - } - else - { - state->k = _mm_add_epi64(state->k, remainingbits); - state->k = _mm_sub_epi64(state->k, state->const1536); - } - - // Compress - Compress(state, state->buffer, 1); - } - else - { - // Fill with zero and compress - memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes); - state->k = _mm_add_epi64(state->k, remainingbits); - state->k = _mm_sub_epi64(state->k, state->const1536); - Compress(state, state->buffer, 1); - - // Last block - memset(state->buffer, 0, state->uBlockLength - 18); - - // Hash size - *((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize; - - // Processed bits - *((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits; - *((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0; - - // Compress the last block - state->k = _mm_xor_si128(state->k, state->k); - state->k = _mm_sub_epi64(state->k, state->const1536); - Compress(state, state->buffer, 1); - } - - // Store the hash value - _mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]); - _mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]); - - if(state->uHashSize == 512) - { - _mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]); - _mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]); - } - - return SUCCESS; -} - -HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, - const BitSequence *data, DataLength databitlen ) -{ - unsigned int uByteLength, uBlockCount, uRemainingBytes; - - uByteLength = (unsigned int)(databitlen / 8); - -/* - if( (state->uBufferBytes + uByteLength) >= state->uBlockLength ) - { -printf("full block\n"); - if( state->uBufferBytes != 0 ) - { - // Fill the buffer - memcpy( state->buffer 
+ state->uBufferBytes, - (void*)data, state->uBlockLength - state->uBufferBytes ); - - // Process buffer - Compress( state, state->buffer, 1 ); - state->processed_bits += state->uBlockLength * 8; - - data += state->uBlockLength - state->uBufferBytes; - uByteLength -= state->uBlockLength - state->uBufferBytes; - } - - // buffer now does not contain any unprocessed bytes - - uBlockCount = uByteLength / state->uBlockLength; - uRemainingBytes = uByteLength % state->uBlockLength; - - if( uBlockCount > 0 ) - { - Compress( state, data, uBlockCount ); - state->processed_bits += uBlockCount * state->uBlockLength * 8; - data += uBlockCount * state->uBlockLength; - } - - if( uRemainingBytes > 0 ) - memcpy(state->buffer, (void*)data, uRemainingBytes); - - state->uBufferBytes = uRemainingBytes; - } - else - { -*/ - memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength ); - state->uBufferBytes += uByteLength; -// } - - __m128i remainingbits; - - // Add remaining bytes in the buffer - state->processed_bits += state->uBufferBytes * 8; - - remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 ); - - // Pad with 0x80 - state->buffer[state->uBufferBytes++] = 0x80; - - // Enough buffer space for padding in this block? - -// if( (state->uBlockLength - state->uBufferBytes) >= 18 ) -// { - // Pad with zeros - - memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) ); - - // Hash size - *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize; - - // Processed bits - *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = - state->processed_bits; - *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; - - - // Last block contains message bits? - if( state->uBufferBytes == 1 ) - { - state->k = _mm_xor_si128( state->k, state->k ); - state->k = _mm_sub_epi64( state->k, state->const1536 ); - } - else - { - state->k = _mm_add_epi64( state->k, remainingbits ); - state->k = _mm_sub_epi64( state->k, state->const1536 ); - } - -uint64_t *b = (uint64_t*)&state->k; -/* -printf("Sk: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); -b = (uint64_t*)state->buffer; -printf("Sb: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); -printf("Sb: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]); -printf("Sb: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]); -printf("Sb: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]); - -b = (uint64_t*)state->state; -printf("Ss1: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); -printf("Ss1: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]); -printf("Ss1: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]); -printf("Ss1: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]); -*/ - // Compress - Compress( state, state->buffer, 1 ); - -//printf("Ss2: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); - - -/* - } - else - { - // Fill with zero and compress - memset( state->buffer + state->uBufferBytes, 0, - state->uBlockLength - state->uBufferBytes ); - state->k = _mm_add_epi64( state->k, remainingbits ); - state->k = _mm_sub_epi64( state->k, state->const1536 ); - Compress( state, state->buffer, 1 ); - - // Last block - memset( state->buffer, 0, state->uBlockLength - 18 ); - - // Hash size - *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = - state->uHashSize; - - // Processed bits - *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = - state->processed_bits; - *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; - // 
Compress the last block - state->k = _mm_xor_si128( state->k, state->k ); - state->k = _mm_sub_epi64( state->k, state->const1536 ); - Compress( state, state->buffer, 1) ; - } -*/ - - // Store the hash value - _mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] ); - _mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] ); - - if( state->uHashSize == 512 ) - { - _mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] ); - _mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] ); - - } - return SUCCESS; -} - - -HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval) -{ - HashReturn hRet; - hashState_echo hs; - - ///// - /* - __m128i a, b, c, d, t[4], u[4], v[4]; - - a = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100); - b = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110); - c = _mm_set_epi32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120); - d = _mm_set_epi32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130); - - t[0] = _mm_unpacklo_epi8(a, b); - t[1] = _mm_unpackhi_epi8(a, b); - t[2] = _mm_unpacklo_epi8(c, d); - t[3] = _mm_unpackhi_epi8(c, d); - - u[0] = _mm_unpacklo_epi16(t[0], t[2]); - u[1] = _mm_unpackhi_epi16(t[0], t[2]); - u[2] = _mm_unpacklo_epi16(t[1], t[3]); - u[3] = _mm_unpackhi_epi16(t[1], t[3]); - - - t[0] = _mm_unpacklo_epi16(u[0], u[1]); - t[1] = _mm_unpackhi_epi16(u[0], u[1]); - t[2] = _mm_unpacklo_epi16(u[2], u[3]); - t[3] = _mm_unpackhi_epi16(u[2], u[3]); - - u[0] = _mm_unpacklo_epi8(t[0], t[1]); - u[1] = _mm_unpackhi_epi8(t[0], t[1]); - u[2] = _mm_unpacklo_epi8(t[2], t[3]); - u[3] = _mm_unpackhi_epi8(t[2], t[3]); - - a = _mm_unpacklo_epi8(u[0], u[1]); - b = _mm_unpackhi_epi8(u[0], u[1]); - c = _mm_unpacklo_epi8(u[2], u[3]); - d = _mm_unpackhi_epi8(u[2], u[3]); - */ - ///// - - hRet = init_echo(&hs, hashbitlen); - if(hRet != SUCCESS) - return hRet; - - hRet = update_echo(&hs, data, databitlen); - if(hRet != SUCCESS) - return hRet; - - hRet = final_echo(&hs, hashval); - if(hRet != SUCCESS) - return hRet; - - return SUCCESS; -} - -#endif diff --git a/algo/echo/aes_ni/hash_api.h b/algo/echo/aes_ni/hash_api.h index 01e5598..a550088 100644 --- a/algo/echo/aes_ni/hash_api.h +++ b/algo/echo/aes_ni/hash_api.h @@ -15,7 +15,7 @@ #ifndef HASH_API_H #define HASH_API_H -#ifndef NO_AES_NI +#ifdef __AES__ #define HASH_IMPL_STR "ECHO-aesni" #else #define HASH_IMPL_STR "ECHO-vperm" @@ -55,6 +55,8 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, const BitSequence *data, DataLength databitlen ); +HashReturn echo_full( hashState_echo *state, BitSequence *hashval, + int nHashSize, const BitSequence *data, DataLength databitlen ); #endif // HASH_API_H diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index 455c58d..57c0a94 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -313,4 +313,92 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval, return 0; } +int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, + const void *data, int datalen ) +{ + int i, j; + int databitlen = datalen * 8; + ctx->k = m512_zero; + ctx->processed_bits = 0; + ctx->uBufferBytes = 0; + + switch( nHashSize ) + { + case 256: + ctx->uHashSize = 256; + ctx->uBlockLength = 192; + ctx->uRounds = 8; + ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 ); + ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 ); + break; + + case 512: + 
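+ // ECHO-512: the 2048 bit ECHO state reserves 1024 bits for chaining,
+ // leaving a 1024 bit (128 byte) message block and using 10 rounds,
+ // vs the 192 byte block and 8 rounds of ECHO-256 above. hashsize and
+ // const1536 keep the digest and block bit lengths (0x200, 0x400) as
+ // per-lane vector constants for the counter arithmetic below.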
ctx->uHashSize = 512; + ctx->uBlockLength = 128; + ctx->uRounds = 10; + ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 ); + ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400); + break; + + default: + return 1; + } + + for( i = 0; i < 4; i++ ) + for( j = 0; j < nHashSize / 256; j++ ) + ctx->state[ i ][ j ] = ctx->hashsize; + + for( i = 0; i < 4; i++ ) + for( j = nHashSize / 256; j < 4; j++ ) + ctx->state[ i ][ j ] = m512_zero; + + +// bytelen is either 32 (maybe), 64 or 80 or 128! +// all are less than full block. + + int vlen = datalen / 32; + const int vblen = ctx->uBlockLength / 16; // 16 bytes per lane + __m512i remainingbits; + + if ( databitlen == 1024 ) + { + echo_4way_compress( ctx, data, 1 ); + ctx->processed_bits = 1024; + remainingbits = m512_const2_64( 0, -1024 ); + vlen = 0; + } + else + { + vlen = databitlen / 128; // * 4 lanes / 128 bits per lane + memcpy_512( ctx->buffer, data, vlen ); + ctx->processed_bits += (unsigned int)( databitlen ); + remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen ); + + } + + ctx->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 ); + memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 ); + ctx->buffer[ vblen-2 ] = + _mm512_set4_epi32( (uint32_t)ctx->uHashSize << 16, 0, 0, 0 ); + ctx->buffer[ vblen-1 ] = + _mm512_set4_epi64( 0, ctx->processed_bits, + 0, ctx->processed_bits ); + + ctx->k = _mm512_add_epi64( ctx->k, remainingbits ); + ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 ); + + echo_4way_compress( ctx, ctx->buffer, 1 ); + + _mm512_store_si512( (__m512i*)hashval + 0, ctx->state[ 0 ][ 0] ); + _mm512_store_si512( (__m512i*)hashval + 1, ctx->state[ 1 ][ 0] ); + + if ( ctx->uHashSize == 512 ) + { + _mm512_store_si512( (__m512i*)hashval + 2, ctx->state[ 2 ][ 0 ] ); + _mm512_store_si512( (__m512i*)hashval + 3, ctx->state[ 3 ][ 0 ] ); + } + return 0; +} + + #endif diff --git a/algo/echo/echo-hash-4way.h b/algo/echo/echo-hash-4way.h index eb5bc03..014c789 100644 --- a/algo/echo/echo-hash-4way.h +++ b/algo/echo/echo-hash-4way.h @@ -32,5 +32,8 @@ int echo_close( echo_4way_context *state, void *hashval ); int echo_4way_update_close( echo_4way_context *state, void *hashval, const void *data, int databitlen ); +int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, + const void *data, int datalen ); + #endif #endif diff --git a/algo/groestl/aes_ni/groestl-version.h b/algo/groestl/aes_ni/groestl-version.h index 5742f3d..26736ec 100644 --- a/algo/groestl/aes_ni/groestl-version.h +++ b/algo/groestl/aes_ni/groestl-version.h @@ -2,13 +2,6 @@ //#define TASM #define TINTR -//#define AES_NI - -//#ifdef AES_NI -// specify AES-NI, AVX (with AES-NI) or vector-permute implementation - -//#ifndef NO_AES_NI - // Not to be confused with AVX512VAES #define VAES // #define VAVX diff --git a/algo/groestl/aes_ni/hash-groestl.c b/algo/groestl/aes_ni/hash-groestl.c index 8303559..cf680e4 100644 --- a/algo/groestl/aes_ni/hash-groestl.c +++ b/algo/groestl/aes_ni/hash-groestl.c @@ -14,7 +14,7 @@ #include "miner.h" #include "simd-utils.h" -#ifndef NO_AES_NI +#ifdef __AES__ #include "groestl-version.h" @@ -91,8 +91,9 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx ) ctx->chaining[i] = _mm_setzero_si128(); ctx->buffer[i] = _mm_setzero_si128(); } - ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); - INIT(ctx->chaining); + ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); +// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); +// INIT(ctx->chaining); ctx->buf_ptr = 0; ctx->rem_ptr = 0; @@ -184,6 +185,82 @@ HashReturn_gr 
final_groestl( hashState_groestl* ctx, void* output ) return SUCCESS_GR; } +int groestl512_full( hashState_groestl* ctx, void* output, + const void* input, uint64_t databitlen ) +{ + + int i; + + ctx->hashlen = 64; + SET_CONSTANTS(); + + for ( i = 0; i < SIZE512; i++ ) + { + ctx->chaining[i] = _mm_setzero_si128(); + ctx->buffer[i] = _mm_setzero_si128(); + } + ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + + const int len = (int)databitlen / 128; + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; + int rem = ctx->rem_ptr; + uint64_t blocks = len / SIZE512; + __m128i* in = (__m128i*)input; + + // --- update --- + + // digest any full blocks, process directly from input + for ( i = 0; i < blocks; i++ ) + TF1024( ctx->chaining, &in[ i * SIZE512 ] ); + ctx->buf_ptr = blocks * SIZE512; + + // copy any remaining data to buffer, it may already contain data + // from a previous update for a midstate precalc + for ( i = 0; i < len % SIZE512; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final + + //--- final --- + + blocks++; // adjust for final block + + if ( i == SIZE512 - 1 ) + { + // only 128 bits left in buffer, all padding at once + ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0x80 ); + } + else + { + // add first padding + ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0x80 ); + // add zero padding + for ( i += 1; i < SIZE512 - 1; i++ ) + ctx->buffer[i] = _mm_setzero_si128(); + + // add length padding, second last byte is zero unless blocks > 255 + ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0, + 0, 0 ,0,0, 0,0,0,0 ); + } + + // digest final padding block and do output transform + TF1024( ctx->chaining, ctx->buffer ); + + OF1024( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} + + HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output, const void* input, DataLength_gr databitlen ) { diff --git a/algo/groestl/aes_ni/hash-groestl.h b/algo/groestl/aes_ni/hash-groestl.h index b537490..595dc3d 100644 --- a/algo/groestl/aes_ni/hash-groestl.h +++ b/algo/groestl/aes_ni/hash-groestl.h @@ -87,5 +87,6 @@ HashReturn_gr final_groestl( hashState_groestl*, void* ); HashReturn_gr update_and_final_groestl( hashState_groestl*, void*, const void*, DataLength_gr ); +int groestl512_full( hashState_groestl*, void*, const void*, uint64_t ); #endif /* __hash_h */ diff --git a/algo/groestl/aes_ni/hash-groestl256.c b/algo/groestl/aes_ni/hash-groestl256.c index cee3eac..ac6e5f5 100644 --- a/algo/groestl/aes_ni/hash-groestl256.c +++ b/algo/groestl/aes_ni/hash-groestl256.c @@ -11,7 +11,7 @@ #include "miner.h" #include "simd-utils.h" -#ifndef NO_AES_NI +#ifdef __AES__ #include "groestl-version.h" @@ -86,8 +86,11 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx) ctx->chaining[i] = _mm_setzero_si128(); ctx->buffer[i] = _mm_setzero_si128(); } - ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); - INIT256(ctx->chaining); + + ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 ); + +// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); +// INIT256(ctx->chaining); ctx->buf_ptr = 0; ctx->rem_ptr = 0; diff --git a/algo/groestl/aes_ni/hash-groestl256.h b/algo/groestl/aes_ni/hash-groestl256.h index f82c1de..b228d3f 100644 ---
a/algo/groestl/aes_ni/hash-groestl256.h +++ b/algo/groestl/aes_ni/hash-groestl256.h @@ -93,9 +93,6 @@ typedef enum typedef struct { __attribute__ ((aligned (32))) __m128i chaining[SIZE256]; __attribute__ ((aligned (32))) __m128i buffer[SIZE256]; -// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */ -// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */ -// u64 block_counter; /* message block counter */ int hashlen; // bytes int blk_count; int buf_ptr; /* data buffer pointer */ diff --git a/algo/groestl/groestl-4way.c b/algo/groestl/groestl-4way.c index b545146..9c02153 100644 --- a/algo/groestl/groestl-4way.c +++ b/algo/groestl/groestl-4way.c @@ -49,7 +49,7 @@ int scanhash_groestl_4way( struct work *work, uint32_t max_nonce, pdata[19] = n; for ( int lane = 0; lane < 4; lane++ ) - if ( ( hash+(lane<<3) )[7] < Htarg ) + if ( ( hash+(lane<<3) )[7] <= Htarg ) if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark ) { pdata[19] = n + lane; diff --git a/algo/groestl/groestl.c b/algo/groestl/groestl.c index 517dfb8..d4fc2b7 100644 --- a/algo/groestl/groestl.c +++ b/algo/groestl/groestl.c @@ -3,19 +3,18 @@ #include #include #include - -#ifdef NO_AES_NI - #include "sph_groestl.h" -#else +#ifdef __AES__ #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "sph_groestl.h" #endif typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl1, groestl2; -#else +#ifdef __AES__ hashState_groestl groestl1, groestl2; +#else + sph_groestl512_context groestl1, groestl2; #endif } groestl_ctx_holder; @@ -24,12 +23,12 @@ static groestl_ctx_holder groestl_ctx; void init_groestl_ctx() { -#ifdef NO_AES_NI - sph_groestl512_init( &groestl_ctx.groestl1 ); - sph_groestl512_init( &groestl_ctx.groestl2 ); -#else +#ifdef __AES__ init_groestl( &groestl_ctx.groestl1, 64 ); init_groestl( &groestl_ctx.groestl2, 64 ); +#else + sph_groestl512_init( &groestl_ctx.groestl1 ); + sph_groestl512_init( &groestl_ctx.groestl2 ); #endif } @@ -39,18 +38,18 @@ void groestlhash( void *output, const void *input ) groestl_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &groestl_ctx, sizeof(groestl_ctx) ); -#ifdef NO_AES_NI - sph_groestl512(&ctx.groestl1, input, 80); - sph_groestl512_close(&ctx.groestl1, hash); - - sph_groestl512(&ctx.groestl2, hash, 64); - sph_groestl512_close(&ctx.groestl2, hash); -#else +#ifdef __AES__ update_and_final_groestl( &ctx.groestl1, (char*)hash, (const char*)input, 640 ); update_and_final_groestl( &ctx.groestl2, (char*)hash, (const char*)hash, 512 ); +#else + sph_groestl512(&ctx.groestl1, input, 80); + sph_groestl512_close(&ctx.groestl1, hash); + + sph_groestl512(&ctx.groestl2, hash, 64); + sph_groestl512_close(&ctx.groestl2, hash); #endif memcpy(output, hash, 32); } diff --git a/algo/groestl/groestl256-hash-4way.c b/algo/groestl/groestl256-hash-4way.c index cee3eac..48c39bf 100644 --- a/algo/groestl/groestl256-hash-4way.c +++ b/algo/groestl/groestl256-hash-4way.c @@ -1,4 +1,5 @@ /* hash.c Aug 2011 + * groestl512-hash-4way https://github.com/JayDDee/cpuminer-opt 2019-12. * * Groestl implementation for different versions. * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer @@ -6,51 +7,18 @@ * This code is placed in the public domain */ +// Optimized for hash and data lengths that are integral multiples of __m128i + + #include -#include "hash-groestl256.h" +#include "groestl256-intr-4way.h" #include "miner.h" #include "simd-utils.h" -#ifndef NO_AES_NI +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -#include "groestl-version.h" -#ifdef TASM - #ifdef VAES - #include "groestl256-asm-aes.h" - #else - #ifdef VAVX - #include "groestl256-asm-avx.h" - #else - #ifdef VVPERM - #include "groestl256-asm-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif -#else - #ifdef TINTR - #ifdef VAES - #include "groestl256-intr-aes.h" - #else - #ifdef VAVX - #include "groestl256-intr-avx.h" - #else - #ifdef VVPERM - #include "groestl256-intr-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif - #else - #error NO TYPE SPECIFIED (-DT[ASM/INTR]) - #endif -#endif - -/* initialise context */ -HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen ) +int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen ) { int i; @@ -58,223 +26,84 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen ) SET_CONSTANTS(); if (ctx->chaining == NULL || ctx->buffer == NULL) - return FAIL_GR; + return 1; for ( i = 0; i < SIZE256; i++ ) { - ctx->chaining[i] = _mm_setzero_si128(); - ctx->buffer[i] = _mm_setzero_si128(); + ctx->chaining[i] = m512_zero; + ctx->buffer[i] = m512_zero; } - ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); - INIT256( ctx->chaining ); + + // The only non-zero in the IV is len. It can be hard coded. + ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 ); +// uint64_t len = U64BIG((uint64_t)LENGTH); +// ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 ); +// INIT256_4way(ctx->chaining); + ctx->buf_ptr = 0; ctx->rem_ptr = 0; - return SUCCESS_GR; + return 0; } - -HashReturn_gr reinit_groestl256(hashState_groestl256* ctx) - { - int i; - - if (ctx->chaining == NULL || ctx->buffer == NULL) - return FAIL_GR; - - for ( i = 0; i < SIZE256; i++ ) - { - ctx->chaining[i] = _mm_setzero_si128(); - ctx->buffer[i] = _mm_setzero_si128(); - } - ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); - INIT256(ctx->chaining); - ctx->buf_ptr = 0; - ctx->rem_ptr = 0; - - return SUCCESS_GR; -} - -// Use this only for midstate and never for cryptonight -HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input, - DataLength_gr databitlen ) -{ - __m128i* in = (__m128i*)input; - const int len = (int)databitlen / 128; // bits to __m128i - const int blocks = len / SIZE256; // __M128i to blocks - int rem = ctx->rem_ptr; - int i; - - ctx->blk_count = blocks; - ctx->databitlen = databitlen; - - // digest any full blocks - for ( i = 0; i < blocks; i++ ) - TF512( ctx->chaining, &in[ i * SIZE256 ] ); - // adjust buf_ptr to last block - ctx->buf_ptr = blocks * SIZE256; - - // Copy any remainder to buffer - for ( i = 0; i < len % SIZE256; i++ ) - ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; - // adjust rem_ptr for new data - ctx->rem_ptr += i; - - return SUCCESS_GR; -} - -// don't use this at all -HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output ) -{ - const int len = (int)ctx->databitlen / 128; // bits to __m128i - const int blocks = ctx->blk_count + 1; // adjust for final block - const int rem_ptr = ctx->rem_ptr; // end of
data start of padding - const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i - const int hash_offset = SIZE256 - hashlen_m128i; // where in buffer - int i; - - // first pad byte = 0x80, last pad byte = block count - // everything in between is zero - - if ( rem_ptr == len - 1 ) - { - // all padding at once - ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); - } - else - { - // add first padding - ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); - // add zero padding - for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ ) - ctx->buffer[i] = _mm_setzero_si128(); - // add length padding - // cheat since we know the block count is trivial, good if block < 256 - ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0 ); - } - - // digest final padding block and do output transform - TF512( ctx->chaining, ctx->buffer ); - OF512( ctx->chaining ); - - // store hash result in output - for ( i = 0; i < hashlen_m128i; i++ ) - casti_m128i( output, i ) = ctx->chaining[ hash_offset + i]; - - return SUCCESS_GR; -} - -HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx, - void* output, const void* input, DataLength_gr databitlen ) +int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output, + const void* input, uint64_t databitlen ) { const int len = (int)databitlen / 128; const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i const int hash_offset = SIZE256 - hashlen_m128i; int rem = ctx->rem_ptr; int blocks = len / SIZE256; - __m128i* in = (__m128i*)input; + __m512i* in = (__m512i*)input; int i; // --- update --- // digest any full blocks, process directly from input for ( i = 0; i < blocks; i++ ) - TF512( ctx->chaining, &in[ i * SIZE256 ] ); + TF512_4way( ctx->chaining, &in[ i * SIZE256 ] ); ctx->buf_ptr = blocks * SIZE256; - // cryptonight has 200 byte input, an odd number of __m128i - // remainder is only 8 bytes, ie u64. 
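(The branch removed below was the scalar code's special case for cryptonight's 200 byte input, whose last 8 bytes are not a whole __m128i; the 4-way rewrite drops it and, as its file header notes, handles only data lengths that are integral multiples of __m128i.)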
- if ( databitlen % 128 !=0 ) - { - // must be cryptonight, copy 64 bits of data - *(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] ); - i = -1; // signal for odd length - } - else - { - // Copy any remaining data to buffer for final transform - for ( i = 0; i < len % SIZE256; i++ ) - ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; - i += rem; // use i as rem_ptr in final - } + // copy any remaining data to buffer, it may already contain data + // from a previous update for a midstate precalc + for ( i = 0; i < len % SIZE256; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final //--- final --- - // adjust for final block - blocks++; + blocks++; // adjust for final block - if ( i == len - 1 ) - { - // all padding at once - ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0, - 0, 0,0,0, 0,0,0,0x80 ); - } + if ( i == SIZE256 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m512_const1_128( _mm_set_epi8( + blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + } else { - if ( i == -1 ) - { - // cryptonight odd length - ((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull; - // finish the block with zero and length padding as normal - i = 0; - } - else - { - // add first padding - ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); - } + // add first padding + ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); // add zero padding for ( i += 1; i < SIZE256 - 1; i++ ) - ctx->buffer[i] = _mm_setzero_si128(); - // add length padding - // cheat since we know the block count is trivial, good if block < 256 - ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0, - 0, 0,0,0, 0,0,0,0 ); + ctx->buffer[i] = m512_zero; + + // add length padding, second last byte is zero unless blocks > 255 + ctx->buffer[i] = m512_const1_128( _mm_set_epi8( + blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); } - // digest final padding block and do output transform - TF512( ctx->chaining, ctx->buffer ); - OF512( ctx->chaining ); +// digest final padding block and do output transform + TF512_4way( ctx->chaining, ctx->buffer ); + + OF512_4way( ctx->chaining ); // store hash result in output for ( i = 0; i < hashlen_m128i; i++ ) - casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ]; + casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ]; - return SUCCESS_GR; + return 0; } -/* hash bit sequence */ -HashReturn_gr hash_groestl256(int hashbitlen, - const BitSequence_gr* data, - DataLength_gr databitlen, - BitSequence_gr* hashval) { - HashReturn_gr ret; - hashState_groestl256 context; +#endif // VAES - /* initialise */ - if ((ret = init_groestl256(&context, hashbitlen/8)) != SUCCESS_GR) - return ret; - - /* process message */ - if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR) - return ret; - - /* finalise */ - ret = final_groestl256(&context, hashval); - - return ret; -} - -/* eBash API */ -//#ifdef crypto_hash_BYTES -//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen) -//{ -// if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0; -// return -1; -//} -//#endif - -#endif diff --git a/algo/groestl/groestl256-hash-4way.h b/algo/groestl/groestl256-hash-4way.h index f82c1de..7e49b21 100644 --- a/algo/groestl/groestl256-hash-4way.h +++ b/algo/groestl/groestl256-hash-4way.h @@ -6,56 +6,39 @@ * This code is placed in the public domain */ -#ifndef __hash_h -#define __hash_h +#if 
!defined(GROESTL256_HASH_4WAY_H__) +#define GROESTL256_HASH_4WAY_H__ 1 +#include "simd-utils.h" #include +#include #include #if defined(_WIN64) || defined(__WINDOWS__) #include #endif #include -/* eBash API begin */ -/* -#include "crypto_hash.h" -#ifdef crypto_hash_BYTES - -#include -#include -#include -typedef crypto_uint8 u8; -typedef crypto_uint32 u32; -typedef crypto_uint64 u64; -#endif - */ -/* eBash API end */ - -//#define LENGTH (512) - -#include "brg_endian.h" -#define NEED_UINT_64T -#include "algo/sha/brg_types.h" - -#ifdef IACA_TRACE - #include IACA_MARKS -#endif - +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define LENGTH (256) +//#include "brg_endian.h" +//#define NEED_UINT_64T +//#include "algo/sha/brg_types.h" + /* some sizes (number of bytes) */ #define ROWS (8) #define LENGTHFIELDLEN (ROWS) #define COLS512 (8) //#define COLS1024 (16) #define SIZE_512 ((ROWS)*(COLS512)) -//#define SIZE1024 ((ROWS)*(COLS1024)) +//#define SIZE_1024 ((ROWS)*(COLS1024)) #define ROUNDS512 (10) //#define ROUNDS1024 (14) //#if LENGTH<=256 #define COLS (COLS512) -//#define SIZE (SIZE512) +#define SIZE (SIZE512) #define ROUNDS (ROUNDS512) //#else //#define COLS (COLS1024) @@ -63,59 +46,30 @@ typedef crypto_uint64 u64; //#define ROUNDS (ROUNDS1024) //#endif -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) - -#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) -#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) -#define U64BIG(a) (a) -#endif /* IS_BIG_ENDIAN */ - -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) -#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) -#define U64BIG(a) \ - ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ - (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ - (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ - (ROTL64(a,56) & li_64(FF000000FF000000))) -#endif /* IS_LITTLE_ENDIAN */ - -typedef unsigned char BitSequence_gr; -typedef unsigned long long DataLength_gr; -typedef enum -{ - SUCCESS_GR = 0, - FAIL_GR = 1, - BAD_HASHBITLEN_GR = 2 -} HashReturn_gr; - #define SIZE256 (SIZE_512/16) typedef struct { - __attribute__ ((aligned (32))) __m128i chaining[SIZE256]; - __attribute__ ((aligned (32))) __m128i buffer[SIZE256]; -// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */ -// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */ -// u64 block_counter; /* message block counter */ - int hashlen; // bytes - int blk_count; - int buf_ptr; /* data buffer pointer */ + __attribute__ ((aligned (128))) __m512i chaining[SIZE256]; + __attribute__ ((aligned (64))) __m512i buffer[SIZE256]; + int hashlen; // byte + int blk_count; // SIZE_m128i + int buf_ptr; // __m128i offset int rem_ptr; - int databitlen; -} hashState_groestl256; + int databitlen; // bits +} groestl256_4way_context; -HashReturn_gr init_groestl256( hashState_groestl256*, int ); -HashReturn_gr reinit_groestl256( hashState_groestl256* ); +int groestl256_4way_init( groestl256_4way_context*, uint64_t ); -HashReturn_gr update_groestl256( hashState_groestl256*, const void*, - DataLength_gr ); +//int reinit_groestl( hashState_groestl* ); -HashReturn_gr final_groestl256( hashState_groestl256*, void* ); +//int groestl512_4way_update( groestl256_4way_context*, const void*, +// uint64_t ); -HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr, - BitSequence_gr* ); +//int groestl512_4way_close( groestl512_4way_context*, void* ); -HashReturn_gr update_and_final_groestl256( 
hashState_groestl256*, void*, - const void*, DataLength_gr ); +int groestl256_4way_update_close( groestl256_4way_context*, void*, + const void*, uint64_t ); -#endif /* __hash_h */ +#endif +#endif diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h index 57dd930..ef7719c 100644 --- a/algo/groestl/groestl256-intr-4way.h +++ b/algo/groestl/groestl256-intr-4way.h @@ -7,35 +7,37 @@ * This code is placed in the public domain */ -#include -#include -#include "hash-groestl256.h" + +#if !defined(GROESTL256_INTR_4WAY_H__) +#define GROESTL256_INTR_4WAY_H__ 1 + +#include "groestl256-hash-4way.h" + +#if defined(__VAES__) /* global constants */ -__m128i ROUND_CONST_Lx; -__m128i ROUND_CONST_L0[ROUNDS512]; -__m128i ROUND_CONST_L7[ROUNDS512]; -//__m128i ROUND_CONST_P[ROUNDS1024]; -//__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_1B; -__m128i ALL_FF; - +__m512i ROUND_CONST_Lx; +__m512i ROUND_CONST_L0[ROUNDS512]; +__m512i ROUND_CONST_L7[ROUNDS512]; +//__m512i ROUND_CONST_P[ROUNDS1024]; +//__m512i ROUND_CONST_Q[ROUNDS1024]; +__m512i TRANSP_MASK; +__m512i SUBSH_MASK[8]; +__m512i ALL_1B; +__m512i ALL_FF; #define tos(a) #a #define tostr(a) tos(a) - /* xmm[i] will be multiplied by 2 * xmm[j] will be lost * xmm[k] has to be all 0x1b */ #define MUL2(i, j, k){\ - j = _mm_xor_si128(j, j);\ - j = _mm_cmpgt_epi8(j, i);\ - i = _mm_add_epi8(i, i);\ - j = _mm_and_si128(j, k);\ - i = _mm_xor_si128(i, j);\ + j = _mm512_xor_si512(j, j);\ + j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\ + i = _mm512_add_epi8(i, i);\ + j = _mm512_and_si512(j, k);\ + i = _mm512_xor_si512(i, j);\ } /**/ @@ -61,152 +63,188 @@ __m128i ALL_FF; /* t_i = a_i + a_{i+1} */\ b6 = a0;\ b7 = a1;\ - a0 = _mm_xor_si128(a0, a1);\ + a0 = _mm512_xor_si512(a0, a1);\ b0 = a2;\ - a1 = _mm_xor_si128(a1, a2);\ + a1 = _mm512_xor_si512(a1, a2);\ b1 = a3;\ - a2 = _mm_xor_si128(a2, a3);\ + a2 = _mm512_xor_si512(a2, a3);\ b2 = a4;\ - a3 = _mm_xor_si128(a3, a4);\ + a3 = _mm512_xor_si512(a3, a4);\ b3 = a5;\ - a4 = _mm_xor_si128(a4, a5);\ + a4 = _mm512_xor_si512(a4, a5);\ b4 = a6;\ - a5 = _mm_xor_si128(a5, a6);\ + a5 = _mm512_xor_si512(a5, a6);\ b5 = a7;\ - a6 = _mm_xor_si128(a6, a7);\ - a7 = _mm_xor_si128(a7, b6);\ + a6 = _mm512_xor_si512(a6, a7);\ + a7 = _mm512_xor_si512(a7, b6);\ \ /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ - b0 = _mm_xor_si128(b0, a4);\ - b6 = _mm_xor_si128(b6, a4);\ - b1 = _mm_xor_si128(b1, a5);\ - b7 = _mm_xor_si128(b7, a5);\ - b2 = _mm_xor_si128(b2, a6);\ - b0 = _mm_xor_si128(b0, a6);\ + b0 = _mm512_xor_si512(b0, a4);\ + b6 = _mm512_xor_si512(b6, a4);\ + b1 = _mm512_xor_si512(b1, a5);\ + b7 = _mm512_xor_si512(b7, a5);\ + b2 = _mm512_xor_si512(b2, a6);\ + b0 = _mm512_xor_si512(b0, a6);\ /* spill values y_4, y_5 to memory */\ TEMP0 = b0;\ - b3 = _mm_xor_si128(b3, a7);\ - b1 = _mm_xor_si128(b1, a7);\ + b3 = _mm512_xor_si512(b3, a7);\ + b1 = _mm512_xor_si512(b1, a7);\ TEMP1 = b1;\ - b4 = _mm_xor_si128(b4, a0);\ - b2 = _mm_xor_si128(b2, a0);\ + b4 = _mm512_xor_si512(b4, a0);\ + b2 = _mm512_xor_si512(b2, a0);\ /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ b0 = a0;\ - b5 = _mm_xor_si128(b5, a1);\ - b3 = _mm_xor_si128(b3, a1);\ + b5 = _mm512_xor_si512(b5, a1);\ + b3 = _mm512_xor_si512(b3, a1);\ b1 = a1;\ - b6 = _mm_xor_si128(b6, a2);\ - b4 = _mm_xor_si128(b4, a2);\ + b6 = _mm512_xor_si512(b6, a2);\ + b4 = _mm512_xor_si512(b4, a2);\ TEMP2 = a2;\ - b7 = _mm_xor_si128(b7, a3);\ - b5 = _mm_xor_si128(b5, a3);\ + b7 = _mm512_xor_si512(b7, a3);\ + b5 = _mm512_xor_si512(b5, a3);\ \ /* compute x_i = t_i + t_{i+3} */\ - a0 = _mm_xor_si128(a0, a3);\ - a1 = _mm_xor_si128(a1, a4);\ - a2 = _mm_xor_si128(a2, a5);\ - a3 = _mm_xor_si128(a3, a6);\ - a4 = _mm_xor_si128(a4, a7);\ - a5 = _mm_xor_si128(a5, b0);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, TEMP2);\ + a0 = _mm512_xor_si512(a0, a3);\ + a1 = _mm512_xor_si512(a1, a4);\ + a2 = _mm512_xor_si512(a2, a5);\ + a3 = _mm512_xor_si512(a3, a6);\ + a4 = _mm512_xor_si512(a4, a7);\ + a5 = _mm512_xor_si512(a5, b0);\ + a6 = _mm512_xor_si512(a6, b1);\ + a7 = _mm512_xor_si512(a7, TEMP2);\ \ /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ /* compute w_i : add y_{i+4} */\ - b1 = ALL_1B;\ + b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\ MUL2(a0, b0, b1);\ - a0 = _mm_xor_si128(a0, TEMP0);\ + a0 = _mm512_xor_si512(a0, TEMP0);\ MUL2(a1, b0, b1);\ - a1 = _mm_xor_si128(a1, TEMP1);\ + a1 = _mm512_xor_si512(a1, TEMP1);\ MUL2(a2, b0, b1);\ - a2 = _mm_xor_si128(a2, b2);\ + a2 = _mm512_xor_si512(a2, b2);\ MUL2(a3, b0, b1);\ - a3 = _mm_xor_si128(a3, b3);\ + a3 = _mm512_xor_si512(a3, b3);\ MUL2(a4, b0, b1);\ - a4 = _mm_xor_si128(a4, b4);\ + a4 = _mm512_xor_si512(a4, b4);\ MUL2(a5, b0, b1);\ - a5 = _mm_xor_si128(a5, b5);\ + a5 = _mm512_xor_si512(a5, b5);\ MUL2(a6, b0, b1);\ - a6 = _mm_xor_si128(a6, b6);\ + a6 = _mm512_xor_si512(a6, b6);\ MUL2(a7, b0, b1);\ - a7 = _mm_xor_si128(a7, b7);\ + a7 = _mm512_xor_si512(a7, b7);\ \ /* compute v_i : double w_i */\ /* add to y_4 y_5 .. v3, v4, ... 
*/\ MUL2(a0, b0, b1);\ - b5 = _mm_xor_si128(b5, a0);\ + b5 = _mm512_xor_si512(b5, a0);\ MUL2(a1, b0, b1);\ - b6 = _mm_xor_si128(b6, a1);\ + b6 = _mm512_xor_si512(b6, a1);\ MUL2(a2, b0, b1);\ - b7 = _mm_xor_si128(b7, a2);\ + b7 = _mm512_xor_si512(b7, a2);\ MUL2(a5, b0, b1);\ - b2 = _mm_xor_si128(b2, a5);\ + b2 = _mm512_xor_si512(b2, a5);\ MUL2(a6, b0, b1);\ - b3 = _mm_xor_si128(b3, a6);\ + b3 = _mm512_xor_si512(b3, a6);\ MUL2(a7, b0, b1);\ - b4 = _mm_xor_si128(b4, a7);\ + b4 = _mm512_xor_si512(b4, a7);\ MUL2(a3, b0, b1);\ MUL2(a4, b0, b1);\ b0 = TEMP0;\ b1 = TEMP1;\ - b0 = _mm_xor_si128(b0, a3);\ - b1 = _mm_xor_si128(b1, a4);\ + b0 = _mm512_xor_si512(b0, a3);\ + b1 = _mm512_xor_si512(b1, a4);\ }/*MixBytes*/ -#define SET_CONSTANTS(){\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ - SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ - SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ - SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ - SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ - SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ - SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ - }\ - ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ -}while(0); \ +// calculate the round constants seperately and load at startup + +#define SET_CONSTANTS(){\ + ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\ + TRANSP_MASK = _mm512_set_epi32( \ + 0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \ + 0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \ + 0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \ + 0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \ + SUBSH_MASK[0] = _mm512_set_epi32( \ + 0x33363a3d, 0x38323539, 0x3c3f3134, 0x373b3e30, \ + 0x23262a2d, 0x28222529, 0x2c2f2124, 0x272b2e20, \ + 0x13161a1d, 0x18121519, 0x1c1f1114, 0x171b1e10, \ + 0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00 ); \ + SUBSH_MASK[1] = _mm512_set_epi32( \ + 0x34373c3f, 0x3a33363b, 0x3e393235, 0x303d3831, \ + 0x24272c2f, 0x2a23262b, 0x2e292225, 0x202d2821, \ + 0x14171c1f, 0x1a13161b, 0x1e191215, 0x101d1801, \ + 0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801 );\ + SUBSH_MASK[2] = _mm512_set_epi32( \ + 0x35303e39, 0x3c34373d, 0x383b3336, 0x313f3a32, \ + 0x25202e29, 0x2c24272d, 0x282b2326, 0x212f2a22, \ + 0x15101e19, 0x1c14171d, 0x181b1316, 0x111f1a12, \ + 0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02 );\ + SUBSH_MASK[3] = _mm512_set_epi32( \ + 0x3631383b, 0x3e35303f, 0x3a3d3437, 0x32393c33, \ + 0x2621282b, 0x2e25202f, 0x2a2d2427, 0x22292c23, \ + 0x1611181b, 0x1e15101f, 0x1a1d1417, 0x12191c13, \ + 0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03 );\ + SUBSH_MASK[4] = _mm512_set_epi32( \ + 0x3732393c, 0x3f363138, 0x3b3e3530, 0x333a3d34, \ + 0x2722292c, 0x2f262128, 0x2b2e2520, 0x232a2d24, \ + 0x1712191c, 0x1f161118, 0x1b1e1510, 0x131a1d14, \ + 0x0702090c, 0x0f060108, 0x0b0e0500, 
0x030a0d04 );\ + SUBSH_MASK[5] = _mm512_set_epi32( \ + 0x30333b3e, 0x3937323a, 0x3d383631, 0x343c3f35, \ + 0x20232b2e, 0x2927222a, 0x2d282621, 0x242c2f25, \ + 0x10131b1e, 0x1917121a, 0x1d181611, 0x141c1f15, \ + 0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05 );\ + SUBSH_MASK[6] = _mm512_set_epi32( \ + 0x31343d38, 0x3b30333c, 0x3f3a3732, 0x353e3936, \ + 0x21242d28, 0x2b20232c, 0x2f2a2722, 0x252e2926, \ + 0x11141d18, 0x1b10131c, 0x1f1a1712, 0x151e1916, \ + 0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906 );\ + SUBSH_MASK[7] = _mm512_set_epi32( \ + 0x32353f3a, 0x3d31343e, 0x393c3033, 0x36383b37, \ + 0x22252f2a, 0x2d21242e, 0x292c2023, 0x26282b27, \ + 0x12151f1a, 0x1d11141e, 0x191c1013, 0x16181b17, \ + 0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07 );\ + for ( i = 0; i < ROUNDS512; i++ ) \ + {\ + ROUND_CONST_L0[i] = _mm512_set4_epi32( 0xffffffff, 0xffffffff, \ + 0x70605040 ^ ( i * 0x01010101 ), 0x30201000 ^ ( i * 0x01010101 ) ); \ + ROUND_CONST_L7[i] = _mm512_set4_epi32( 0x8f9fafbf ^ ( i * 0x01010101 ), \ + 0xcfdfefff ^ ( i * 0x01010101 ), 0x00000000, 0x00000000 ); \ + }\ + ROUND_CONST_Lx = _mm512_set4_epi32( 0xffffffff, 0xffffffff, \ + 0x00000000, 0x00000000 ); \ +}while(0);\ -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* AddRoundConstant */\ b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a4 = _mm_xor_si128(a4, b1);\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + a0 = _mm512_xor_si512( a0, (ROUND_CONST_L0[i]) );\ + a1 = _mm512_xor_si512( a1, b1 );\ + a2 = _mm512_xor_si512( a2, b1 );\ + a3 = _mm512_xor_si512( a3, b1 );\ + a4 = _mm512_xor_si512( a4, b1 );\ + a5 = _mm512_xor_si512( a5, b1 );\ + a6 = _mm512_xor_si512( a6, b1 );\ + a7 = _mm512_xor_si512( a7, (ROUND_CONST_L7[i]) );\ \ /* ShiftBytes + SubBytes (interleaved) */\ - b0 = _mm_xor_si128(b0, b0);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a0 = _mm_aesenclast_si128(a0, b0);\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a1 = _mm_aesenclast_si128(a1, b0);\ - a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ - a2 = _mm_aesenclast_si128(a2, b0);\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a3 = _mm_aesenclast_si128(a3, b0);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a4 = _mm_aesenclast_si128(a4, b0);\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a5 = _mm_aesenclast_si128(a5, b0);\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a6 = _mm_aesenclast_si128(a6, b0);\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - a7 = _mm_aesenclast_si128(a7, b0);\ + b0 = _mm512_xor_si512( b0, b0 );\ + a0 = _mm512_shuffle_epi8( a0, (SUBSH_MASK[0]) );\ + a0 = _mm512_aesenclast_epi128(a0, b0 );\ + a1 = _mm512_shuffle_epi8( a1, (SUBSH_MASK[1]) );\ + a1 = _mm512_aesenclast_epi128(a1, b0 );\ + a2 = _mm512_shuffle_epi8( a2, (SUBSH_MASK[2]) );\ + a2 = _mm512_aesenclast_epi128(a2, b0 );\ + a3 = _mm512_shuffle_epi8( a3, (SUBSH_MASK[3]) );\ + a3 = _mm512_aesenclast_epi128(a3, b0 );\ + a4 = _mm512_shuffle_epi8( a4, (SUBSH_MASK[4]) );\ + a4 = _mm512_aesenclast_epi128(a4, b0 );\ + a5 = _mm512_shuffle_epi8( a5, (SUBSH_MASK[5]) );\ + a5 = _mm512_aesenclast_epi128(a5, b0 );\ + a6 = _mm512_shuffle_epi8( a6, (SUBSH_MASK[6]) );\ + a6 = _mm512_aesenclast_epi128(a6, b0 );\ + a7 = _mm512_shuffle_epi8( a7, (SUBSH_MASK[7]) );\ + a7 = _mm512_aesenclast_epi128( a7, 
b0 );\ \ /* MixBytes */\ MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ @@ -237,31 +275,31 @@ __m128i ALL_FF; #define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ t0 = TRANSP_MASK;\ \ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ + i0 = _mm512_shuffle_epi8( i0, t0 );\ + i1 = _mm512_shuffle_epi8( i1, t0 );\ + i2 = _mm512_shuffle_epi8( i2, t0 );\ + i3 = _mm512_shuffle_epi8( i3, t0 );\ \ o1 = i0;\ t0 = i2;\ \ - i0 = _mm_unpacklo_epi16(i0, i1);\ - o1 = _mm_unpackhi_epi16(o1, i1);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - t0 = _mm_unpackhi_epi16(t0, i3);\ + i0 = _mm512_unpacklo_epi16( i0, i1 );\ + o1 = _mm512_unpackhi_epi16( o1, i1 );\ + i2 = _mm512_unpacklo_epi16( i2, i3 );\ + t0 = _mm512_unpackhi_epi16( t0, i3 );\ \ - i0 = _mm_shuffle_epi32(i0, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - t0 = _mm_shuffle_epi32(t0, 216);\ + i0 = _mm512_shuffle_epi32( i0, 216 );\ + o1 = _mm512_shuffle_epi32( o1, 216 );\ + i2 = _mm512_shuffle_epi32( i2, 216 );\ + t0 = _mm512_shuffle_epi32( t0, 216 );\ \ o2 = i0;\ o3 = o1;\ \ - i0 = _mm_unpacklo_epi32(i0, i2);\ - o1 = _mm_unpacklo_epi32(o1, t0);\ - o2 = _mm_unpackhi_epi32(o2, i2);\ - o3 = _mm_unpackhi_epi32(o3, t0);\ + i0 = _mm512_unpacklo_epi32( i0, i2 );\ + o1 = _mm512_unpacklo_epi32( o1, t0 );\ + o2 = _mm512_unpackhi_epi32( o2, i2 );\ + o3 = _mm512_unpackhi_epi32( o3, t0 );\ }/**/ /* Matrix Transpose Step 2 @@ -279,19 +317,19 @@ __m128i ALL_FF; #define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ o1 = i0;\ o2 = i1;\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - o1 = _mm_unpackhi_epi64(o1, i4);\ + i0 = _mm512_unpacklo_epi64( i0, i4 );\ + o1 = _mm512_unpackhi_epi64( o1, i4 );\ o3 = i1;\ o4 = i2;\ - o2 = _mm_unpacklo_epi64(o2, i5);\ - o3 = _mm_unpackhi_epi64(o3, i5);\ + o2 = _mm512_unpacklo_epi64( o2, i5 );\ + o3 = _mm512_unpackhi_epi64( o3, i5 );\ o5 = i2;\ o6 = i3;\ - o4 = _mm_unpacklo_epi64(o4, i6);\ - o5 = _mm_unpackhi_epi64(o5, i6);\ + o4 = _mm512_unpacklo_epi64( o4, i6 );\ + o5 = _mm512_unpackhi_epi64( o5, i6 );\ o7 = i3;\ - o6 = _mm_unpacklo_epi64(o6, i7);\ - o7 = _mm_unpackhi_epi64(o7, i7);\ + o6 = _mm512_unpacklo_epi64( o6, i7 );\ + o7 = _mm512_unpackhi_epi64( o7, i7 );\ }/**/ /* Matrix Transpose Inverse Step 2 @@ -302,19 +340,20 @@ __m128i ALL_FF; */ #define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ o0 = i0;\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o0 = _mm_unpackhi_epi64(o0, i1);\ + i0 = _mm512_unpacklo_epi64( i0, i1 );\ + o0 = _mm512_unpackhi_epi64( o0, i1 );\ o1 = i2;\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - o1 = _mm_unpackhi_epi64(o1, i3);\ + i2 = _mm512_unpacklo_epi64( i2, i3 );\ + o1 = _mm512_unpackhi_epi64( o1, i3 );\ o2 = i4;\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - o2 = _mm_unpackhi_epi64(o2, i5);\ + i4 = _mm512_unpacklo_epi64( i4, i5 );\ + o2 = _mm512_unpackhi_epi64( o2, i5 );\ o3 = i6;\ - i6 = _mm_unpacklo_epi64(i6, i7);\ - o3 = _mm_unpackhi_epi64(o3, i7);\ + i6 = _mm512_unpacklo_epi64( i6, i7 );\ + o3 = _mm512_unpackhi_epi64( o3, i7 );\ }/**/ + /* Matrix Transpose Output Step 2 * input is one 512-bit state with two rows in one xmm * output is one 512-bit state with one row in the low 64-bits of one xmm @@ -322,19 +361,19 @@ __m128i ALL_FF; * outputs: (i0-7) = (0|S) */ #define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - t0 = _mm_xor_si128(t0, t0);\ + t0 = _mm512_xor_si512( t0, t0 );\ i1 = i0;\ i3 = i2;\ i5 = i4;\ i7 
= i6;\ - i0 = _mm_unpacklo_epi64(i0, t0);\ - i1 = _mm_unpackhi_epi64(i1, t0);\ - i2 = _mm_unpacklo_epi64(i2, t0);\ - i3 = _mm_unpackhi_epi64(i3, t0);\ - i4 = _mm_unpacklo_epi64(i4, t0);\ - i5 = _mm_unpackhi_epi64(i5, t0);\ - i6 = _mm_unpacklo_epi64(i6, t0);\ - i7 = _mm_unpackhi_epi64(i7, t0);\ + i0 = _mm512_unpacklo_epi64( i0, t0 );\ + i1 = _mm512_unpackhi_epi64( i1, t0 );\ + i2 = _mm512_unpacklo_epi64( i2, t0 );\ + i3 = _mm512_unpackhi_epi64( i3, t0 );\ + i4 = _mm512_unpacklo_epi64( i4, t0 );\ + i5 = _mm512_unpackhi_epi64( i5, t0 );\ + i6 = _mm512_unpacklo_epi64( i6, t0 );\ + i7 = _mm512_unpackhi_epi64( i7, t0 );\ }/**/ /* Matrix Transpose Output Inverse Step 2 @@ -344,17 +383,18 @@ __m128i ALL_FF; * outputs: (i0, i2, i4, i6) = S */ #define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ + i0 = _mm512_unpacklo_epi64( i0, i1 );\ + i2 = _mm512_unpacklo_epi64( i2, i3 );\ + i4 = _mm512_unpacklo_epi64( i4, i5 );\ + i6 = _mm512_unpacklo_epi64( i6, i7 );\ }/**/ -void INIT256( __m128i* chaining ) + +void INIT256_4way( __m512i* chaining ) { - static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; - static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + static __m512i xmm0, xmm2, xmm6, xmm7; + static __m512i xmm12, xmm13, xmm14, xmm15; /* load IV into registers xmm12 - xmm15 */ xmm12 = chaining[0]; @@ -373,17 +413,13 @@ void INIT256( __m128i* chaining ) chaining[3] = xmm7; } -void TF512( __m128i* chaining, __m128i* message ) +void TF512_4way( __m512i* chaining, __m512i* message ) { - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP0; - static __m128i TEMP1; - static __m128i TEMP2; - -#ifdef IACA_TRACE - IACA_START; -#endif + static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m512i TEMP0; + static __m512i TEMP1; + static __m512i TEMP2; /* load message into registers xmm12 - xmm15 */ xmm12 = message[0]; @@ -404,10 +440,10 @@ void TF512( __m128i* chaining, __m128i* message ) /* xor message to CV get input of P */ /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - xmm8 = _mm_xor_si128(xmm8, xmm12); - xmm0 = _mm_xor_si128(xmm0, xmm2); - xmm4 = _mm_xor_si128(xmm4, xmm6); - xmm5 = _mm_xor_si128(xmm5, xmm7); + xmm8 = _mm512_xor_si512( xmm8, xmm12 ); + xmm0 = _mm512_xor_si512( xmm0, xmm2 ); + xmm4 = _mm512_xor_si512( xmm4, xmm6 ); + xmm5 = _mm512_xor_si512( xmm5, xmm7 ); /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ @@ -422,17 +458,17 @@ void TF512( __m128i* chaining, __m128i* message ) /* xor output of P and Q */ /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, xmm8); - xmm1 = _mm_xor_si128(xmm1, xmm10); - xmm2 = _mm_xor_si128(xmm2, xmm12); - xmm3 = _mm_xor_si128(xmm3, xmm14); + xmm0 = _mm512_xor_si512( xmm0, xmm8 ); + xmm1 = _mm512_xor_si512( xmm1, xmm10 ); + xmm2 = _mm512_xor_si512( xmm2, xmm12 ); + xmm3 = _mm512_xor_si512( xmm3, xmm14 ); /* xor CV (feed-forward) */ /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, (chaining[0])); - xmm1 = _mm_xor_si128(xmm1, (chaining[1])); - xmm2 = _mm_xor_si128(xmm2, (chaining[2])); - xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + xmm0 
= _mm512_xor_si512( xmm0, (chaining[0]) ); + xmm1 = _mm512_xor_si512( xmm1, (chaining[1]) ); + xmm2 = _mm512_xor_si512( xmm2, (chaining[2]) ); + xmm3 = _mm512_xor_si512( xmm3, (chaining[3]) ); /* store CV */ chaining[0] = xmm0; @@ -440,19 +476,16 @@ void TF512( __m128i* chaining, __m128i* message ) chaining[2] = xmm2; chaining[3] = xmm3; -#ifdef IACA_TRACE - IACA_END; -#endif return; } -void OF512( __m128i* chaining ) +void OF512_4way( __m512i* chaining ) { - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP0; - static __m128i TEMP1; - static __m128i TEMP2; + static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m512i TEMP0; + static __m512i TEMP1; + static __m512i TEMP2; /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ xmm8 = chaining[0]; @@ -475,10 +508,10 @@ void OF512( __m128i* chaining ) /* xor CV to P output (feed-forward) */ /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm10 = _mm_xor_si128(xmm10, (chaining[1])); - xmm12 = _mm_xor_si128(xmm12, (chaining[2])); - xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) ); + xmm10 = _mm512_xor_si512( xmm10, (chaining[1]) ); + xmm12 = _mm512_xor_si512( xmm12, (chaining[2]) ); + xmm14 = _mm512_xor_si512( xmm14, (chaining[3]) ); /* transform state back from row ordering into column ordering */ /* result: final hash value in xmm9, xmm11 */ @@ -489,4 +522,5 @@ void OF512( __m128i* chaining ) chaining[3] = xmm11; } - +#endif // VAES +#endif // GROESTL512_INTR_4WAY_H__ diff --git a/algo/groestl/groestl512-hash-4way.c b/algo/groestl/groestl512-hash-4way.c index 59ef1bf..d3b5ca0 100644 --- a/algo/groestl/groestl512-hash-4way.c +++ b/algo/groestl/groestl512-hash-4way.c @@ -15,38 +15,22 @@ #include "miner.h" #include "simd-utils.h" -#if defined(__VAES__) - -#define ROTL64(a,n) \ - ( ( ( (a)<<(n) ) | ( (a) >> (64-(n)) ) ) & 0xffffffffffffffff ) - -#define U64BIG(a) \ - ( ( ROTL64(a, 8) & 0x000000FF000000FF ) | \ - ( ROTL64(a,24) & 0x0000FF000000FF00 ) | \ - ( ROTL64(a,40) & 0x00FF000000FF0000 ) | \ - ( ROTL64(a,56) & 0xFF000000FF000000 ) ) +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen ) { int i; - ctx->hashlen = hashlen; SET_CONSTANTS(); if (ctx->chaining == NULL || ctx->buffer == NULL) return 1; - for ( i = 0; i < SIZE512; i++ ) - { - ctx->chaining[i] = m512_zero; - ctx->buffer[i] = m512_zero; - } + memset_zero_512( ctx->chaining, SIZE512 ); + memset_zero_512( ctx->buffer, SIZE512 ); // The only non-zero in the IV is len. It can be hard coded. 
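// (The IV of Groestl-512 is an all-zero 1024 bit state with the output
// length, 512 = 0x0200, big-endian in its last 8 bytes. The constant
// stored here is that length word already in its post-INIT transposed
// position, which is what lets the INIT transform be skipped, and
// m512_const2_64 repeats the 128 bit pattern into all four lanes.)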
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 ); -// uint64_t len = U64BIG((uint64_t)LENGTH); -// ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 ); -// INIT_4way(ctx->chaining); ctx->buf_ptr = 0; ctx->rem_ptr = 0; @@ -58,7 +42,7 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output, const void* input, uint64_t databitlen ) { const int len = (int)databitlen / 128; - const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hashlen_m128i = 64 / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; int rem = ctx->rem_ptr; int blocks = len / SIZE512; @@ -67,16 +51,13 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output, // --- update --- - // digest any full blocks, process directly from input for ( i = 0; i < blocks; i++ ) TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] ); ctx->buf_ptr = blocks * SIZE512; - // copy any remaining data to buffer, it may already contain data - // from a previous update for a midstate precalc for ( i = 0; i < len % SIZE512; i++ ) ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; - i += rem; // use i as rem_ptr in final + i += rem; //--- final --- @@ -90,23 +71,71 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output, } else { - // add first padding ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); - // add zero padding for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = m512_zero; - - // add length padding, second last byte is zero unless blocks > 255 ctx->buffer[i] = m512_const1_128( _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); } -// digest final padding block and do output transform TF1024_4way( ctx->chaining, ctx->buffer ); - OF1024_4way( ctx->chaining ); - // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} + +int groestl512_4way_full( groestl512_4way_context* ctx, void* output, + const void* input, uint64_t datalen ) +{ + const int len = (int)datalen >> 4; + const int hashlen_m128i = 64 >> 4; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; + uint64_t blocks = len / SIZE512; + __m512i* in = (__m512i*)input; + int i; + + // --- init --- + + SET_CONSTANTS(); + memset_zero_512( ctx->chaining, SIZE512 ); + memset_zero_512( ctx->buffer, SIZE512 ); + ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + // --- update --- + + for ( i = 0; i < blocks; i++ ) + TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] ); + ctx->buf_ptr = blocks * SIZE512; + + for ( i = 0; i < len % SIZE512; i++ ) + ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ]; + i += ctx->rem_ptr; + + // --- close --- + + blocks++; + + if ( i == SIZE512 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 ); + } + else + { + ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); + for ( i += 1; i < SIZE512 - 1; i++ ) + ctx->buffer[i] = m512_zero; + ctx->buffer[i] = m512_const2_64( blocks << 56, 0 ); + } + + TF1024_4way( ctx->chaining, ctx->buffer ); + OF1024_4way( ctx->chaining ); + for ( i = 0; i < hashlen_m128i; i++ ) casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ]; diff --git a/algo/groestl/groestl512-hash-4way.h b/algo/groestl/groestl512-hash-4way.h index ab3acc6..68ac7e5 100644 --- a/algo/groestl/groestl512-hash-4way.h +++ b/algo/groestl/groestl512-hash-4way.h @@ -1,11 +1,3 @@ 
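For orientation, a minimal sketch (not part of the patch) of how the new one-shot groestl512_4way_full() would be driven from a 4-way algo. The intrlv_4x128/dintrlv_4x128 helper names and signatures are assumed from simd-utils/intrlv.h, and datalen is in bytes as the code above implies:

   #include "algo/groestl/groestl512-hash-4way.h"
   #include "simd-utils.h"

   // Hash four independent 80 byte block headers in one pass.
   static void groestl512_hash80_4way( void *hash0, void *hash1,
                                       void *hash2, void *hash3,
                                       const void *in0, const void *in1,
                                       const void *in2, const void *in3 )
   {
      __m512i vin[5] __attribute__ ((aligned (64)));  // 4 lanes x 80 bytes
      __m512i vout[4] __attribute__ ((aligned (64))); // 4 lanes x 64 bytes
      groestl512_4way_context ctx;

      // interleave the four inputs 128 bits at a time
      intrlv_4x128( vin, in0, in1, in2, in3, 640 );

      // init, update and close in a single call, length in bytes
      groestl512_4way_full( &ctx, vout, vin, 80 );

      // de-interleave the four 512 bit digests
      dintrlv_4x128( hash0, hash1, hash2, hash3, vout, 512 );
   }

Folding init, update and close into one call is presumably what lets callers drop the usual per-nonce memcpy of a pre-initialized context, the same pattern behind the other _full entry points added in this patch.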
-/* hash.h Aug 2011 - * - * Groestl implementation for different versions. - * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer - * - * This code is placed in the public domain - */ - #if !defined(GROESTL512_HASH_4WAY_H__) #define GROESTL512_HASH_4WAY_H__ 1 @@ -18,11 +10,9 @@ #endif #include -#define LENGTH (512) +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -//#include "brg_endian.h" -//#define NEED_UINT_64T -//#include "algo/sha/brg_types.h" +#define LENGTH (512) /* some sizes (number of bytes) */ #define ROWS (8) @@ -44,34 +34,11 @@ #define ROUNDS (ROUNDS1024) //#endif -/* -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) - -#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) -#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) -#define U64BIG(a) (a) -#endif // IS_BIG_ENDIAN - -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) -#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) -#define U64BIG(a) \ - ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ - (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ - (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ - (ROTL64(a,56) & li_64(FF000000FF000000))) -#endif // IS_LITTLE_ENDIAN - -typedef unsigned char BitSequence_gr; -typedef unsigned long long DataLength_gr; -typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr; -*/ - #define SIZE512 (SIZE_1024/16) typedef struct { __attribute__ ((aligned (128))) __m512i chaining[SIZE512]; __attribute__ ((aligned (64))) __m512i buffer[SIZE512]; - int hashlen; // byte int blk_count; // SIZE_m128i int buf_ptr; // __m128i offset int rem_ptr; @@ -85,10 +52,11 @@ int groestl512_4way_init( groestl512_4way_context*, uint64_t ); int groestl512_4way_update( groestl512_4way_context*, const void*, uint64_t ); - int groestl512_4way_close( groestl512_4way_context*, void* ); - int groestl512_4way_update_close( groestl512_4way_context*, void*, const void*, uint64_t ); +int groestl512_4way_full( groestl512_4way_context*, void*, + const void*, uint64_t ); -#endif /* __hash_h */ +#endif // VAES +#endif // GROESTL512_HASH_4WAY_H__ diff --git a/algo/groestl/myr-groestl.c b/algo/groestl/myr-groestl.c index bff0360..f52bc96 100644 --- a/algo/groestl/myr-groestl.c +++ b/algo/groestl/myr-groestl.c @@ -1,22 +1,20 @@ #include "myrgr-gate.h" - #include #include #include #include - -#ifdef NO_AES_NI - #include "sph_groestl.h" -#else +#ifdef __AES__ #include "aes_ni/hash-groestl.h" +#else + #include "sph_groestl.h" #endif #include typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; -#else +#ifdef __AES__ hashState_groestl groestl; +#else + sph_groestl512_context groestl; #endif SHA256_CTX sha; } myrgr_ctx_holder; @@ -25,10 +23,10 @@ myrgr_ctx_holder myrgr_ctx; void init_myrgr_ctx() { -#ifdef NO_AES_NI - sph_groestl512_init( &myrgr_ctx.groestl ); -#else +#ifdef __AES__ init_groestl ( &myrgr_ctx.groestl, 64 ); +#else + sph_groestl512_init( &myrgr_ctx.groestl ); #endif SHA256_Init( &myrgr_ctx.sha ); } @@ -40,12 +38,12 @@ void myriad_hash(void *output, const void *input) uint32_t _ALIGN(32) hash[16]; -#ifdef NO_AES_NI - sph_groestl512(&ctx.groestl, input, 80); - sph_groestl512_close(&ctx.groestl, hash); -#else +#ifdef __AES__ update_groestl( &ctx.groestl, (char*)input, 640 ); final_groestl( &ctx.groestl, (char*)hash); +#else + sph_groestl512(&ctx.groestl, input, 80); + sph_groestl512_close(&ctx.groestl, hash); #endif SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 ); diff --git 
a/algo/heavy/bastion.c b/algo/heavy/bastion.c index e962bcd..9c17661 100644 --- a/algo/heavy/bastion.c +++ b/algo/heavy/bastion.c @@ -1,13 +1,10 @@ #include "algo-gate-api.h" - #include #include #include #include #include - #include "sph_hefty1.h" - #include "algo/luffa/sph_luffa.h" #include "algo/fugue/sph_fugue.h" #include "algo/skein/sph_skein.h" @@ -16,8 +13,7 @@ #include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/luffa/luffa_for_sse2.h" - -#ifndef NO_AES_NI +#ifdef __AES__ #include "algo/echo/aes_ni/hash_api.h" #endif @@ -25,30 +21,23 @@ void bastionhash(void *output, const void *input) { unsigned char hash[64] __attribute__ ((aligned (64))); -#ifdef NO_AES_NI - sph_echo512_context ctx_echo; +#ifdef __AES__ + hashState_echo ctx_echo; #else - hashState_echo ctx_echo; + sph_echo512_context ctx_echo; #endif - hashState_luffa ctx_luffa; + hashState_luffa ctx_luffa; sph_fugue512_context ctx_fugue; sph_whirlpool_context ctx_whirlpool; sph_shabal512_context ctx_shabal; sph_hamsi512_context ctx_hamsi; sph_skein512_context ctx_skein; -// unsigned char hashbuf[128] __attribute__ ((aligned (16))); -// sph_u64 hashctA; -// sph_u64 hashctB; -// size_t hashptr; - HEFTY1(input, 80, hash); init_luffa( &ctx_luffa, 512 ); update_and_final_luffa( &ctx_luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); -// update_luffa( &ctx_luffa, hash, 64 ); -// final_luffa( &ctx_luffa, hash ); if (hash[0] & 0x8) { @@ -71,23 +60,19 @@ void bastionhash(void *output, const void *input) if (hash[0] & 0x8) { -#ifdef NO_AES_NI +#ifdef __AES__ + init_echo( &ctx_echo, 512 ); + update_final_echo ( &ctx_echo,(BitSequence*)hash, + (const BitSequence*)hash, 512 ); +#else sph_echo512_init(&ctx_echo); sph_echo512(&ctx_echo, hash, 64); sph_echo512_close(&ctx_echo, hash); -#else - init_echo( &ctx_echo, 512 ); - update_final_echo ( &ctx_echo,(BitSequence*)hash, - (const BitSequence*)hash, 512 ); -// update_echo ( &ctx_echo, hash, 512 ); -// final_echo( &ctx_echo, hash ); #endif } else { - init_luffa( &ctx_luffa, 512 ); - update_and_final_luffa( &ctx_luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); -// update_luffa( &ctx_luffa, hash, 64 ); -// final_luffa( &ctx_luffa, hash ); + init_luffa( &ctx_luffa, 512 ); + update_and_final_luffa( &ctx_luffa, (BitSequence*)hash, + (const BitSequence*)hash, 64 ); } sph_shabal512_init(&ctx_shabal); @@ -119,11 +104,9 @@ void bastionhash(void *output, const void *input) sph_hamsi512(&ctx_hamsi, hash, 64); sph_hamsi512_close(&ctx_hamsi, hash); } else { - init_luffa( &ctx_luffa, 512 ); - update_and_final_luffa( &ctx_luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); -// update_luffa( &ctx_luffa, hash, 64 ); -// final_luffa( &ctx_luffa, hash ); + init_luffa( &ctx_luffa, 512 ); + update_and_final_luffa( &ctx_luffa, (BitSequence*)hash, + (const BitSequence*)hash, 64 ); } memcpy(output, hash, 32); diff --git a/algo/hodl/hodl-gate.c b/algo/hodl/hodl-gate.c index 2c0b030..4f575dd 100644 --- a/algo/hodl/hodl-gate.c +++ b/algo/hodl/hodl-gate.c @@ -161,7 +161,7 @@ bool register_hodl_algo( algo_gate_t* gate ) // return false; // } pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads ); - gate->optimizations = AES_OPT | AVX_OPT | AVX2_OPT; + gate->optimizations = SSE42_OPT | AES_OPT | AVX2_OPT; gate->scanhash = (void*)&hodl_scanhash; gate->get_new_work = (void*)&hodl_get_new_work; gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call; diff --git a/algo/jh/jh-hash-4way.c b/algo/jh/jh-hash-4way.c index a0bc6b7..452bc8a 100644 --- 
a/algo/jh/jh-hash-4way.c +++ b/algo/jh/jh-hash-4way.c @@ -41,57 +41,10 @@ extern "C"{ #endif - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH -#define SPH_SMALL_FOOTPRINT_JH 1 -#endif - -#if !defined SPH_JH_64 && SPH_64_TRUE -#define SPH_JH_64 1 -#endif - -#if !SPH_64 -#undef SPH_JH_64 -#endif - #ifdef _MSC_VER #pragma warning (disable: 4146) #endif -/* - * The internal bitslice representation may use either big-endian or - * little-endian (true bitslice operations do not care about the bit - * ordering, and the bit-swapping linear operations in JH happen to - * be invariant through endianness-swapping). The constants must be - * defined according to the chosen endianness; we use some - * byte-swapping macros for that. - */ - -#if SPH_LITTLE_ENDIAN - -#if SPH_64 -#define C64e(x) ((SPH_C64(x) >> 56) \ - | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ - | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ - | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ - | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ - | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ - | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ - | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) -#define dec64e_aligned sph_dec64le_aligned -#define enc64e sph_enc64le -#endif - -#else - -#if SPH_64 -#define C64e(x) SPH_C64(x) -#define dec64e_aligned sph_dec64be_aligned -#define enc64e sph_enc64be -#endif - -#endif - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define Sb_8W(x0, x1, x2, x3, c) \ @@ -152,8 +105,97 @@ do { \ x3 = _mm256_xor_si256( x3, x4 ); \ } while (0) -#if SPH_JH_64 +static const uint64_t C[] = +{ + 0x67f815dfa2ded572, 0x571523b70a15847b, + 0xf6875a4d90d6ab81, 0x402bd1c3c54f9f4e, + 0x9cfa455ce03a98ea, 0x9a99b26699d2c503, + 0x8a53bbf2b4960266, 0x31a2db881a1456b5, + 0xdb0e199a5c5aa303, 0x1044c1870ab23f40, + 0x1d959e848019051c, 0xdccde75eadeb336f, + 0x416bbf029213ba10, 0xd027bbf7156578dc, + 0x5078aa3739812c0a, 0xd3910041d2bf1a3f, + 0x907eccf60d5a2d42, 0xce97c0929c9f62dd, + 0xac442bc70ba75c18, 0x23fcc663d665dfd1, + 0x1ab8e09e036c6e97, 0xa8ec6c447e450521, + 0xfa618e5dbb03f1ee, 0x97818394b29796fd, + 0x2f3003db37858e4a, 0x956a9ffb2d8d672a, + 0x6c69b8f88173fe8a, 0x14427fc04672c78a, + 0xc45ec7bd8f15f4c5, 0x80bb118fa76f4475, + 0xbc88e4aeb775de52, 0xf4a3a6981e00b882, + 0x1563a3a9338ff48e, 0x89f9b7d524565faa, + 0xfde05a7c20edf1b6, 0x362c42065ae9ca36, + 0x3d98fe4e433529ce, 0xa74b9a7374f93a53, + 0x86814e6f591ff5d0, 0x9f5ad8af81ad9d0e, + 0x6a6234ee670605a7, 0x2717b96ebe280b8b, + 0x3f1080c626077447, 0x7b487ec66f7ea0e0, + 0xc0a4f84aa50a550d, 0x9ef18e979fe7e391, + 0xd48d605081727686, 0x62b0e5f3415a9e7e, + 0x7a205440ec1f9ffc, 0x84c9f4ce001ae4e3, + 0xd895fa9df594d74f, 0xa554c324117e2e55, + 0x286efebd2872df5b, 0xb2c4a50fe27ff578, + 0x2ed349eeef7c8905, 0x7f5928eb85937e44, + 0x4a3124b337695f70, 0x65e4d61df128865e, + 0xe720b95104771bc7, 0x8a87d423e843fe74, + 0xf2947692a3e8297d, 0xc1d9309b097acbdd, + 0xe01bdc5bfb301b1d, 0xbf829cf24f4924da, + 0xffbf70b431bae7a4, 0x48bcf8de0544320d, + 0x39d3bb5332fcae3b, 0xa08b29e0c1c39f45, + 0x0f09aef7fd05c9e5, 0x34f1904212347094, + 0x95ed44e301b771a2, 0x4a982f4f368e3be9, + 0x15f66ca0631d4088, 0xffaf52874b44c147, + 0x30c60ae2f14abb7e, 0xe68c6eccc5b67046, + 0x00ca4fbd56a4d5a4, 0xae183ec84b849dda, + 0xadd1643045ce5773, 0x67255c1468cea6e8, + 0x16e10ecbf28cdaa3, 0x9a99949a5806e933, + 0x7b846fc220b2601f, 0x1885d1a07facced1, + 0xd319dd8da15b5932, 0x46b4a5aac01c9a50, + 0xba6b04e467633d9f, 0x7eee560bab19caf6, + 
0x742128a9ea79b11f, 0xee51363b35f7bde9, + 0x76d350755aac571d, 0x01707da3fec2463a, + 0x42d8a498afc135f7, 0x79676b9e20eced78, + 0xa8db3aea15638341, 0x832c83324d3bc3fa, + 0xf347271c1f3b40a7, 0x9a762db734f04059, + 0xfd4f21d26c4e3ee7, 0xef5957dc398dfdb8, + 0xdaeb492b490c9b8d, 0x0d70f36849d7a25b, + 0x84558d7ad0ae3b7d, 0x658ef8e4f0e9a5f5, + 0x533b1036f4a2b8a0, 0x5aec3e759e07a80c, + 0x4f88e85692946891, 0x4cbcbaf8555cb05b, + 0x7b9487f3993bbbe3, 0x5d1c6b72d6f4da75, + 0x6db334dc28acae64, 0x71db28b850a5346c, + 0x2a518d10f2e261f8, 0xfc75dd593364dbe3, + 0xa23fce43f1bcac1c, 0xb043e8023cd1bb67, + 0x75a12988ca5b0a33, 0x5c5316b44d19347f, + 0x1e4d790ec3943b92, 0x3fafeeb6d7757479, + 0x21391abef7d4a8ea, 0x5127234c097ef45c, + 0xd23c32ba5324a326, 0xadd5a66d4a17a344, + 0x08c9f2afa63e1db5, 0x563c6b91983d5983, + 0x4d608672a17cf84c, 0xf6c76e08cc3ee246, + 0x5e76bcb1b333982f, 0x2ae6c4efa566d62b, + 0x36d4c1bee8b6f406, 0x6321efbc1582ee74, + 0x69c953f40d4ec1fd, 0x26585806c45a7da7, + 0x16fae0061614c17e, 0x3f9d63283daf907e, + 0x0cd29b00e3f2c9d2, 0x300cd4b730ceaa5f, + 0x9832e0f216512a74, 0x9af8cee3d830eb0d, + 0x9279f1b57b9ec54b, 0xd36886046ee651ff, + 0x316796e6574d239b, 0x05750a17f3a6e6cc, + 0xce6c3213d98176b1, 0x62a205f88452173c, + 0x47154778b3cb2bf4, 0x486a9323825446ff, + 0x65655e4e0758df38, 0x8e5086fc897cfcf2, + 0x86ca0bd0442e7031, 0x4e477830a20940f0, + 0x8338f7d139eea065, 0xbd3a2ce437e95ef7, + 0x6ff8130126b29721, 0xe7de9fefd1ed44a3, + 0xd992257615dfa08b, 0xbe42dc12f6f7853c, + 0x7eb027ab7ceca7d8, 0xdea83eaada7d8d53, + 0xd86902bd93ce25aa, 0xf908731afd43f65a, + 0xa5194a17daef5fc0, 0x6a21fd4c33664d97, + 0x701541db3198b435, 0x9b54cdedbb0f1eea, + 0x72409751a163d09a, 0xe26f4791bf9d75f6 +}; +// Big endian version + +/* static const sph_u64 C[] = { C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557), C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40), @@ -240,6 +282,7 @@ static const sph_u64 C[] = { C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b), C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2) }; +*/ #define Ceven_hi(r) (C[((r) << 2) + 0]) #define Ceven_lo(r) (C[((r) << 2) + 1]) @@ -427,7 +470,7 @@ do { \ h7h = _mm256_xor_si256( h7h, m3h ); \ h7l = _mm256_xor_si256( h7l, m3l ); \ - +/* static const sph_u64 IV256[] = { C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1), C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03), @@ -450,11 +493,8 @@ static const sph_u64 IV512[] = { C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156), C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b) }; +*/ -#else - - -#endif #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -484,57 +524,6 @@ static const sph_u64 IV512[] = { W ## ro(h7); \ } while (0) -#if SPH_SMALL_FOOTPRINT_JH - -#if SPH_JH_64 - -/* - * The "small footprint" 64-bit version just uses a partially unrolled - * loop. - */ - -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -#define E8_8W do { \ - unsigned r; \ - for (r = 0; r < 42; r += 7) { \ - SL_8W(0); \ - SL_8W(1); \ - SL_8W(2); \ - SL_8W(3); \ - SL_8W(4); \ - SL_8W(5); \ - SL_8W(6); \ - } \ - } while (0) - -#endif - -#define E8 do { \ - unsigned r; \ - for (r = 0; r < 42; r += 7) { \ - SL(0); \ - SL(1); \ - SL(2); \ - SL(3); \ - SL(4); \ - SL(5); \ - SL(6); \ - } \ - } while (0) - -#else - - -#endif - -#else - -#if SPH_JH_64 - -/* - * On a "true 64-bit" architecture, we can unroll at will. 
- */ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -585,6 +574,7 @@ static const sph_u64 IV512[] = { #endif // AVX512 + #define E8 do { \ SLu( 0, 0); \ SLu( 1, 1); \ @@ -630,13 +620,6 @@ static const sph_u64 IV512[] = { SLu(41, 6); \ } while (0) -#else - - -#endif - -#endif - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) void jh256_8way_init( jh_8way_context *sc ) @@ -732,12 +715,12 @@ jh_8way_core( jh_8way_context *sc, const void *data, size_t len ) static void jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst, - size_t out_size_w32, const void *iv ) + size_t out_size_w32 ) { __m512i buf[16*4]; __m512i *dst512 = (__m512i*)dst; size_t numz, u; - sph_u64 l0, l1, l0e, l1e; + uint64_t l0, l1; buf[0] = m512_const1_64( 0x80ULL ); @@ -748,12 +731,10 @@ jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst, memset_zero_512( buf+1, (numz>>3) - 1 ); - l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3); - l1 = SPH_T64(sc->block_count >> 55); - sph_enc64be( &l0e, l0 ); - sph_enc64be( &l1e, l1 ); - *(buf + (numz>>3) ) = _mm512_set1_epi64( l1e ); - *(buf + (numz>>3) + 1) = _mm512_set1_epi64( l0e ); + l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 ); + l1 = ( sc->block_count >> 55 ); + *(buf + (numz>>3) ) = _mm512_set1_epi64( bswap_64( l1 ) ); + *(buf + (numz>>3) + 1) = _mm512_set1_epi64( bswap_64( l0 ) ); jh_8way_core( sc, buf, numz + 16 ); @@ -772,7 +753,7 @@ jh256_8way_update(void *cc, const void *data, size_t len) void jh256_8way_close(void *cc, void *dst) { - jh_8way_close(cc, 0, 0, dst, 8, IV256); + jh_8way_close(cc, 0, 0, dst, 8); } void @@ -784,7 +765,7 @@ jh512_8way_update(void *cc, const void *data, size_t len) void jh512_8way_close(void *cc, void *dst) { - jh_8way_close(cc, 0, 0, dst, 16, IV512); + jh_8way_close(cc, 0, 0, dst, 16); } #endif @@ -882,12 +863,12 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len ) static void jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst, - size_t out_size_w32, const void *iv ) + size_t out_size_w32 ) { __m256i buf[16*4]; __m256i *dst256 = (__m256i*)dst; size_t numz, u; - sph_u64 l0, l1, l0e, l1e; + uint64_t l0, l1; buf[0] = m256_const1_64( 0x80ULL ); @@ -898,12 +879,10 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst, memset_zero_256( buf+1, (numz>>3) - 1 ); - l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3); - l1 = SPH_T64(sc->block_count >> 55); - sph_enc64be( &l0e, l0 ); - sph_enc64be( &l1e, l1 ); - *(buf + (numz>>3) ) = _mm256_set1_epi64x( l1e ); - *(buf + (numz>>3) + 1) = _mm256_set1_epi64x( l0e ); + l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 ); + l1 = ( sc->block_count >> 55 ); + *(buf + (numz>>3) ) = _mm256_set1_epi64x( bswap_64( l1 ) ); + *(buf + (numz>>3) + 1) = _mm256_set1_epi64x( bswap_64( l0 ) ); jh_4way_core( sc, buf, numz + 16 ); @@ -922,7 +901,7 @@ jh256_4way_update(void *cc, const void *data, size_t len) void jh256_4way_close(void *cc, void *dst) { - jh_4way_close(cc, 0, 0, dst, 8, IV256); + jh_4way_close(cc, 0, 0, dst, 8 ); } void @@ -934,7 +913,7 @@ jh512_4way_update(void *cc, const void *data, size_t len) void jh512_4way_close(void *cc, void *dst) { - jh_4way_close(cc, 0, 0, dst, 16, IV512); + jh_4way_close(cc, 0, 0, dst, 16 ); } diff --git a/algo/jh/jh-hash-4way.h b/algo/jh/jh-hash-4way.h index 562fd5e..905fbaa 100644 --- a/algo/jh/jh-hash-4way.h +++ b/algo/jh/jh-hash-4way.h @@ -43,7 +43,6 @@ extern "C"{ #endif #include 
-#include "algo/sha/sph_types.h" #include "simd-utils.h" #define SPH_SIZE_jh256 256 diff --git a/algo/jh/jha-4way.c b/algo/jh/jha-4way.c index 68ffe7f..d514335 100644 --- a/algo/jh/jha-4way.c +++ b/algo/jh/jha-4way.c @@ -65,7 +65,7 @@ void jha_hash_4way( void *out, const void *input ) vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); blake512_4way_init( &ctx_blake ); - blake512_4way( &ctx_blake, vhash, 64 ); + blake512_4way_update( &ctx_blake, vhash, 64 ); blake512_4way_close( &ctx_blake, vhashA ); jh512_4way_init( &ctx_jh ); diff --git a/algo/jh/jha.c b/algo/jh/jha.c index 70467a7..d02b9e5 100644 --- a/algo/jh/jha.c +++ b/algo/jh/jha.c @@ -1,19 +1,16 @@ #include "jha-gate.h" - #include #include #include #include - #include "algo/blake/sph_blake.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" - -#ifdef NO_AES_NI - #include "algo/groestl/sph_groestl.h" -#else +#ifdef __AES__ #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" #endif static __thread sph_keccak512_context jha_kec_mid __attribute__ ((aligned (64))); @@ -28,12 +25,12 @@ void jha_hash(void *output, const void *input) { uint8_t _ALIGN(128) hash[64]; -#ifdef NO_AES_NI - sph_groestl512_context ctx_groestl; +#ifdef __AES__ + hashState_groestl ctx_groestl; #else - hashState_groestl ctx_groestl; + sph_groestl512_context ctx_groestl; #endif - sph_blake512_context ctx_blake; + sph_blake512_context ctx_blake; sph_jh512_context ctx_jh; sph_keccak512_context ctx_keccak; sph_skein512_context ctx_skein; @@ -46,36 +43,36 @@ void jha_hash(void *output, const void *input) for (int round = 0; round < 3; round++) { if (hash[0] & 0x01) - { -#ifdef NO_AES_NI - sph_groestl512_init(&ctx_groestl); - sph_groestl512(&ctx_groestl, hash, 64 ); - sph_groestl512_close(&ctx_groestl, hash ); + { +#ifdef __AES__ + init_groestl( &ctx_groestl, 64 ); + update_and_final_groestl( &ctx_groestl, (char*)hash, + (char*)hash, 512 ); #else - init_groestl( &ctx_groestl, 64 ); - update_and_final_groestl( &ctx_groestl, (char*)hash, - (char*)hash, 512 ); + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, hash, 64 ); + sph_groestl512_close(&ctx_groestl, hash ); #endif - } - else - { - sph_skein512_init(&ctx_skein); - sph_skein512(&ctx_skein, hash, 64); - sph_skein512_close(&ctx_skein, hash ); - } + } + else + { + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hash ); + } - if (hash[0] & 0x01) - { - sph_blake512_init(&ctx_blake); - sph_blake512(&ctx_blake, hash, 64); - sph_blake512_close(&ctx_blake, hash ); - } - else - { - sph_jh512_init(&ctx_jh); - sph_jh512(&ctx_jh, hash, 64 ); - sph_jh512_close(&ctx_jh, hash ); - } + if (hash[0] & 0x01) + { + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, hash, 64); + sph_blake512_close(&ctx_blake, hash ); + } + else + { + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, hash, 64 ); + sph_jh512_close(&ctx_jh, hash ); + } } memcpy(output, hash, 32); diff --git a/algo/keccak/keccak-4way.c b/algo/keccak/keccak-4way.c index 73f5d9c..07ad122 100644 --- a/algo/keccak/keccak-4way.c +++ b/algo/keccak/keccak-4way.c @@ -39,7 +39,7 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, keccakhash_8way( hash, vdata ); for ( int lane = 0; lane < 8; lane++ ) - if ( hash7[ lane<<1 ] < Htarg ) + if ( hash7[ lane<<1 ] <= Htarg ) { extr_lane_8x64( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) @@ -89,7 +89,7 @@ int 
scanhash_keccak_4way( struct work *work, uint32_t max_nonce, keccakhash_4way( hash, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( hash7[ lane<<1 ] < Htarg ) + if ( hash7[ lane<<1 ] <= Htarg ) { extr_lane_4x64( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index 4108be6..46d08cf 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -163,7 +163,7 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst, unsigned eb; union { __m512i tmp[lim + 1]; - sph_u64 dummy; /* for alignment */ + uint64_t dummy; /* for alignment */ } u; size_t j; size_t m512_len = byte_len >> 3; @@ -344,7 +344,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len, unsigned eb; union { __m256i tmp[lim + 1]; - sph_u64 dummy; /* for alignment */ + uint64_t dummy; /* for alignment */ } u; size_t j; size_t m256_len = byte_len >> 3; diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h index a353856..5b91bcf 100644 --- a/algo/keccak/keccak-hash-4way.h +++ b/algo/keccak/keccak-hash-4way.h @@ -43,16 +43,8 @@ extern "C"{ #ifdef __AVX2__ #include -#include "algo/sha/sph_types.h" #include "simd-utils.h" -#define SPH_SIZE_keccak256 256 - -/** - * Output size (in bits) for Keccak-512. - */ -#define SPH_SIZE_keccak512 512 - /** * This structure is a context for Keccak computations: it contains the * intermediate values and some data from the last entered block. Once a diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index b54e3fb..aad56b6 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -459,6 +459,11 @@ int luffa_4way_init( luffa_4way_context *state, int hashbitlen ) return 0; } +int luffa512_4way_init( luffa_4way_context *state ) +{ + return luffa_4way_init( state, 512 ); +} + // Do not call luffa_update_close after having called luffa_update. // Once luffa_update has been called only call luffa_update or luffa_close. 
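// Editor's sketch (not part of the patch): the call-ordering rule above
// in practice. Assumes per-lane byte lengths, as at the call sites in
// this patch, with the input 4-way interleaved so one 32-byte-per-lane
// block occupies 128 bytes of the buffer.
//
//    luffa_4way_context ctx;
//
//    // Streaming: any number of update calls, then close pads and
//    // finalizes whatever remains in the buffer.
//    luffa_4way_init( &ctx, 512 );
//    luffa_4way_update( &ctx, data, 64 );     // two full blocks per lane
//    luffa_4way_close( &ctx, hash );
//
//    // One shot: update_close absorbs, pads and finalizes in a single
//    // call. Use it only on a freshly initialized context, never after
//    // luffa_4way_update.
//    luffa_4way_init( &ctx, 512 );
//    luffa_4way_update_close( &ctx, hash, data, 80 );  // 80 = 2 blocks + 16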
int luffa_4way_update( luffa_4way_context *state, const void *data, @@ -496,6 +501,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data, return 0; } +/* +int luffa512_4way_update( luffa_4way_context *state, const void *data, + size_t len ) +{ + return luffa_4way_update( state, data, len ); +} +*/ + int luffa_4way_close( luffa_4way_context *state, void *hashval ) { __m512i *buffer = (__m512i*)state->buffer; @@ -518,6 +531,77 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval ) return 0; } +/* +int luffa512_4way_close( luffa_4way_context *state, void *hashval ) +{ + return luffa_4way_close( state, hashval ); +} +*/ + +int luffa512_4way_full( luffa_4way_context *state, void *output, + const void *data, size_t inlen ) +{ + state->hashbitlen = 512; + __m128i *iv = (__m128i*)IV; + + state->chainv[0] = m512_const1_128( iv[0] ); + state->chainv[1] = m512_const1_128( iv[1] ); + state->chainv[2] = m512_const1_128( iv[2] ); + state->chainv[3] = m512_const1_128( iv[3] ); + state->chainv[4] = m512_const1_128( iv[4] ); + state->chainv[5] = m512_const1_128( iv[5] ); + state->chainv[6] = m512_const1_128( iv[6] ); + state->chainv[7] = m512_const1_128( iv[7] ); + state->chainv[8] = m512_const1_128( iv[8] ); + state->chainv[9] = m512_const1_128( iv[9] ); + + ((__m512i*)state->buffer)[0] = m512_zero; + ((__m512i*)state->buffer)[1] = m512_zero; + + const __m512i *vdata = (__m512i*)data; + __m512i msg[2]; + int i; + const int blocks = (int)( inlen >> 5 ); + const __m512i shuff_bswap32 = m512_const_64( + 0x3c3d3e3f38393a3b, 0x3435363730313233, + 0x2c2d2e2f28292a2b, 0x2425262720212223, + 0x1c1d1e1f18191a1b, 0x1415161710111213, + 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + + state->rembytes = inlen & 0x1F; + + // full blocks + for ( i = 0; i < blocks; i++, vdata+=2 ) + { + msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); + msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 ); + rnd512_4way( state, msg ); + } + + // 16 byte partial block exists for 80 byte len + if ( state->rembytes ) + { + // padding of partial block + msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); + msg[1] = m512_const2_64( 0, 0x0000000080000000 ); + rnd512_4way( state, msg ); + } + else + { + // empty pad block + msg[0] = m512_const2_64( 0, 0x0000000080000000 ); + msg[1] = m512_zero; + rnd512_4way( state, msg ); + } + + finalization512_4way( state, (uint32*)output ); + + if ( state->hashbitlen > 512 ) + finalization512_4way( state, (uint32*)( output+64 ) ); + + return 0; +} + int luffa_4way_update_close( luffa_4way_context *state, void *output, const void *data, size_t inlen ) { @@ -1031,6 +1115,69 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval ) return 0; } +int luffa512_2way_full( luffa_2way_context *state, void *output, + const void *data, size_t inlen ) +{ + state->hashbitlen = 512; + __m128i *iv = (__m128i*)IV; + + state->chainv[0] = m256_const1_128( iv[0] ); + state->chainv[1] = m256_const1_128( iv[1] ); + state->chainv[2] = m256_const1_128( iv[2] ); + state->chainv[3] = m256_const1_128( iv[3] ); + state->chainv[4] = m256_const1_128( iv[4] ); + state->chainv[5] = m256_const1_128( iv[5] ); + state->chainv[6] = m256_const1_128( iv[6] ); + state->chainv[7] = m256_const1_128( iv[7] ); + state->chainv[8] = m256_const1_128( iv[8] ); + state->chainv[9] = m256_const1_128( iv[9] ); + + ((__m256i*)state->buffer)[0] = m256_zero; + ((__m256i*)state->buffer)[1] = m256_zero; + + const __m256i *vdata = (__m256i*)data; + __m256i msg[2]; + int i; + const int blocks = (int)( 
inlen >> 5 ); + const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, + 0x1415161710111213, + 0x0c0d0e0f08090a0b, + 0x0405060700010203 ); + + state->rembytes = inlen & 0x1F; + + // full blocks + for ( i = 0; i < blocks; i++, vdata+=2 ) + { + msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); + msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 ); + rnd512_2way( state, msg ); + } + + // 16 byte partial block exists for 80 byte len + if ( state->rembytes ) + { + // padding of partial block + msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); + msg[1] = m256_const2_64( 0, 0x0000000080000000 ); + rnd512_2way( state, msg ); + } + else + { + // empty pad block + msg[0] = m256_const2_64( 0, 0x0000000080000000 ); + msg[1] = m256_zero; + rnd512_2way( state, msg ); + } + + finalization512_2way( state, (uint32*)output ); + + if ( state->hashbitlen > 512 ) + finalization512_2way( state, (uint32*)( output+32 ) ); + + return 0; +} + int luffa_2way_update_close( luffa_2way_context *state, void *output, const void *data, size_t inlen ) { diff --git a/algo/luffa/luffa-hash-2way.h b/algo/luffa/luffa-hash-2way.h index 0a27301..ba9bc14 100644 --- a/algo/luffa/luffa-hash-2way.h +++ b/algo/luffa/luffa-hash-2way.h @@ -61,11 +61,23 @@ typedef struct { } luffa_4way_context __attribute((aligned(128))); int luffa_4way_init( luffa_4way_context *state, int hashbitlen ); -int luffa_4way_update( luffa_4way_context *state, const void *data, - size_t len ); -int luffa_4way_close( luffa_4way_context *state, void *hashval ); +//int luffa_4way_update( luffa_4way_context *state, const void *data, +// size_t len ); +//int luffa_4way_close( luffa_4way_context *state, void *hashval ); int luffa_4way_update_close( luffa_4way_context *state, void *output, const void *data, size_t inlen ); +int luffa512_4way_full( luffa_4way_context *state, void *output, + const void *data, size_t inlen ); +int luffa512_4way_init( luffa_4way_context *state ); +int luffa512_4way_update( luffa_4way_context *state, const void *data, + size_t len ); +int luffa512_4way_close( luffa_4way_context *state, void *hashval ); +int luffa512_4way_update_close( luffa_4way_context *state, void *output, + const void *data, size_t inlen ); + +#define luffa_4way_update luffa512_4way_update +#define luffa_4way_close luffa512_4way_close +#define luffa_4way_update_close luffa512_4way_update_close #endif @@ -82,6 +94,8 @@ int luffa_2way_update( luffa_2way_context *state, const void *data, int luffa_2way_close( luffa_2way_context *state, void *hashval ); int luffa_2way_update_close( luffa_2way_context *state, void *output, const void *data, size_t inlen ); +int luffa512_2way_full( luffa_2way_context *state, void *output, + const void *data, size_t inlen ); #endif #endif diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index c06f813..318ce26 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -7,33 +7,44 @@ #include "algo/cubehash/cubehash_sse2.h" #include "algo/cubehash/cube-hash-2way.h" #include "algo/groestl/aes_ni/hash-groestl256.h" +#if defined(__VAES__) + #include "algo/groestl/groestl256-hash-4way.h" +#endif -#if defined (ALLIUM_8WAY) +#if defined (ALLIUM_16WAY) typedef struct { - blake256_8way_context blake; + blake256_16way_context blake; keccak256_8way_context keccak; cube_4way_context cube; skein256_8way_context skein; +#if defined(__VAES__) + groestl256_4way_context groestl; +#else hashState_groestl256 groestl; -} allium_8way_ctx_holder; +#endif +} allium_16way_ctx_holder; -static __thread 
allium_8way_ctx_holder allium_8way_ctx; +static __thread allium_16way_ctx_holder allium_16way_ctx; -bool init_allium_8way_ctx() +bool init_allium_16way_ctx() { - keccak256_8way_init( &allium_8way_ctx.keccak ); - cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 ); - skein256_8way_init( &allium_8way_ctx.skein ); - init_groestl256( &allium_8way_ctx.groestl, 32 ); + keccak256_8way_init( &allium_16way_ctx.keccak ); + cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 ); + skein256_8way_init( &allium_16way_ctx.skein ); +#if defined(__VAES__) + groestl256_4way_init( &allium_16way_ctx.groestl, 32 ); +#else + init_groestl256( &allium_16way_ctx.groestl, 32 ); +#endif return true; } -void allium_8way_hash( void *state, const void *input ) +void allium_16way_hash( void *state, const void *input ) { - uint32_t vhash[8*8] __attribute__ ((aligned (128))); - uint32_t vhashA[8*8] __attribute__ ((aligned (64))); - uint32_t vhashB[8*8] __attribute__ ((aligned (64))); + uint32_t vhash[16*8] __attribute__ ((aligned (128))); + uint32_t vhashA[16*8] __attribute__ ((aligned (64))); + uint32_t vhashB[16*8] __attribute__ ((aligned (64))); uint32_t hash0[8] __attribute__ ((aligned (64))); uint32_t hash1[8] __attribute__ ((aligned (64))); uint32_t hash2[8] __attribute__ ((aligned (64))); @@ -42,18 +53,39 @@ void allium_8way_hash( void *state, const void *input ) uint32_t hash5[8] __attribute__ ((aligned (64))); uint32_t hash6[8] __attribute__ ((aligned (64))); uint32_t hash7[8] __attribute__ ((aligned (64))); - allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); + uint32_t hash8[8] __attribute__ ((aligned (64))); + uint32_t hash9[8] __attribute__ ((aligned (64))); + uint32_t hash10[8] __attribute__ ((aligned (64))); + uint32_t hash11[8] __attribute__ ((aligned (64))); + uint32_t hash12[8] __attribute__ ((aligned (64))); + uint32_t hash13[8] __attribute__ ((aligned (64))); + uint32_t hash14[8] __attribute__ ((aligned (64))); + uint32_t hash15[8] __attribute__ ((aligned (64))); + allium_16way_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) ); - blake256_8way_update( &ctx.blake, input + (64<<3), 16 ); - blake256_8way_close( &ctx.blake, vhash ); + memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) ); + blake256_16way_update( &ctx.blake, input + (64<<4), 16 ); + blake256_16way_close( &ctx.blake, vhash ); - rintrlv_8x32_8x64( vhashA, vhash, 256 ); + dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, + vhash, 256 ); + intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + 256 ); + intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14, + hash15, 256 ); + +// rintrlv_8x32_8x64( vhashA, vhash, 256 ); keccak256_8way_update( &ctx.keccak, vhashA, 32 ); - keccak256_8way_close( &ctx.keccak, vhash ); + keccak256_8way_close( &ctx.keccak, vhashA); + keccak256_8way_init( &ctx.keccak ); + keccak256_8way_update( &ctx.keccak, vhashB, 32 ); + keccak256_8way_close( &ctx.keccak, vhashB); dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash, 256 ); + vhashA, 256 ); + dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, + vhashB, 256 ); intrlv_2x256( vhash, hash0, hash1, 256 ); LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); @@ -67,6 +99,18 @@ void allium_8way_hash( void *state, const void *input ) intrlv_2x256( vhash, hash6, hash7, 256 ); LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); 
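// Editor's note (not part of the patch): every lane pair here follows the
// same three-step pattern -- interleave two 256-bit hashes into one 2-way
// buffer, run LYRA2RE_2WAY, then deinterleave; the unpack for hash6/hash7
// continues just below and the pattern repeats for lanes 8-15. A scalar
// sketch of the assumed 2x256 layout (alternating 256-bit blocks of lane 0
// and lane 1); the real SIMD version lives in simd-utils/intrlv.h:
//
//    static void intrlv_2x256_sketch( uint64_t *v, const uint64_t *h0,
//                                     const uint64_t *h1, int bitlen )
//    {
//       for ( int i = 0; i < bitlen / 256; i++ )
//       {
//          memcpy( v + i*8,     h0 + i*4, 32 );  // 256 bits of lane 0
//          memcpy( v + i*8 + 4, h1 + i*4, 32 );  // then 256 bits of lane 1
//       }
//    }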
dintrlv_2x256( hash6, hash7, vhash, 256 ); + intrlv_2x256( vhash, hash8, hash9, 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); + dintrlv_2x256( hash8, hash9, vhash, 256 ); + intrlv_2x256( vhash, hash10, hash11, 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); + dintrlv_2x256( hash10, hash11, vhash, 256 ); + intrlv_2x256( vhash, hash12, hash13, 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); + dintrlv_2x256( hash12, hash13, vhash, 256 ); + intrlv_2x256( vhash, hash14, hash15, 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); + dintrlv_2x256( hash14, hash15, vhash, 256 ); intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 ); @@ -78,6 +122,17 @@ void allium_8way_hash( void *state, const void *input ) dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 ); + intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 ); + intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 ); + + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 ); + cube_4way_init( &ctx.cube, 256, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 ); + + dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 ); + dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 ); + intrlv_2x256( vhash, hash0, hash1, 256 ); LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash0, hash1, vhash, 256 ); @@ -90,15 +145,258 @@ void allium_8way_hash( void *state, const void *input ) intrlv_2x256( vhash, hash6, hash7, 256 ); LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash6, hash7, vhash, 256 ); + intrlv_2x256( vhash, hash8, hash9, 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); + dintrlv_2x256( hash8, hash9, vhash, 256 ); + intrlv_2x256( vhash, hash10, hash11, 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); + dintrlv_2x256( hash10, hash11, vhash, 256 ); + intrlv_2x256( vhash, hash12, hash13, 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); + dintrlv_2x256( hash12, hash13, vhash, 256 ); + intrlv_2x256( vhash, hash14, hash15, 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); + dintrlv_2x256( hash14, hash15, vhash, 256 ); - intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 256 ); + intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14, + hash15, 256 ); + + skein256_8way_update( &ctx.skein, vhashA, 32 ); + skein256_8way_close( &ctx.skein, vhashA ); + skein256_8way_init( &ctx.skein ); + skein256_8way_update( &ctx.skein, vhashB, 32 ); + skein256_8way_close( &ctx.skein, vhashB ); - skein256_8way_update( &ctx.skein, vhash, 32 ); - skein256_8way_close( &ctx.skein, vhash ); dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash, 256 ); + vhashA, 256 ); + dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, + vhashB, 256 ); + +#if defined(__VAES__) + + intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 ); + + groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 ); + + dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 ); + intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 ); + + groestl256_4way_init( &ctx.groestl, 32 ); + groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 ); + + dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 ); + intrlv_4x128( vhash, hash8, 
hash9, hash10, hash11, 256 ); + + groestl256_4way_init( &ctx.groestl, 32 ); + groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 ); + + dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 ); + intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 ); + + groestl256_4way_init( &ctx.groestl, 32 ); + groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 ); + + dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 ); + +#else + + update_and_final_groestl256( &ctx.groestl, state, hash0, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+256, hash8, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+288, hash9, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+320, hash10, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+352, hash11, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+384, hash12, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+416, hash13, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+448, hash14, 256 ); + memcpy( &ctx.groestl, &allium_16way_ctx.groestl, + sizeof(hashState_groestl256) ); + update_and_final_groestl256( &ctx.groestl, state+480, hash15, 256 ); + +#endif +} + +int scanhash_allium_16way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[20*16] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 16; + const uint32_t Htarg = ptarget[7]; + __m512i *noncev = (__m512i*)vdata + 19; // aligned + int thr_id = mythr->id; // thr_id arg is deprecated + + if ( opt_benchmark ) + ( (uint32_t*)ptarget )[7] = 0x0000ff; + + mm512_bswap32_intrlv80_16x32( 
vdata, pdata ); + blake256_16way_init( &allium_16way_ctx.blake ); + blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 ); + + do { + *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12, + n+11, n+10, n+ 9, n+ 8, + n+ 7, n+ 6, n+ 5, n+ 4, + n+ 3, n+ 2, n +1, n ) ); + + allium_16way_hash( hash, vdata ); + pdata[19] = n; + + for ( int lane = 0; lane < 16; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg ) + { + if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, hash+(lane<<3), mythr, lane ); + } + } + n += 16; + } while ( (n < last_nonce) && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined (ALLIUM_8WAY) + +typedef struct { + blake256_8way_context blake; + keccak256_4way_context keccak; + cubehashParam cube; + skein256_4way_context skein; + hashState_groestl256 groestl; + +} allium_8way_ctx_holder; + +static __thread allium_8way_ctx_holder allium_8way_ctx; + +bool init_allium_8way_ctx() +{ + keccak256_4way_init( &allium_8way_ctx.keccak ); + cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 ); + skein256_4way_init( &allium_8way_ctx.skein ); + init_groestl256( &allium_8way_ctx.groestl, 32 ); + return true; +} + +void allium_8way_hash( void *state, const void *input ) +{ + uint32_t vhashA[8*8] __attribute__ ((aligned (64))); + uint32_t vhashB[8*8] __attribute__ ((aligned (64))); + uint32_t hash0[8] __attribute__ ((aligned (32))); + uint32_t hash1[8] __attribute__ ((aligned (32))); + uint32_t hash2[8] __attribute__ ((aligned (32))); + uint32_t hash3[8] __attribute__ ((aligned (32))); + uint32_t hash4[8] __attribute__ ((aligned (64))); + uint32_t hash5[8] __attribute__ ((aligned (32))); + uint32_t hash6[8] __attribute__ ((aligned (32))); + uint32_t hash7[8] __attribute__ ((aligned (32))); + allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); + + memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) ); + blake256_8way_update( &ctx.blake, input + (64<<3), 16 ); + blake256_8way_close( &ctx.blake, vhashA ); + + dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhashA, 256 ); + intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 ); + intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 ); + + keccak256_4way_update( &ctx.keccak, vhashA, 32 ); + keccak256_4way_close( &ctx.keccak, vhashA ); + keccak256_4way_init( &ctx.keccak ); + keccak256_4way_update( &ctx.keccak, vhashB, 32 ); + keccak256_4way_close( &ctx.keccak, vhashB ); + + dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 ); + dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 ); + + LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); + LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); + LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 ); + LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 ); + LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 ); + LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 ); + LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 ); + LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 ); + + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + 
cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 ); + cubehashInit( &ctx.cube, 256, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 ); + + LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); + LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); + LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 ); + LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 ); + LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 ); + LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 ); + LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 ); + LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 ); + + intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 ); + intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 ); + + skein256_4way_update( &ctx.skein, vhashA, 32 ); + skein256_4way_close( &ctx.skein, vhashA ); + skein256_4way_init( &ctx.skein ); + skein256_4way_update( &ctx.skein, vhashB, 32 ); + skein256_4way_close( &ctx.skein, vhashB ); + + dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 ); + dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 ); update_and_final_groestl256( &ctx.groestl, state, hash0, 256 ); memcpy( &ctx.groestl, &allium_8way_ctx.groestl, @@ -122,23 +420,21 @@ void allium_8way_hash( void *state, const void *input ) memcpy( &ctx.groestl, &allium_8way_ctx.groestl, sizeof(hashState_groestl256) ); update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 ); - memcpy( &ctx.groestl, &allium_8way_ctx.groestl, - sizeof(hashState_groestl256) ); } int scanhash_allium_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t hash[8*8] __attribute__ ((aligned (64))); uint32_t vdata[20*8] __attribute__ ((aligned (64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; const uint32_t Htarg = ptarget[7]; __m256i *noncev = (__m256i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated + int thr_id = mythr->id; if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; @@ -169,126 +465,4 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, return 0; } - -#elif defined (ALLIUM_4WAY) - - -typedef struct { - blake256_4way_context blake; - keccak256_4way_context keccak; - cubehashParam cube; - skein256_4way_context skein; - hashState_groestl256 groestl; - -} allium_4way_ctx_holder; - -static __thread allium_4way_ctx_holder allium_4way_ctx; - -bool init_allium_4way_ctx() -{ - keccak256_4way_init( &allium_4way_ctx.keccak ); - cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 ); - skein256_4way_init( &allium_4way_ctx.skein ); - init_groestl256( &allium_4way_ctx.groestl, 32 ); - return true; -} - -void allium_4way_hash( void *state, const void *input ) -{ - uint32_t hash0[8] __attribute__ ((aligned (64))); - uint32_t hash1[8] __attribute__ ((aligned (32))); - uint32_t hash2[8] __attribute__ ((aligned (32))); - uint32_t hash3[8] __attribute__ ((aligned (32))); - uint32_t vhash32[8*4] __attribute__ ((aligned (64))); - uint32_t vhash64[8*4] __attribute__ ((aligned (64))); - allium_4way_ctx_holder ctx __attribute__ ((aligned (64))); - - 
memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) ); - blake256_4way_update( &ctx.blake, input + (64<<2), 16 ); - blake256_4way_close( &ctx.blake, vhash32 ); - - rintrlv_4x32_4x64( vhash64, vhash32, 256 ); - keccak256_4way_update( &ctx.keccak, vhash64, 32 ); - keccak256_4way_close( &ctx.keccak, vhash64 ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); - - LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); - LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); - LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 ); - LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 ); - - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 ); - cubehashInit( &ctx.cube, 256, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 ); - - LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); - LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); - LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 ); - LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 ); - - intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 ); - - skein256_4way_update( &ctx.skein, vhash64, 32 ); - skein256_4way_close( &ctx.skein, vhash64 ); - - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); - - update_and_final_groestl256( &ctx.groestl, state, hash0, 256 ); - memcpy( &ctx.groestl, &allium_4way_ctx.groestl, - sizeof(hashState_groestl256) ); - update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 ); - memcpy( &ctx.groestl, &allium_4way_ctx.groestl, - sizeof(hashState_groestl256) ); - update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 ); - memcpy( &ctx.groestl, &allium_4way_ctx.groestl, - sizeof(hashState_groestl256) ); - update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 ); -} - -int scanhash_allium_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*4] __attribute__ ((aligned (64))); - uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t n = first_nonce; - const uint32_t Htarg = ptarget[7]; - __m128i *noncev = (__m128i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - - if ( opt_benchmark ) - ( (uint32_t*)ptarget )[7] = 0x0000ff; - - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - blake256_4way_init( &allium_4way_ctx.blake ); - blake256_4way( &allium_4way_ctx.blake, vdata, 64 ); - - do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ); - - allium_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg ) - { - if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_lane_solution( work, hash+(lane<<3), mythr, lane ); - } - } - n += 4; - } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - return 0; -} - #endif diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index caa6fb0..4218a65 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -78,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_lyra2rev3; gate->hash = (void*)&lyra2rev3_hash; #endif - 
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; gate->miner_thread_init = (void*)&lyra2rev3_thread_init; opt_target_factor = 256.0; return true; @@ -119,7 +119,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_lyra2rev2; gate->hash = (void*)&lyra2rev2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; gate->miner_thread_init = (void*)&lyra2rev2_thread_init; opt_target_factor = 256.0; return true; @@ -146,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_lyra2z; gate->hash = (void*)&lyra2z_hash; #endif - gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; opt_target_factor = 256.0; return true; }; @@ -165,7 +165,7 @@ bool register_lyra2h_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_lyra2h; gate->hash = (void*)&lyra2h_hash; #endif - gate->optimizations = SSE42_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT; opt_target_factor = 256.0; return true; }; @@ -174,20 +174,20 @@ bool register_lyra2h_algo( algo_gate_t* gate ) bool register_allium_algo( algo_gate_t* gate ) { -#if defined (ALLIUM_8WAY) +#if defined (ALLIUM_16WAY) + gate->miner_thread_init = (void*)&init_allium_16way_ctx; + gate->scanhash = (void*)&scanhash_allium_16way; + gate->hash = (void*)&allium_16way_hash; +#elif defined (ALLIUM_8WAY) gate->miner_thread_init = (void*)&init_allium_8way_ctx; gate->scanhash = (void*)&scanhash_allium_8way; gate->hash = (void*)&allium_8way_hash; -#elif defined (ALLIUM_4WAY) - gate->miner_thread_init = (void*)&init_allium_4way_ctx; - gate->scanhash = (void*)&scanhash_allium_4way; - gate->hash = (void*)&allium_4way_hash; #else gate->miner_thread_init = (void*)&init_allium_ctx; gate->scanhash = (void*)&scanhash_allium; gate->hash = (void*)&allium_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; opt_target_factor = 256.0; return true; }; @@ -229,7 +229,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) bool register_phi2_algo( algo_gate_t* gate ) { // init_phi2_ctx(); - gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; gate->get_work_data_size = (void*)&phi2_get_work_data_size; gate->decode_extra_data = (void*)&phi2_decode_extra_data; gate->build_extraheader = (void*)&phi2_build_extraheader; diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h index c16d7d9..28811a6 100644 --- a/algo/lyra2/lyra2-gate.h +++ b/algo/lyra2/lyra2-gate.h @@ -153,27 +153,27 @@ bool lyra2h_thread_init(); ////////////////////////////////// #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - #define ALLIUM_8WAY 1 + #define ALLIUM_16WAY 1 #elif defined(__AVX2__) && defined(__AES__) - #define ALLIUM_4WAY 1 + #define ALLIUM_8WAY 1 #endif bool register_allium_algo( algo_gate_t* gate ); -#if defined(ALLIUM_8WAY) +#if defined(ALLIUM_16WAY) + +void allium_16way_hash( void *state, const void *input ); +int scanhash_allium_16way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +bool init_allium_16way_ctx(); + +#elif defined(ALLIUM_8WAY) void 
allium_8way_hash( void *state, const void *input ); int scanhash_allium_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); bool init_allium_8way_ctx(); -#elif defined(ALLIUM_4WAY) - -void allium_4way_hash( void *state, const void *input ); -int scanhash_allium_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -bool init_allium_4way_ctx(); - #else void allium_hash( void *state, const void *input ); diff --git a/algo/lyra2/lyra2-hash-2way.c b/algo/lyra2/lyra2-hash-2way.c index 76d2d06..21e2b7b 100644 --- a/algo/lyra2/lyra2-hash-2way.c +++ b/algo/lyra2/lyra2-hash-2way.c @@ -575,4 +575,138 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, return 0; } +int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd, + const uint64_t pwdlen, const uint64_t timeCost, + const uint64_t nRows, const uint64_t nCols ) +{ + //====================== Basic variables ============================// + uint64_t _ALIGN(256) state[32]; + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa0 = 0; + int64_t rowa1 = 0; + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + //====================================================================/ + + //=== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 + : BLOCK_LEN_BLAKE2_SAFE_BYTES; + + i = (int64_t)ROW_LEN_BYTES * nRows; + uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 ); + if (wholeMatrix == NULL) + return -1; + + memset_zero_512( (__m512i*)wholeMatrix, i>>5 ); + + uint64_t *ptrWord = wholeMatrix; + uint64_t *pw = (uint64_t*)pwd; + + //First, we clean enough blocks for the password, salt, basil and padding + int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) ) + / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1; + + uint64_t *ptr = wholeMatrix; + + memcpy( ptr, pw, 2*pwdlen ); // password + ptr += pwdlen>>2; + memcpy( ptr, pw, 2*pwdlen ); // password lane 1 + ptr += pwdlen>>2; + + // now build the rest interleaving on the fly. 
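// Editor's sketch, inferred from the stores below rather than taken from
// the Lyra2 reference code: the 64-bit basil words land in four-word
// groups, lane 0 first then a lane 1 copy, so after the two password
// copies the block reads
//
//    uint64_t basil[16] = {
//       kLen,  pwdlen, pwdlen /* saltlen */, timeCost,   // lane 0
//       kLen,  pwdlen, pwdlen /* saltlen */, timeCost,   // lane 1
//       nRows, nCols,  0x80,  0x0100000000000000,        // lane 0 padding
//       nRows, nCols,  0x80,  0x0100000000000000         // lane 1 padding
//    };
//
// where 0x80 is the first padding byte and 0x01 << 56 closes the
// blake2b-safe padding.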
+
+   ptr[0] = ptr[ 4] = kLen;
+   ptr[1] = ptr[ 5] = pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen; // saltlen
+   ptr[3] = ptr[ 7] = timeCost;
+   ptr[8] = ptr[12] = nRows;
+   ptr[9] = ptr[13] = nCols;
+   ptr[10] = ptr[14] = 0x80;
+   ptr[11] = ptr[15] = 0x0100000000000000;
+
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
+
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+
+   reducedDuplexRow1_2way( state, &wholeMatrix[0],
+                           &wholeMatrix[ 2 * ROW_LEN_INT64], nCols );
+
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
+                                  &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
+                                  &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
+                                  nCols );
+
+      //updates the value of row* (deterministically picked during Setup)
+      rowa0 = (rowa0 + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa0 == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+      step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
+      do
+      {
+         rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
+         rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
+
+         reducedDuplexRow_2way_X( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+                                  &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+                                  &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+                                  &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
+                                  nCols );
+
+         //update prev: it now points to the last row ever computed
+         prev = row;
+
+         //updates row: goes to the next row to be computed
+         //----------------------------------------------------
+         row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+         //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+         //----------------------------------------------------
+
+      } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
+                     &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
+   //Squeezes the key
+   squeeze_2way( state, K, (unsigned int) kLen );
+
+   //================== Freeing the memory =============================//
+   _mm_free(wholeMatrix);
+
+   return 0;
+}
+
 #endif
diff --git a/algo/lyra2/lyra2.h b/algo/lyra2/lyra2.h
index 4cd96ff..5ab0b81 100644
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -74,6 +74,9 @@ int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
 int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
                  uint64_t pwdlen, uint64_t timeCost, uint64_t nRows,
                  uint64_t nCols );
+int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+                 uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
 #endif
 
 #endif /* LYRA2_H_ */
diff --git a/algo/lyra2/lyra2h-4way.c b/algo/lyra2/lyra2h-4way.c
index b86f514..49a4087 100644
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -33,7 +33,7 @@ void lyra2h_4way_hash( void *state, const void *input )
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
     memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
-    blake256_4way( &ctx_blake, input + (64*4), 16 );
+    blake256_4way_update( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );
 
     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
diff --git a/algo/lyra2/sponge-2way.c b/algo/lyra2/sponge-2way.c
index 90060d8..0718a3a 100644
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -246,15 +246,32 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
    _mm512_store_si512( (__m512i*)State + 3, state3 );
 }
 
-inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
-                           uint64_t *rowInOut0, uint64_t *rowInOut1,
+// Reduced duplex row has three versions depending on the in/out rows.
+// If both rowInOut rows are the same the fastest (unified) version can
+// be used, equivalent to the linear version.
+// If either rowInOut overlaps with rowOut the slowest (overlap) version
+// is used, to refresh local data after overwriting rowOut.
+// Otherwise the normal version is used, slower than unified, faster than
+// overlap.
+//
+// The likelihood of each case depends on the number of rows. More rows
+// means unified and overlap are both less likely.
+// Unified has a 1 in nRows chance.
+// Overlap has a 2 in nRows chance (if both rows overlap with rowOut they
+// are equal and it's the unified case instead).
+// As a result normal occurs about (nRows-3) in nRows.
+// For 4 rows: 1 unified, 2 overlap, 1 normal.
+// For 8 rows: 1 unified, 2 overlap, 5 normal.
+
+static inline void reducedDuplexRow_2way_normal( uint64_t *State,
+           uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
            uint64_t *rowOut, uint64_t nCols)
 {
    int i;
    register __m512i state0, state1, state2, state3;
    __m512i *in = (__m512i*)rowIn;
-   __m256i *inout0 = (__m256i*)rowInOut0;
-   __m256i *inout1 = (__m256i*)rowInOut1;
+   __m512i *inout0 = (__m512i*)rowInOut0;
+   __m512i *inout1 = (__m512i*)rowInOut1;
    __m512i *out = (__m512i*)rowOut;
    register __m512i io0, io1, io2;
 
@@ -262,19 +279,19 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
    state1 = _mm512_load_si512( (__m512i*)State + 1 );
    state2 = _mm512_load_si512( (__m512i*)State + 2 );
    state3 = _mm512_load_si512( (__m512i*)State + 3 );
-
+
    for ( i = 0; i < nCols; i++ )
    {
      //Absorbing "M[prev] [+] M[row*]"
      io0 = _mm512_mask_blend_epi64( 0xf0,
-                  _mm512_load_si512( (__m512i*)inout0 ),
-                  _mm512_load_si512( (__m512i*)inout1 ) );
+                   _mm512_load_si512( (__m512i*)inout0 ),
+                   _mm512_load_si512( (__m512i*)inout1 ) );
      io1 = _mm512_mask_blend_epi64( 0xf0,
-                  _mm512_load_si512( (__m512i*)inout0 +1 ),
-                  _mm512_load_si512( (__m512i*)inout1 +1 ) );
+                   _mm512_load_si512( (__m512i*)inout0 +1 ),
+                   _mm512_load_si512( (__m512i*)inout1 +1 ) );
      io2 = _mm512_mask_blend_epi64( 0xf0,
-                  _mm512_load_si512( (__m512i*)inout0 +2 ),
-                  _mm512_load_si512( (__m512i*)inout1 +2 ) );
+                   _mm512_load_si512( (__m512i*)inout0 +2 ),
+                   _mm512_load_si512( (__m512i*)inout1 +2 ) );
 
      state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
      state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
@@ -286,29 +303,6 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
      {
        register __m512i t0, t1, t2;
 
-       //M[rowOut][col] = M[rowOut][col] XOR rand
-       t0 = _mm512_xor_si512( out[0], state0 );
-       t1 = _mm512_xor_si512( out[1], state1 );
-       t2 = _mm512_xor_si512( out[2], state2 );
-
-       // if out is the same row as inout, update with new data.
- if ( rowOut == rowInOut0 ) - { - io0 = _mm512_mask_blend_epi64( 0x0f, io0, t0 ); - io1 = _mm512_mask_blend_epi64( 0x0f, io1, t1 ); - io2 = _mm512_mask_blend_epi64( 0x0f, io2, t2 ); - } - if ( rowOut == rowInOut1 ) - { - io0 = _mm512_mask_blend_epi64( 0xf0, io0, t0 ); - io1 = _mm512_mask_blend_epi64( 0xf0, io1, t1 ); - io2 = _mm512_mask_blend_epi64( 0xf0, io2, t2 ); - } - - out[0] = t0; - out[1] = t1; - out[2] = t2; - //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) t0 = _mm512_permutex_epi64( state0, 0x93 ); t1 = _mm512_permutex_epi64( state1, 0x93 ); @@ -317,19 +311,24 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn, io0 = _mm512_xor_si512( io0, _mm512_mask_blend_epi64( 0x11, t0, t2 ) ); io1 = _mm512_xor_si512( io1, _mm512_mask_blend_epi64( 0x11, t1, t0 ) ); io2 = _mm512_xor_si512( io2, _mm512_mask_blend_epi64( 0x11, t2, t1 ) ); + + //M[rowOut][col] = M[rowOut][col] XOR rand + out[0] = _mm512_xor_si512( out[0], state0 ); + out[1] = _mm512_xor_si512( out[1], state1 ); + out[2] = _mm512_xor_si512( out[2], state2 ); } - _mm512_mask_store_epi64( (__m512i*)inout0, 0x0f, io0 ); - _mm512_mask_store_epi64( (__m512i*)inout1, 0xf0, io0 ); - _mm512_mask_store_epi64( (__m512i*)inout0 +1, 0x0f, io1 ); - _mm512_mask_store_epi64( (__m512i*)inout1 +1, 0xf0, io1 ); - _mm512_mask_store_epi64( (__m512i*)inout0 +2, 0x0f, io2 ); - _mm512_mask_store_epi64( (__m512i*)inout1 +2, 0xf0, io2 ); + _mm512_mask_store_epi64( inout0, 0x0f, io0 ); + _mm512_mask_store_epi64( inout1, 0xf0, io0 ); + _mm512_mask_store_epi64( inout0 +1, 0x0f, io1 ); + _mm512_mask_store_epi64( inout1 +1, 0xf0, io1 ); + _mm512_mask_store_epi64( inout0 +2, 0x0f, io2 ); + _mm512_mask_store_epi64( inout1 +2, 0xf0, io2 ); //Goes to next block in += BLOCK_LEN_M256I; - inout0 += BLOCK_LEN_M256I * 2; - inout1 += BLOCK_LEN_M256I * 2; + inout0 += BLOCK_LEN_M256I; + inout1 += BLOCK_LEN_M256I; out += BLOCK_LEN_M256I; } @@ -339,4 +338,333 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn, _mm512_store_si512( (__m512i*)State + 3, state3 ); } +static inline void reducedDuplexRow_2way_overlap( uint64_t *State, + uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1, + uint64_t *rowOut, uint64_t nCols) +{ + int i; + register __m512i state0, state1, state2, state3; + __m512i *in = (__m512i*)rowIn; + __m512i *inout0 = (__m512i*)rowInOut0; + __m512i *inout1 = (__m512i*)rowInOut1; + __m512i *out = (__m512i*)rowOut; +// inout_ovly io; + ovly_512 io0, io1, io2; + + state0 = _mm512_load_si512( (__m512i*)State ); + state1 = _mm512_load_si512( (__m512i*)State + 1 ); + state2 = _mm512_load_si512( (__m512i*)State + 2 ); + state3 = _mm512_load_si512( (__m512i*)State + 3 ); + + for ( i = 0; i < nCols; i++ ) + { + //Absorbing "M[prev] [+] M[row*]" + io0.v512 = _mm512_mask_blend_epi64( 0xf0, + _mm512_load_si512( (__m512i*)inout0 ), + _mm512_load_si512( (__m512i*)inout1 ) ); + io1.v512 = _mm512_mask_blend_epi64( 0xf0, + _mm512_load_si512( (__m512i*)inout0 +1 ), + _mm512_load_si512( (__m512i*)inout1 +1 ) ); + io2.v512 = _mm512_mask_blend_epi64( 0xf0, + _mm512_load_si512( (__m512i*)inout0 +2 ), + _mm512_load_si512( (__m512i*)inout1 +2 ) ); + + state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) ); + state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) ); + state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) ); + +/* + io.v512[0] = _mm512_mask_blend_epi64( 0xf0, + _mm512_load_si512( (__m512i*)inout0 ), + _mm512_load_si512( (__m512i*)inout1 ) ); + io.v512[1] = 
_mm512_mask_blend_epi64( 0xf0, + _mm512_load_si512( (__m512i*)inout0 +1 ), + _mm512_load_si512( (__m512i*)inout1 +1 ) ); + io.v512[2] = _mm512_mask_blend_epi64( 0xf0, + _mm512_load_si512( (__m512i*)inout0 +2 ), + _mm512_load_si512( (__m512i*)inout1 +2 ) ); + + state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) ); + state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) ); + state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) ); +*/ + + //Applies the reduced-round transformation f to the sponge's state + LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 ); + + { + __m512i t0, t1, t2; + + //M[rowOut][col] = M[rowOut][col] XOR rand + out[0] = _mm512_xor_si512( out[0], state0 ); + out[1] = _mm512_xor_si512( out[1], state1 ); + out[2] = _mm512_xor_si512( out[2], state2 ); + + // if out is the same row as inout, update with new data. + if ( rowOut == rowInOut0 ) + { + io0.v512 = _mm512_mask_blend_epi64( 0x0f, io0.v512, out[0] ); + io1.v512 = _mm512_mask_blend_epi64( 0x0f, io1.v512, out[1] ); + io2.v512 = _mm512_mask_blend_epi64( 0x0f, io2.v512, out[2] ); + + } + if ( rowOut == rowInOut1 ) + { + io0.v512 = _mm512_mask_blend_epi64( 0xf0, io0.v512, out[0] ); + io1.v512 = _mm512_mask_blend_epi64( 0xf0, io1.v512, out[1] ); + io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] ); + } + +/* + if ( rowOut == rowInOut0 ) + { + io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] ); + io.v512[1] = _mm512_mask_blend_epi64( 0x0f, io.v512[1], out[1] ); + io.v512[2] = _mm512_mask_blend_epi64( 0x0f, io.v512[2], out[2] ); + + } + if ( rowOut == rowInOut1 ) + { + io.v512[0] = _mm512_mask_blend_epi64( 0xf0, io.v512[0], out[0] ); + io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] ); + io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] ); + } +*/ + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + t0 = _mm512_permutex_epi64( state0, 0x93 ); + t1 = _mm512_permutex_epi64( state1, 0x93 ); + t2 = _mm512_permutex_epi64( state2, 0x93 ); + + io0.v512 = _mm512_xor_si512( io0.v512, + _mm512_mask_blend_epi64( 0x11, t0, t2 ) ); + io1.v512 = _mm512_xor_si512( io1.v512, + _mm512_mask_blend_epi64( 0x11, t1, t0 ) ); + io2.v512 = _mm512_xor_si512( io2.v512, + _mm512_mask_blend_epi64( 0x11, t2, t1 ) ); + } + + casti_m256i( inout0, 0 ) = io0.v256lo; + casti_m256i( inout1, 1 ) = io0.v256hi; + casti_m256i( inout0, 2 ) = io1.v256lo; + casti_m256i( inout1, 3 ) = io1.v256hi; + casti_m256i( inout0, 4 ) = io2.v256lo; + casti_m256i( inout1, 5 ) = io2.v256hi; +/* + _mm512_mask_store_epi64( inout0, 0x0f, io.v512[0] ); + _mm512_mask_store_epi64( inout1, 0xf0, io.v512[0] ); + _mm512_mask_store_epi64( inout0 +1, 0x0f, io.v512[1] ); + _mm512_mask_store_epi64( inout1 +1, 0xf0, io.v512[1] ); + _mm512_mask_store_epi64( inout0 +2, 0x0f, io.v512[2] ); + _mm512_mask_store_epi64( inout1 +2, 0xf0, io.v512[2] ); +*/ + //Goes to next block + in += BLOCK_LEN_M256I; + inout0 += BLOCK_LEN_M256I; + inout1 += BLOCK_LEN_M256I; + out += BLOCK_LEN_M256I; + } + + _mm512_store_si512( (__m512i*)State, state0 ); + _mm512_store_si512( (__m512i*)State + 1, state1 ); + _mm512_store_si512( (__m512i*)State + 2, state2 ); + _mm512_store_si512( (__m512i*)State + 3, state3 ); + +} + +static inline void reducedDuplexRow_2way_overlap_X( uint64_t *State, + uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1, + uint64_t *rowOut, uint64_t nCols) +{ + int i; + register __m512i state0, state1, state2, state3; + __m512i *in = (__m512i*)rowIn; + __m256i *inout0 = 
(__m256i*)rowInOut0; + __m256i *inout1 = (__m256i*)rowInOut1; + __m512i *out = (__m512i*)rowOut; + inout_ovly inout; + __m512i t0, t1, t2; + + state0 = _mm512_load_si512( (__m512i*)State ); + state1 = _mm512_load_si512( (__m512i*)State + 1 ); + state2 = _mm512_load_si512( (__m512i*)State + 2 ); + state3 = _mm512_load_si512( (__m512i*)State + 3 ); + + for ( i = 0; i < nCols; i++ ) + { + + //Absorbing "M[prev] [+] M[row*]" + inout.v256[0] = inout0[0]; + inout.v256[1] = inout1[1]; + inout.v256[2] = inout0[2]; + inout.v256[3] = inout1[3]; + inout.v256[4] = inout0[4]; + inout.v256[5] = inout1[5]; + + state0 = _mm512_xor_si512( state0, + _mm512_add_epi64( in[0], inout.v512[0] ) ); + state1 = _mm512_xor_si512( state1, + _mm512_add_epi64( in[1], inout.v512[1] ) ); + state2 = _mm512_xor_si512( state2, + _mm512_add_epi64( in[2], inout.v512[2] ) ); + + + //Applies the reduced-round transformation f to the sponge's state + LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 ); + + //M[rowOut][col] = M[rowOut][col] XOR rand + out[0] = _mm512_xor_si512( out[0], state0 ); + out[1] = _mm512_xor_si512( out[1], state1 ); + out[2] = _mm512_xor_si512( out[2], state2 ); + + // if inout is the same row as out it was just overwritten, reload. + if ( rowOut == rowInOut0 ) + { + inout.v256[0] = ( (__m256i*)out )[0]; + inout.v256[2] = ( (__m256i*)out )[2]; + inout.v256[4] = ( (__m256i*)out )[4]; + } + if ( rowOut == rowInOut1 ) + { + inout.v256[1] = ( (__m256i*)out )[1]; + inout.v256[3] = ( (__m256i*)out )[3]; + inout.v256[5] = ( (__m256i*)out )[5]; + } + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + t0 = _mm512_permutex_epi64( state0, 0x93 ); + t1 = _mm512_permutex_epi64( state1, 0x93 ); + t2 = _mm512_permutex_epi64( state2, 0x93 ); + + inout.v512[0] = _mm512_xor_si512( inout.v512[0], + _mm512_mask_blend_epi64( 0x11, t0, t2 ) ); + inout.v512[1] = _mm512_xor_si512( inout.v512[1], + _mm512_mask_blend_epi64( 0x11, t1, t0 ) ); + inout.v512[2] = _mm512_xor_si512( inout.v512[2], + _mm512_mask_blend_epi64( 0x11, t2, t1 ) ); + + inout0[0] = inout.v256[0]; + inout1[1] = inout.v256[1]; + inout0[2] = inout.v256[2]; + inout1[3] = inout.v256[3]; + inout0[4] = inout.v256[4]; + inout1[5] = inout.v256[5]; + + //Goes to next block + in += BLOCK_LEN_M256I; + inout0 += BLOCK_LEN_M256I * 2; + inout1 += BLOCK_LEN_M256I * 2; + out += BLOCK_LEN_M256I; + } + + _mm512_store_si512( (__m512i*)State, state0 ); + _mm512_store_si512( (__m512i*)State + 1, state1 ); + _mm512_store_si512( (__m512i*)State + 2, state2 ); + _mm512_store_si512( (__m512i*)State + 3, state3 ); +} + +// rowInOut0 == rowInOut1, fastest, least likely: 1 / nrows +static inline void reducedDuplexRow_2way_unified( uint64_t *State, + uint64_t *rowIn, uint64_t *rowInOut0, + uint64_t *rowOut, uint64_t nCols) +{ + int i; + register __m512i state0, state1, state2, state3; + __m512i *in = (__m512i*)rowIn; + __m512i *inout = (__m512i*)rowInOut0; + __m512i *out = (__m512i*)rowOut; + + state0 = _mm512_load_si512( (__m512i*)State ); + state1 = _mm512_load_si512( (__m512i*)State + 1 ); + state2 = _mm512_load_si512( (__m512i*)State + 2 ); + state3 = _mm512_load_si512( (__m512i*)State + 3 ); + + for ( i = 0; i < nCols; i++ ) + { + //Absorbing "M[prev] [+] M[row*]" + state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], inout[0] ) ); + state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], inout[1] ) ); + state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], inout[2] ) ); + + //Applies the reduced-round transformation f to the sponge's state + 
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+      {
+         register __m512i t0, t1, t2;
+
+         //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+         t0 = _mm512_permutex_epi64( state0, 0x93 );
+         t1 = _mm512_permutex_epi64( state1, 0x93 );
+         t2 = _mm512_permutex_epi64( state2, 0x93 );
+
+         inout[0] = _mm512_xor_si512( inout[0],
+                                _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
+         inout[1] = _mm512_xor_si512( inout[1],
+                                _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
+         inout[2] = _mm512_xor_si512( inout[2],
+                                _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
+
+         out[0] = _mm512_xor_si512( out[0], state0 );
+         out[1] = _mm512_xor_si512( out[1], state1 );
+         out[2] = _mm512_xor_si512( out[2], state2 );
+
+      }
+
+      //Goes to next block
+      in    += BLOCK_LEN_M256I;
+      inout += BLOCK_LEN_M256I;
+      out   += BLOCK_LEN_M256I;
+   }
+
+   _mm512_store_si512( (__m512i*)State,     state0 );
+   _mm512_store_si512( (__m512i*)State + 1, state1 );
+   _mm512_store_si512( (__m512i*)State + 2, state2 );
+   _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+// Multi-level specialization.
+// There are three cases that need to be handled:
+// unified: inout data is contiguous, fastest, unlikely.
+// normal: inout data is not contiguous with no overlap with out, likely.
+// overlap: inout data is not contiguous and one lane overlaps with out,
+//   slowest, unlikely.
+//
+// In addition, different algos prefer different coding: x25x and x22i
+// prefer 256 bit memory accesses to handle the diverged data, while all
+// other algos prefer 512 bit memory accesses with masking and blending.
+
+
+// Wrapper
+inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
+                uint64_t *rowInOut0, uint64_t *rowInOut1,
+                uint64_t *rowOut, uint64_t nCols )
+{
+   if ( rowInOut0 == rowInOut1 )
+      reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
+   else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) )
+      reducedDuplexRow_2way_overlap( State, rowIn, rowInOut0, rowInOut1,
+                                     rowOut, nCols );
+   else
+      reducedDuplexRow_2way_normal( State, rowIn, rowInOut0, rowInOut1,
+                                    rowOut, nCols );
+}
+
+inline void reducedDuplexRow_2way_X( uint64_t *State, uint64_t *rowIn,
+                uint64_t *rowInOut0, uint64_t *rowInOut1,
+                uint64_t *rowOut, uint64_t nCols )
+{
+   if ( rowInOut0 == rowInOut1 )
+      reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
+   else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) )
+   {
+      asm volatile ( "nop" );   // Prevent GCC from optimizing this branch away
+      reducedDuplexRow_2way_overlap_X( State, rowIn, rowInOut0, rowInOut1,
+                                       rowOut, nCols );
+   }
+   else
+      reducedDuplexRow_2way_normal( State, rowIn, rowInOut0, rowInOut1,
+                                    rowOut, nCols );
+}
+
+
 #endif // AVX512
diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h
index 185181b..cb8d8d0 100644
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -203,13 +203,24 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 
-union _povly
+union _ovly_512
 {
-  __m512i *v512;
-  __m256i *v256;
-  uint64_t *u64;
+  __m512i v512;
+  struct
+  {
+     __m256i v256lo;
+     __m256i v256hi;
+  };
 };
-typedef union _povly povly;
+typedef union _ovly_512 ovly_512;
+
+
+union _inout_ovly
+{
+  __m512i v512[3];
+  __m256i v256[6];
+};
+typedef union _inout_ovly inout_ovly;
 
 //---- Housekeeping
 void initState_2way( uint64_t State[/*16*/] );
@@ -234,6 +245,10 @@ void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
                        uint64_t *rowInOut0, uint64_t *rowInOut1,
                        uint64_t *rowOut, 
uint64_t nCols); +void reducedDuplexRow_2way_X( uint64_t *State, uint64_t *rowIn, + uint64_t *rowInOut0, uint64_t *rowInOut1, + uint64_t *rowOut, uint64_t nCols); + #endif diff --git a/algo/m7m.c b/algo/m7m.c index c2e37ba..9d2ef1b 100644 --- a/algo/m7m.c +++ b/algo/m7m.c @@ -149,7 +149,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, char data_str[161], hash_str[65], target_str[65]; //uint8_t *bdata = 0; uint8_t bdata[8192] __attribute__ ((aligned (64))); - int rc = 0, i, digits; + int i, digits; int bytes; size_t p = sizeof(unsigned long), a = 64/p, b = 32/p; @@ -267,47 +267,36 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, SHA256_Final( (unsigned char*) hash, &ctxf_sha256 ); } - const unsigned char *hash_ = (const unsigned char *)hash; - const unsigned char *target_ = (const unsigned char *)ptarget; - for ( i = 31; i >= 0; i-- ) + if ( unlikely( hash[7] <= ptarget[7] ) ) + if ( likely( fulltest( hash, ptarget ) && !opt_benchmark ) ) { - if ( hash_[i] != target_[i] ) + if ( opt_debug ) { - rc = hash_[i] < target_[i]; - break; - } - } - if ( unlikely(rc) ) - { - if ( opt_debug ) - { - bin2hex(hash_str, (unsigned char *)hash, 32); - bin2hex(target_str, (unsigned char *)ptarget, 32); - bin2hex(data_str, (unsigned char *)data, 80); - applog(LOG_DEBUG, "DEBUG: [%d thread] Found share!\ndata %s\nhash %s\ntarget %s", thr_id, - data_str, - hash_str, - target_str); + bin2hex( hash_str, (unsigned char *)hash, 32 ); + bin2hex( target_str, (unsigned char *)ptarget, 32 ); + bin2hex( data_str, (unsigned char *)data, 80 ); + applog( LOG_DEBUG, "DEBUG: [%d thread] Found share!\ndata %s\nhash %s\ntarget %s", + thr_id, data_str, hash_str, target_str ); } pdata[19] = data[19]; submit_solution( work, hash, mythr ); } - } while (n < max_nonce && !work_restart[thr_id].restart); + } while ( n < max_nonce && !work_restart[thr_id].restart ); pdata[19] = n; - mpf_set_prec_raw(magifpi, prec0); - mpf_set_prec_raw(magifpi0, prec0); - mpf_set_prec_raw(mptmp, prec0); - mpf_set_prec_raw(mpt1, prec0); - mpf_set_prec_raw(mpt2, prec0); - mpf_clear(magifpi); - mpf_clear(magifpi0); - mpf_clear(mpten); - mpf_clear(mptmp); - mpf_clear(mpt1); - mpf_clear(mpt2); - mpz_clears(magipi, magisw, product, bns0, bns1, NULL); + mpf_set_prec_raw( magifpi, prec0 ); + mpf_set_prec_raw( magifpi0, prec0 ); + mpf_set_prec_raw( mptmp, prec0 ); + mpf_set_prec_raw( mpt1, prec0 ); + mpf_set_prec_raw( mpt2, prec0 ); + mpf_clear( magifpi ); + mpf_clear( magifpi0 ); + mpf_clear( mpten ); + mpf_clear( mptmp ); + mpf_clear( mpt1 ); + mpf_clear( mpt2 ); + mpz_clears( magipi, magisw, product, bns0, bns1, NULL ); *hashes_done = n - first_nonce + 1; return 0; diff --git a/algo/nist5/nist5-4way.c b/algo/nist5/nist5-4way.c index 9b8687b..da9a5fa 100644 --- a/algo/nist5/nist5-4way.c +++ b/algo/nist5/nist5-4way.c @@ -102,7 +102,7 @@ int scanhash_nist5_8way( struct work *work, uint32_t max_nonce, nist5hash_8way( hash, vdata ); for ( int lane = 0; lane < 8; lane++ ) - if ( hash7[ lane<<1 ] < Htarg ) + if ( hash7[ lane<<1 ] <= Htarg ) { extr_lane_8x64( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) @@ -190,7 +190,7 @@ int scanhash_nist5_4way( struct work *work, uint32_t max_nonce, nist5hash_4way( hash, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( hash7[ lane<<1 ] < Htarg ) + if ( hash7[ lane<<1 ] <= Htarg ) { extr_lane_4x64( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c 
index 4eac923..0a3a06e 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -1513,10 +1513,10 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce, hmq1725_4way_hash( hash, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( hash7[ lane<<1 ] <= Htarg ) + if ( unlikely( hash7[ lane<<1 ] <= Htarg ) ) { extr_lane_4x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) { pdata[19] = n + lane; submit_lane_solution( work, lane_hash, mythr, lane ); diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index deddbe0..d4d1718 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -248,11 +248,11 @@ void quark_8way_hash( void *state, const void *input ) jh512_8way_close( &ctx.jh, vhashB ); } - // Final blend, directly to state, only need 32 bytes. - casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] ); - casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] ); - casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] ); - casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] ); + // Final blend, directly to state, only need 32 bytes. + casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] ); + casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] ); + casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] ); + casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] ); } int scanhash_quark_8way( struct work *work, uint32_t max_nonce, @@ -267,23 +267,24 @@ int scanhash_quark_8way( struct work *work, uint32_t max_nonce, uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; __m512i *noncev = (__m512i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated + int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; mm512_bswap32_intrlv80_8x64( vdata, pdata ); do { *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); quark_8way_hash( hash, vdata ); pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 ) + if ( unlikely( hash7[ i<<1 ] <= Htarg ) ) { extr_lane_8x64( lane_hash, hash, i, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) { pdata[19] = n+i; submit_lane_solution( work, lane_hash, mythr, i ); @@ -296,7 +297,6 @@ int scanhash_quark_8way( struct work *work, uint32_t max_nonce, return 0; } - #elif defined (QUARK_4WAY) typedef struct { @@ -460,8 +460,9 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce, uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated - + int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; + mm256_bswap32_intrlv80_4x64( vdata, pdata ); do { @@ -472,10 +473,10 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce, pdata[19] = n; for ( int i = 0; i < 4; i++ ) - if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 ) + if ( unlikely( hash7[ i<<1 ] <= Htarg ) ) { extr_lane_4x64( lane_hash, hash, i, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + if ( likely( fulltest( 
lane_hash, ptarget ) && !opt_benchmark ) ) { pdata[19] = n+i; submit_lane_solution( work, lane_hash, mythr, i ); diff --git a/algo/qubit/deep.c b/algo/qubit/deep.c index b79d81b..85a66dd 100644 --- a/algo/qubit/deep.c +++ b/algo/qubit/deep.c @@ -5,7 +5,7 @@ #include #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" -#ifndef NO_AES_NI +#ifdef __AES__ #include "algo/echo/aes_ni/hash_api.h" #else #include "algo/echo/sph_echo.h" @@ -15,10 +15,10 @@ typedef struct { hashState_luffa luffa; cubehashParam cubehash; -#ifdef NO_AES_NI - sph_echo512_context echo; -#else +#ifdef __AES__ hashState_echo echo; +#else + sph_echo512_context echo; #endif } deep_ctx_holder; @@ -29,10 +29,10 @@ void init_deep_ctx() { init_luffa( &deep_ctx.luffa, 512 ); cubehashInit( &deep_ctx.cubehash, 512, 16, 32 ); -#ifdef NO_AES_NI - sph_echo512_init( &deep_ctx.echo ); -#else +#ifdef __AES__ init_echo( &deep_ctx.echo, 512 ); +#else + sph_echo512_init( &deep_ctx.echo ); #endif }; @@ -59,12 +59,12 @@ void deep_hash(void *output, const void *input) cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, (const byte*) hash,64); -#ifdef NO_AES_NI - sph_echo512 (&ctx.echo, (const void*) hash, 64); - sph_echo512_close(&ctx.echo, (void*) hash); -#else +#ifdef __AES__ update_final_echo ( &ctx.echo, (BitSequence *) hash, (const BitSequence *) hash, 512); +#else + sph_echo512 (&ctx.echo, (const void*) hash, 64); + sph_echo512_close(&ctx.echo, (void*) hash); #endif asm volatile ("emms"); diff --git a/algo/qubit/qubit-2way.c b/algo/qubit/qubit-2way.c index 5d91685..9948d06 100644 --- a/algo/qubit/qubit-2way.c +++ b/algo/qubit/qubit-2way.c @@ -149,8 +149,8 @@ int scanhash_qubit_4way( struct work *work,uint32_t max_nonce, pdata[19] = n; for ( int lane = 0; lane < 4; lane++ ) - if ( ( hash+(lane<<3) )[7] < Htarg ) - if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark ) + if ( unlikely( ( hash+(lane<<3) )[7] <= Htarg ) ) + if ( likely( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark ) ) { pdata[19] = n + lane; submit_lane_solution( work, hash+(lane<<3), mythr, lane ); @@ -233,10 +233,6 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce, uint32_t *noncep = vdata + 32+3; // 4*8 + 3 int thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); @@ -248,32 +244,27 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce, luffa_2way_init( &qubit_2way_ctx.luffa, 512 ); luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 ); - for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + do { - uint32_t mask = masks[m]; - do - { - be32enc( noncep, n ); - be32enc( noncep+4, n+1 ); - qubit_2way_hash( hash, vdata ); - pdata[19] = n; + be32enc( noncep, n ); + be32enc( noncep+4, n+1 ); + qubit_2way_hash( hash, vdata ); + pdata[19] = n; - if ( !( hash[7] & mask ) ) - if ( fulltest( hash, ptarget) && !opt_benchmark ) - { - pdata[19] = n; - submit_lane_solution( work, hash, mythr, 0 ); - } - if ( !( (hash+8)[7] & mask ) ) - if ( fulltest( hash+8, ptarget) && !opt_benchmark ) - { - pdata[19] = n+1; - submit_lane_solution( work, hash+8, mythr, 1 ); - } - n += 2; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } + if ( unlikely( hash[7] <= Htarg ) ) + if ( 
likely( fulltest( hash, ptarget) && !opt_benchmark ) )
+      {
+         pdata[19] = n;
+         submit_lane_solution( work, hash, mythr, 0 );
+      }
+      if ( unlikely( ( (hash+8))[7] <= Htarg ) )
+      if ( likely( fulltest( hash+8, ptarget) && !opt_benchmark ) )
+      {
+         pdata[19] = n+1;
+         submit_lane_solution( work, hash+8, mythr, 1 );
+      }
+      n += 2;
+   } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
 
    *hashes_done = n - first_nonce + 1;
    return 0;
 }
diff --git a/algo/qubit/qubit.c b/algo/qubit/qubit.c
index 349ad26..3794eee 100644
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -7,7 +7,7 @@
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/shavite/sph_shavite.h"
-#ifndef NO_AES_NI
+#ifdef __AES__
   #include "algo/echo/aes_ni/hash_api.h"
 #else
   #include "algo/echo/sph_echo.h"
@@ -19,10 +19,10 @@ typedef struct
         cubehashParam           cubehash;
         sph_shavite512_context  shavite;
         hashState_sd            simd;
-#ifdef NO_AES_NI
-        sph_echo512_context     echo;
-#else
+#ifdef __AES__
         hashState_echo          echo;
+#else
+        sph_echo512_context     echo;
 #endif
 } qubit_ctx_holder;
 
@@ -35,10 +35,10 @@ void init_qubit_ctx()
         cubehashInit(&qubit_ctx.cubehash,512,16,32);
         sph_shavite512_init(&qubit_ctx.shavite);
         init_sd(&qubit_ctx.simd,512);
-#ifdef NO_AES_NI
-        sph_echo512_init(&qubit_ctx.echo);
-#else
+#ifdef __AES__
         init_echo(&qubit_ctx.echo, 512);
+#else
+        sph_echo512_init(&qubit_ctx.echo);
 #endif
 };
 
@@ -71,12 +71,12 @@ void qubit_hash(void *output, const void *input)
         update_final_sd( &ctx.simd, (BitSequence *)hash,
                          (const BitSequence*)hash, 512 );
 
-#ifdef NO_AES_NI
-        sph_echo512 (&ctx.echo, (const void*) hash, 64);
-        sph_echo512_close(&ctx.echo, (void*) hash);
-#else
+#ifdef __AES__
         update_final_echo( &ctx.echo, (BitSequence *) hash,
                            (const BitSequence *) hash, 512 );
+#else
+        sph_echo512 (&ctx.echo, (const void*) hash, 64);
+        sph_echo512_close(&ctx.echo, (void*) hash);
 #endif
 
         asm volatile ("emms");
diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c
index 25fe8a6..ba531ce 100644
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -3,11 +3,9 @@
 
 #include 
 
-// This implementation is deprecated, superseded by VAES in Icelake
-// which provides HW based 4 way aes.
-// It was created for AVX2 to eliminate interleaving between the
-// preceding and following function.
-// This code can be removed when current users have reverted to one way.
+// This is not true parallel AES, which requires VAES; the AES rounds are
+// done serially. It is only intended when the preceding and following
+// functions use the same 2x128 interleave.
 
 #if defined(__AVX2__)
 
@@ -410,4 +408,94 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
    casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
 }
 
+void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
+                           const void *data, size_t len )
+{
+   __m256i *h = (__m256i*)ctx->h;
+   __m128i *iv = (__m128i*)IV512;
+
+   h[0] = m256_const1_128( iv[0] );
+   h[1] = m256_const1_128( iv[1] );
+   h[2] = m256_const1_128( iv[2] );
+   h[3] = m256_const1_128( iv[3] );
+
+   ctx->ptr    =
+   ctx->count0 =
+   ctx->count1 =
+   ctx->count2 =
+   ctx->count3 = 0;
+
+   unsigned char *buf = ctx->buf;
+   size_t ptr = ctx->ptr;
+
+   // process full blocks and load buf with remainder.
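+   // Bookkeeping note: buf and ptr count interleaved bytes (2 lanes)
+   // while len counts bytes per lane, hence the <<1 / >>1 conversions
+   // below. For example a 64 byte per lane input consumes 128 bytes of
+   // the interleaved buffer.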
+ while ( len > 0 ) + { + size_t clen; + + clen = (sizeof ctx->buf) - ptr; + if ( clen > len << 1 ) + clen = len << 1; + memcpy( buf + ptr, data, clen ); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= (clen >> 1); + if ( ptr == sizeof ctx->buf ) + { + if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 ) + { + ctx->count1 = ctx->count1 + 1; + if ( ctx->count1 == 0 ) + { + ctx->count2 = ctx->count2 + 1; + if ( ctx->count2 == 0 ) + ctx->count3 = ctx->count3 + 1; + } + } + c512_2way( ctx, buf ); + ptr = 0; + } + } + + uint32_t vp = ptr>>5; + // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200 + // Count is misaligned to 16 bits and straddles 2 vectors. + // Use u32 overlay to stage then u16 to load buf. + union + { + uint32_t u32[4]; + uint16_t u16[8]; + } count; + + count.u32[0] = ctx->count0 += (ptr << 2); // ptr/2 * 8 + count.u32[1] = ctx->count1; + count.u32[2] = ctx->count2; + count.u32[3] = ctx->count3; + + if ( vp == 0 ) // empty buf, xevan. + { + casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 ); + memset_zero_256( (__m256i*)buf + 1, 5 ); + ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; + } + else // half full buf, everyone else. + { + casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 ); + memset_zero_256( (__m256i*)buf + vp, 6 - vp ); + } + + casti_m256i( buf, 6 ) = m256_const1_128( + _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); + casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16( + 0x0200, count.u16[7], count.u16[6], count.u16[5], + count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); + + c512_2way( ctx, buf); + + casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 ); + casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 ); + casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 ); + casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 ); +} + #endif // AVX2 diff --git a/algo/shavite/shavite-hash-2way.h b/algo/shavite/shavite-hash-2way.h index 869bf4a..5770d04 100644 --- a/algo/shavite/shavite-hash-2way.h +++ b/algo/shavite/shavite-hash-2way.h @@ -18,6 +18,8 @@ void shavite512_2way_update( shavite512_2way_context *ctx, const void *data, void shavite512_2way_close( shavite512_2way_context *ctx, void *dst ); void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst, const void *data, size_t len ); +void shavite512_2way_full( shavite512_2way_context *ctx, void *dst, + const void *data, size_t len ); #endif // AVX2 diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index b868119..eed4ba1 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -396,4 +396,96 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst, casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 ); } + +void shavite512_4way_full( shavite512_4way_context *ctx, void *dst, + const void *data, size_t len ) +{ + __m512i *h = (__m512i*)ctx->h; + __m128i *iv = (__m128i*)IV512; + + h[0] = m512_const1_128( iv[0] ); + h[1] = m512_const1_128( iv[1] ); + h[2] = m512_const1_128( iv[2] ); + h[3] = m512_const1_128( iv[3] ); + + ctx->ptr = + ctx->count0 = + ctx->count1 = + ctx->count2 = + ctx->count3 = 0; + + unsigned char *buf = ctx->buf; + size_t ptr = ctx->ptr; + + // process full blocks and load buf with remainder. 
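+   // Same bookkeeping as the 2 way version but with 4 interleaved lanes:
+   // ptr counts interleaved bytes while len counts bytes per lane, so
+   // the conversions below use <<2 / >>2 (64 bytes per lane fills 256
+   // bytes of buf).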
+ while ( len > 0 ) + { + size_t clen; + + clen = (sizeof ctx->buf) - ptr; + if ( clen > len << 2 ) + clen = len << 2; + memcpy( buf + ptr, data, clen ); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= (clen >> 2); + if ( ptr == sizeof ctx->buf ) + { + if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 ) + { + ctx->count1 = ctx->count1 + 1; + if ( ctx->count1 == 0 ) + { + ctx->count2 = ctx->count2 + 1; + if ( ctx->count2 == 0 ) + ctx->count3 = ctx->count3 + 1; + } + } + c512_4way( ctx, buf ); + ptr = 0; + } + } + + uint32_t vp = ptr>>6; + // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200 + // Count is misaligned to 16 bits and straddles 2 vectors. + // Use u32 overlay to stage then u16 to load buf. + union + { + uint32_t u32[4]; + uint16_t u16[8]; + } count; + + count.u32[0] = ctx->count0 += (ptr << 1); // ptr/4 * 8 + count.u32[1] = ctx->count1; + count.u32[2] = ctx->count2; + count.u32[3] = ctx->count3; + + if ( vp == 0 ) // empty buf, xevan. + { + casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 ); + memset_zero_512( (__m512i*)buf + 1, 5 ); + ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; + } + else // half full buf, everyone else. + { + casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 ); + memset_zero_512( (__m512i*)buf + vp, 6 - vp ); + } + + casti_m512i( buf, 6 ) = m512_const1_128( + _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); + casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16( + 0x0200, count.u16[7], count.u16[6], count.u16[5], + count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); + + c512_4way( ctx, buf); + + casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 ); + casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 ); + casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 ); + casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 ); +} + + #endif // VAES diff --git a/algo/shavite/shavite-hash-4way.h b/algo/shavite/shavite-hash-4way.h index c179566..10ff095 100644 --- a/algo/shavite/shavite-hash-4way.h +++ b/algo/shavite/shavite-hash-4way.h @@ -18,6 +18,8 @@ void shavite512_4way_update( shavite512_4way_context *ctx, const void *data, void shavite512_4way_close( shavite512_4way_context *ctx, void *dst ); void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst, const void *data, size_t len ); +void shavite512_4way_full( shavite512_4way_context *ctx, void *dst, + const void *data, size_t len ); #endif // VAES diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c index c8ee6ab..f2652f3 100644 --- a/algo/simd/simd-hash-2way.c +++ b/algo/simd/simd-hash-2way.c @@ -1173,6 +1173,91 @@ int simd_4way_update_close( simd_4way_context *state, void *hashval, return 0; } +int simd512_4way_full( simd_4way_context *state, void *hashval, + const void *data, int datalen ) +{ + __m512i *A = (__m512i*)state->A; + + state->hashbitlen = 512; + state->n_feistels = 8; + state->blocksize = 128*8; + state->count = 0; + + for ( int i = 0; i < 8; i++ ) + A[i] = _mm512_set4_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2], + SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] ); + + int current, i; + int bs = state->blocksize; // bits in one lane + int isshort = 1; + uint64_t l; + int databitlen = datalen * 8; + + current = state->count & (bs - 1); + + while ( databitlen > 0 ) + { + if ( current == 0 && databitlen >= bs ) + { + // We can hash the data directly from the input buffer. 
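+         // databitlen counts bits per lane: one full block is bs bits
+         // per lane, i.e. 4*(bs/8) interleaved bytes, which is why data
+         // advances by 4*(bs/8) after each compress below.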
+ SIMD_4way_Compress( state, data, 0 ); + databitlen -= bs; + data += 4*( bs/8 ); + state->count += bs; + } + else + { + // Copy a chunk of data to the buffer + int len = bs - current; + if ( databitlen < len ) + { + memcpy( state->buffer + 4*( current/8 ), data, 4*( (databitlen)/8 ) ); + state->count += databitlen; + break; + } + else + { + memcpy( state->buffer + 4*(current/8), data, 4*(len/8) ); + state->count += len; + databitlen -= len; + data += 4*( len/8 ); + current = 0; + SIMD_4way_Compress( state, state->buffer, 0 ); + } + } + } + + current = state->count & (state->blocksize - 1); + + // If there is still some data in the buffer, hash it + if ( current ) + { + current = current / 8; + memset( state->buffer + 4*current, 0, 4*( state->blocksize/8 - current) ); + SIMD_4way_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, 4*( state->blocksize/8 ) ); + l = state->count; + for ( i = 0; i < 8; i++ ) + { + state->buffer[ i ] = l & 0xff; + state->buffer[ i+16 ] = l & 0xff; + state->buffer[ i+32 ] = l & 0xff; + state->buffer[ i+48 ] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_4way_Compress( state, state->buffer, isshort ); + memcpy( hashval, state->A, 4*( state->hashbitlen / 8 ) ); + return 0; +} + + + #endif // AVX512 //////////////////////////////////// @@ -1929,4 +2014,90 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval, return 0; } +int simd512_2way_full( simd_2way_context *state, void *hashval, + const void *data, int datalen ) +{ + __m256i *A = (__m256i*)state->A; + + state->hashbitlen = 512; + state->n_feistels = 8; + state->blocksize = 128*8; + state->count = 0; + + for ( int i = 0; i < 8; i++ ) + A[i] = _mm256_set_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2], + SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0], + SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2], + SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] ); + + int current, i; + int bs = state->blocksize; // bits in one lane + int isshort = 1; + uint64_t l; + int databitlen = datalen * 8; + + current = state->count & (bs - 1); + + while ( databitlen > 0 ) + { + if ( current == 0 && databitlen >= bs ) + { + // We can hash the data directly from the input buffer. 
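+         // As in the 4 way version databitlen is per lane: one block is
+         // bs bits per lane = 2*(bs/8) interleaved bytes, matching the
+         // data += 2*( bs/8 ) below.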
+ SIMD_2way_Compress( state, data, 0 ); + + databitlen -= bs; + data += 2*( bs/8 ); + state->count += bs; + } + else + { + // Copy a chunk of data to the buffer + int len = bs - current; + if ( databitlen < len ) + { + + memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) ); + state->count += databitlen; + break; + } + else + { + memcpy( state->buffer + 2*(current/8), data, 2*(len/8) ); + state->count += len; + databitlen -= len; + data += 2*( len/8 ); + current = 0; + SIMD_2way_Compress( state, state->buffer, 0 ); + } + } + } + + current = state->count & (state->blocksize - 1); + + // If there is still some data in the buffer, hash it + if ( current ) + { + current = ( current+7 ) / 8; + memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current) ); + SIMD_2way_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, 2*( state->blocksize/8 ) ); + l = state->count; + for ( i = 0; i < 8; i++ ) + { + state->buffer[ i ] = l & 0xff; + state->buffer[ i+16 ] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_2way_Compress( state, state->buffer, isshort ); + memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) ); + return 0; +} + + #endif diff --git a/algo/simd/simd-hash-2way.h b/algo/simd/simd-hash-2way.h index 69a4835..9aad145 100644 --- a/algo/simd/simd-hash-2way.h +++ b/algo/simd/simd-hash-2way.h @@ -26,6 +26,8 @@ int simd_4way_update( simd_4way_context *state, const void *data, int simd_4way_close( simd_4way_context *state, void *hashval ); int simd_4way_update_close( simd_4way_context *state, void *hashval, const void *data, int databitlen ); +int simd512_4way_full( simd_4way_context *state, void *hashval, + const void *data, int datalen ); #endif @@ -45,5 +47,8 @@ int simd_2way_update( simd_2way_context *state, const void *data, int simd_2way_close( simd_2way_context *state, void *hashval ); int simd_2way_update_close( simd_2way_context *state, void *hashval, const void *data, int databitlen ); +int simd512_2way_full( simd_2way_context *state, void *hashval, + const void *data, int datalen ); + #endif #endif diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index 7759d39..528f66f 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -45,18 +45,18 @@ extern "C"{ #endif /* -static const sph_u64 IV256[] = { - SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), - SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), - SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251), - SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13) +static const uint64_t IV256[] = { + 0xCCD044A12FDB3E13, 0xE83590301A79A9EB, + 0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB, + 0xEC06025E74DD7683, 0xE7A436CDC4746251, + 0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13 }; -static const sph_u64 IV512[] = { - SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), - SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), - SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), - SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) +static const uint64_t IV512[] = { + 0x4903ADFF749C51CE, 0x0D95DE399746DF03, + 0x8FD1934127C79BCE, 0x9A255629FF352CB1, + 0x5DB62599DF6CA7B0, 0xEABE394CA9D5C3F4, + 0x991112C71A75B523, 0xAE18A40B660FCC33 }; */ @@ -372,7 +372,7 @@ do { \ #define UBI_BIG_8WAY(etype, extra) \ do { \ - sph_u64 t0, t1, t2; \ + uint64_t t0, t1, t2; \ __m512i h8; \ __m512i m0 = buf[0]; \ __m512i m1 = buf[1]; \ @@ -391,8 +391,8 @@ do { \ 
__m512i p5 = m5; \ __m512i p6 = m6; \ __m512i p7 = m7; \ - t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ - t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ + t0 = (uint64_t)(bcount << 6) + (uint64_t)(extra); \ + t1 = (bcount >> 58) + ((uint64_t)(etype) << 55); \ TFBIG_KINIT_8WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \ TFBIG_8WAY_4e(0); \ TFBIG_8WAY_4o(1); \ @@ -425,7 +425,7 @@ do { \ #define DECL_STATE_BIG_8WAY \ __m512i h0, h1, h2, h3, h4, h5, h6, h7; \ - sph_u64 bcount; + uint64_t bcount; #endif // AVX512 @@ -488,7 +488,7 @@ do { \ // scale buf offset by 4 #define UBI_BIG_4WAY(etype, extra) \ do { \ - sph_u64 t0, t1, t2; \ + uint64_t t0, t1, t2; \ __m256i h8; \ __m256i m0 = buf[0]; \ __m256i m1 = buf[1]; \ @@ -507,8 +507,8 @@ do { \ __m256i p5 = m5; \ __m256i p6 = m6; \ __m256i p7 = m7; \ - t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ - t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ + t0 = (uint64_t)(bcount << 6) + (uint64_t)(extra); \ + t1 = (bcount >> 58) + ((uint64_t)(etype) << 55); \ TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \ TFBIG_4WAY_4e(0); \ TFBIG_4WAY_4o(1); \ @@ -542,7 +542,7 @@ do { \ #define DECL_STATE_BIG_4WAY \ __m256i h0, h1, h2, h3, h4, h5, h6, h7; \ - sph_u64 bcount; + uint64_t bcount; #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h index 3f58e95..73affc5 100644 --- a/algo/skein/skein-hash-4way.h +++ b/algo/skein/skein-hash-4way.h @@ -48,14 +48,8 @@ extern "C"{ #endif #include -#include "algo/sha/sph_types.h" #include "simd-utils.h" -// Output size in bits -#define SPH_SIZE_skein256 256 -#define SPH_SIZE_skein512 512 - - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) typedef struct @@ -63,11 +57,11 @@ typedef struct __m512i buf[8]; __m512i h0, h1, h2, h3, h4, h5, h6, h7; size_t ptr; - sph_u64 bcount; -} sph_skein_8way_big_context __attribute__ ((aligned (128))); + uint64_t bcount; +} skein_8way_big_context __attribute__ ((aligned (128))); -typedef sph_skein_8way_big_context skein512_8way_context; -typedef sph_skein_8way_big_context skein256_8way_context; +typedef skein_8way_big_context skein512_8way_context; +typedef skein_8way_big_context skein256_8way_context; void skein512_8way_init( skein512_8way_context *sc ); void skein512_8way_update( void *cc, const void *data, size_t len ); @@ -84,21 +78,19 @@ typedef struct __m256i buf[8]; __m256i h0, h1, h2, h3, h4, h5, h6, h7; size_t ptr; - sph_u64 bcount; -} sph_skein_4way_big_context __attribute__ ((aligned (128))); + uint64_t bcount; +} skein_4way_big_context __attribute__ ((aligned (128))); -typedef sph_skein_4way_big_context skein512_4way_context; -typedef sph_skein_4way_big_context skein256_4way_context; +typedef skein_4way_big_context skein512_4way_context; +typedef skein_4way_big_context skein256_4way_context; void skein512_4way_init( skein512_4way_context *sc ); void skein512_4way_update( void *cc, const void *data, size_t len ); void skein512_4way_close( void *cc, void *dst ); -//#define skein512_4way skein512_4way_update void skein256_4way_init( skein256_4way_context *sc ); void skein256_4way_update( void *cc, const void *data, size_t len ); void skein256_4way_close( void *cc, void *dst ); -//#define skein256_4way skein256_4way_update #ifdef __cplusplus } diff --git a/algo/x11/c11-4way.c b/algo/x11/c11-4way.c index 529bac4..fb95861 100644 --- a/algo/x11/c11-4way.c +++ b/algo/x11/c11-4way.c @@ 
-275,7 +275,7 @@ int scanhash_c11_8way( struct work *work, uint32_t max_nonce, pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( ( ( hash+(i<<3) )[7] < Htarg ) + if ( ( ( hash+(i<<3) )[7] <= Htarg ) && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { pdata[19] = n+i; @@ -443,37 +443,26 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce, int thr_id = mythr->id; // thr_id arg is deprecated __m256i *noncev = (__m256i*)vdata + 9; // aligned const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; mm256_bswap32_intrlv80_4x64( vdata, pdata ); - for (int m=0; m < 6; m++) - if (Htarg <= htmax[m]) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + do + { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - c11_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) - && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } + c11_4way_hash( hash, vdata ); + pdata[19] = n; + for ( int i = 0; i < 4; i++ ) + if ( ( ( hash+(i<<3) )[7] <= Htarg ) + && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 4; + } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); *hashes_done = n - first_nonce + 1; return 0; } diff --git a/algo/x11/c11.c b/algo/x11/c11.c index c18f587..5ebf40d 100644 --- a/algo/x11/c11.c +++ b/algo/x11/c11.c @@ -78,11 +78,9 @@ void c11_hash( void *output, const void *input ) sph_bmw512_close( &ctx.bmw, hash ); #if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash, (const char*)hash, 512 ); #else - sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, hash, 64 ); sph_groestl512_close( &ctx.groestl, hash ); #endif @@ -108,12 +106,12 @@ void c11_hash( void *output, const void *input ) update_final_sd( &ctx.simd, (BitSequence *)hash, (const BitSequence *)hash, 512 ); -#ifdef NO_AES_NI - sph_echo512( &ctx.echo, hash, 64 ); - sph_echo512_close( &ctx.echo, hash ); -#else +#if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, (const BitSequence *)hash, 512 ); +#else + sph_echo512( &ctx.echo, hash, 64 ); + sph_echo512_close( &ctx.echo, hash ); #endif memcpy(output, hash, 32); diff --git a/algo/x11/timetravel.c b/algo/x11/timetravel.c index 2688009..b148767 100644 --- a/algo/x11/timetravel.c +++ b/algo/x11/timetravel.c @@ -11,10 +11,10 @@ #include "algo/skein/sph_skein.h" #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" -#ifdef NO_AES_NI - #include "algo/groestl/sph_groestl.h" -#else +#ifdef __AES__ #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" #endif static __thread uint32_t s_ntime = UINT32_MAX; @@ -28,10 +28,10 @@ typedef struct { sph_keccak512_context keccak; hashState_luffa luffa; cubehashParam cube; -#ifdef NO_AES_NI - sph_groestl512_context groestl; -#else +#ifdef __AES__ hashState_groestl groestl; +#else + sph_groestl512_context groestl; #endif } tt_ctx_holder; @@ -47,10 +47,10 
@@ void init_tt8_ctx() sph_keccak512_init( &tt_ctx.keccak ); init_luffa( &tt_ctx.luffa, 512 ); cubehashInit( &tt_ctx.cube, 512, 16, 32 ); -#ifdef NO_AES_NI - sph_groestl512_init( &tt_ctx.groestl ); -#else +#ifdef __AES__ init_groestl( &tt_ctx.groestl, 64 ); +#else + sph_groestl512_init( &tt_ctx.groestl ); #endif }; @@ -110,7 +110,10 @@ void timetravel_hash(void *output, const void *input) } break; case 2: -#ifdef NO_AES_NI +#ifdef __AES__ + update_and_final_groestl( &ctx.groestl, (char*)hashB, + (char*)hashA, dataLen*8 ); +#else if ( i == 0 ) { memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl ); @@ -122,19 +125,6 @@ void timetravel_hash(void *output, const void *input) sph_groestl512( &ctx.groestl, hashA, dataLen ); sph_groestl512_close( &ctx.groestl, hashB ); } -#else -// groestl midstate is slower -// if ( i == 0 ) -// { -// memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl ); -// update_and_final_groestl( &ctx.groestl, (char*)hashB, -// (char*)input + midlen, tail*8 ); -// } -// else -// { - update_and_final_groestl( &ctx.groestl, (char*)hashB, - (char*)hashA, dataLen*8 ); -// } #endif break; case 3: @@ -253,13 +243,9 @@ int scanhash_timetravel( struct work *work, uint32_t max_nonce, sph_bmw512( &tt_mid.bmw, endiandata, 64 ); break; case 2: -#ifdef NO_AES_NI +#ifndef __AES__ memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) ); sph_groestl512( &tt_mid.groestl, endiandata, 64 ); -#else -// groestl midstate is slower -// memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) ); -// update_groestl( &tt_mid.groestl, (char*)endiandata, 64*8 ); #endif break; case 3: diff --git a/algo/x11/timetravel10.c b/algo/x11/timetravel10.c index 1566a50..5360be9 100644 --- a/algo/x11/timetravel10.c +++ b/algo/x11/timetravel10.c @@ -12,11 +12,10 @@ #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/nist.h" - -#ifdef NO_AES_NI - #include "algo/groestl/sph_groestl.h" -#else +#ifdef __AES__ #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" #endif static __thread uint32_t s_ntime = UINT32_MAX; @@ -32,10 +31,10 @@ typedef struct { cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; -#ifdef NO_AES_NI - sph_groestl512_context groestl; -#else +#ifdef __AES__ hashState_groestl groestl; +#else + sph_groestl512_context groestl; #endif } tt10_ctx_holder; @@ -53,10 +52,10 @@ void init_tt10_ctx() cubehashInit( &tt10_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &tt10_ctx.shavite ); init_sd( &tt10_ctx.simd, 512 ); -#ifdef NO_AES_NI - sph_groestl512_init( &tt10_ctx.groestl ); -#else +#ifdef __AES__ init_groestl( &tt10_ctx.groestl, 64 ); +#else + sph_groestl512_init( &tt10_ctx.groestl ); #endif }; @@ -116,7 +115,10 @@ void timetravel10_hash(void *output, const void *input) } break; case 2: -#ifdef NO_AES_NI +#ifdef __AES__ + update_and_final_groestl( &ctx.groestl, (char*)hashB, + (char*)hashA, dataLen*8 ); +#else if ( i == 0 ) { memcpy( &ctx.groestl, &tt10_mid.groestl, sizeof tt10_mid.groestl ); @@ -128,19 +130,6 @@ void timetravel10_hash(void *output, const void *input) sph_groestl512( &ctx.groestl, hashA, dataLen ); sph_groestl512_close( &ctx.groestl, hashB ); } -#else -// groestl midstate is slower -// if ( i == 0 ) -// { -// memcpy( &ctx.groestl, &tt10_mid.groestl, sizeof tt10_mid.groestl ); -// update_and_final_groestl( &ctx.groestl, (char*)hashB, -// (char*)input + midlen, tail*8 ); -// } -// else -// { - update_and_final_groestl( &ctx.groestl, (char*)hashB, - 
(char*)hashA, dataLen*8 ); -// } #endif break; case 3: @@ -286,13 +275,9 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce, sph_bmw512( &tt10_mid.bmw, endiandata, 64 ); break; case 2: -#ifdef NO_AES_NI +#ifndef __AES__ memcpy( &tt10_mid.groestl, &tt10_ctx.groestl, sizeof(tt10_mid.groestl ) ); sph_groestl512( &tt10_mid.groestl, endiandata, 64 ); -#else -// groestl midstate is slower -// memcpy( &tt10_mid.groestl, &tt10_ctx.groestl, sizeof(tt10_mid.groestl ) ); -// update_groestl( &tt10_mid.groestl, (char*)endiandata, 64*8 ); #endif break; case 3: diff --git a/algo/x11/tribus-4way.c b/algo/x11/tribus-4way.c index df49600..7e56268 100644 --- a/algo/x11/tribus-4way.c +++ b/algo/x11/tribus-4way.c @@ -124,7 +124,7 @@ int scanhash_tribus_8way( struct work *work, uint32_t max_nonce, pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( (hash+(i<<3))[7] < Htarg ) + if ( (hash+(i<<3))[7] <= Htarg ) if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { pdata[19] = n+i; @@ -209,7 +209,7 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce, pdata[19] = n; for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] < Htarg ) + if ( (hash+(i<<3))[7] <= Htarg ) if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { pdata[19] = n+i; diff --git a/algo/x11/tribus.c b/algo/x11/tribus.c index aa24757..89d1469 100644 --- a/algo/x11/tribus.c +++ b/algo/x11/tribus.c @@ -7,19 +7,19 @@ #include "algo/jh//sph_jh.h" #include "algo/keccak/sph_keccak.h" -#ifdef NO_AES_NI - #include "algo/echo/sph_echo.h" -#else +#ifdef __AES__ #include "algo/echo/aes_ni/hash_api.h" +#else + #include "algo/echo/sph_echo.h" #endif typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; -#ifdef NO_AES_NI - sph_echo512_context echo; -#else +#ifdef __AES__ hashState_echo echo; +#else + sph_echo512_context echo; #endif } tribus_ctx_holder; @@ -29,10 +29,10 @@ bool tribus_thread_init() { sph_jh512_init( &tribus_ctx.jh ); sph_keccak512_init( &tribus_ctx.keccak ); -#ifdef NO_AES_NI - sph_echo512_init( &tribus_ctx.echo ); -#else +#ifdef __AES__ init_echo( &tribus_ctx.echo, 512 ); +#else + sph_echo512_init( &tribus_ctx.echo ); #endif return true; } @@ -49,12 +49,12 @@ void tribus_hash(void *state, const void *input) sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); sph_keccak512_close( &ctx.keccak, (void*) hash ); -#ifdef NO_AES_NI - sph_echo512( &ctx.echo, hash, 64 ); - sph_echo512_close (&ctx.echo, hash ); -#else +#ifdef __AES__ update_final_echo( &ctx.echo, (BitSequence *) hash, (const BitSequence *) hash, 512 ); +#else + sph_echo512( &ctx.echo, hash, 64 ); + sph_echo512_close (&ctx.echo, hash ); #endif memcpy(state, hash, 32); diff --git a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c index 2fe47a7..44b6c8e 100644 --- a/algo/x11/x11-4way.c +++ b/algo/x11/x11-4way.c @@ -275,7 +275,7 @@ int scanhash_x11_8way( struct work *work, uint32_t max_nonce, pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( ( hash+(i<<3) )[7] < Htarg + if ( ( hash+(i<<3) )[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { pdata[19] = n+i; diff --git a/algo/x11/x11evo.c b/algo/x11/x11evo.c index cd818a0..4a88d25 100644 --- a/algo/x11/x11evo.c +++ b/algo/x11/x11evo.c @@ -1,13 +1,10 @@ #include "cpuminer-config.h" #include "x11evo-gate.h" - #include #include #include - #include "algo/blake/sph_blake.h" #include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" @@ -15,24 +12,24 @@ #include 
"algo/cubehash/sph_cubehash.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" - -#ifndef NO_AES_NI +#ifdef __AES__ #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" +#else + #include "algo/groestl/sph_groestl.h" + #include "algo/echo/sph_echo.h" #endif - #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; -#else +#ifdef __AES__ hashState_echo echo; hashState_groestl groestl; +#else + sph_groestl512_context groestl; + sph_echo512_context echo; #endif hashState_luffa luffa; cubehashParam cube; @@ -49,12 +46,12 @@ static x11evo_ctx_holder x11evo_ctx __attribute__ ((aligned (64))); void init_x11evo_ctx() { -#ifdef NO_AES_NI - sph_groestl512_init( &x11evo_ctx.groestl ); - sph_echo512_init( &x11evo_ctx.echo ); -#else +#ifdef __AES__ init_echo( &x11evo_ctx.echo, 512 ); init_groestl( &x11evo_ctx.groestl, 64 ); +#else + sph_groestl512_init( &x11evo_ctx.groestl ); + sph_echo512_init( &x11evo_ctx.echo ); #endif init_luffa( &x11evo_ctx.luffa, 512 ); cubehashInit( &x11evo_ctx.cube, 512, 16, 32 ); @@ -106,12 +103,12 @@ void x11evo_hash( void *state, const void *input ) sph_bmw512_close( &ctx.bmw, (char*)hash ); break; case 2: -#ifdef NO_AES_NI +#ifdef __AES__ + update_and_final_groestl( &ctx.groestl, (char*)hash, + (const char*)hash, 512 ); +#else sph_groestl512( &ctx.groestl, (char*)hash, size ); sph_groestl512_close( &ctx.groestl, (char*)hash ); -#else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); #endif break; case 3: @@ -142,12 +139,12 @@ void x11evo_hash( void *state, const void *input ) update_final_sd( &ctx.simd, (char*)hash, (const char*)hash, 512 ); break; case 10: -#ifdef NO_AES_NI +#ifdef __AES__ + update_final_echo( &ctx.echo, (char*)hash, + (const char*)hash, 512 ); +#else sph_echo512( &ctx.echo, (char*)hash, size ); sph_echo512_close( &ctx.echo, (char*)hash ); -#else - update_final_echo( &ctx.echo, (char*)hash, - (const char*)hash, 512 ); #endif break; } diff --git a/algo/x11/x11gost-4way.c b/algo/x11/x11gost-4way.c index cfef60a..345bf08 100644 --- a/algo/x11/x11gost-4way.c +++ b/algo/x11/x11gost-4way.c @@ -308,7 +308,7 @@ int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce, pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( ( hash+(i<<3) )[7] < Htarg + if ( ( hash+(i<<3) )[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { pdata[19] = n+i; @@ -482,37 +482,26 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce, int thr_id = mythr->id; __m256i *noncev = (__m256i*)vdata + 9; // aligned const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; mm256_bswap32_intrlv80_4x64( vdata, pdata ); - for (int m=0; m < 6; m++) - if (Htarg <= htmax[m]) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + do + { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x11gost_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) - && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - 
submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } + x11gost_4way_hash( hash, vdata ); + pdata[19] = n; + for ( int i = 0; i < 4; i++ ) + if ( ( hash+(i<<3) )[7] <= Htarg + && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 4; + } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); *hashes_done = n - first_nonce + 1; return 0; } diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c index 3cdb7ed..2eeb5c8 100644 --- a/algo/x11/x11gost.c +++ b/algo/x11/x11gost.c @@ -114,12 +114,12 @@ void x11gost_hash(void *output, const void *input) update_final_sd( &ctx.simd, (BitSequence *)hash, (const BitSequence *)hash, 512 ); -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#else +#if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, (const BitSequence *)hash, 512 ); +#else + sph_echo512(&ctx.echo, hash, 64); + sph_echo512_close(&ctx.echo, hash); #endif memcpy( output, hash, 32 ); diff --git a/algo/x12/x12-4way.c b/algo/x12/x12-4way.c index 8ae7960..71a081e 100644 --- a/algo/x12/x12-4way.c +++ b/algo/x12/x12-4way.c @@ -24,7 +24,6 @@ #if defined(X12_8WAY) - typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; @@ -96,7 +95,6 @@ void x12_8way_hash( void *state, const void *input ) #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); shavite512_4way_init( &ctx.shavite ); shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); @@ -151,19 +149,16 @@ void x12_8way_hash( void *state, const void *input ) #endif - simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); #if defined(__VAES__) - echo_4way_init( &ctx.echo, 512 ); echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); echo_4way_init( &ctx.echo, 512 ); echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); - groestl512_4way_init( &ctx.groestl, 64 ); groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); groestl512_4way_init( &ctx.groestl, 64 ); groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); @@ -174,7 +169,7 @@ void x12_8way_hash( void *state, const void *input ) dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - + update_final_echo( &ctx.echo, (BitSequence *)hash1, (const BitSequence *) hash1, 512 ); memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); diff --git a/algo/x13/phi1612.c b/algo/x13/phi1612.c index 63b7ab6..ea59095 100644 --- a/algo/x13/phi1612.c +++ b/algo/x13/phi1612.c @@ -1,18 +1,15 @@ #include "phi1612-gate.h" - #include #include #include #include - #include "algo/gost/sph_gost.h" #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/skein/sph_skein.h" #include "algo/jh/sph_jh.h" - -#ifndef NO_AES_NI +#ifdef __AES__ #include "algo/echo/aes_ni/hash_api.h" #endif @@ -22,10 +19,10 @@ typedef struct { cubehashParam cube; sph_fugue512_context fugue; sph_gost512_context gost; -#ifdef NO_AES_NI - sph_echo512_context echo; -#else +#ifdef __AES__ hashState_echo echo; +#else + sph_echo512_context echo; #endif } phi_ctx_holder; @@ -40,10 +37,10 @@ void init_phi1612_ctx() cubehashInit( 
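
[Note on the scanhash_x11gost_4way hunk above: besides the <= fix, it deletes the htmax[]/masks[] ladder, which chose a progressively stricter zero-mask for the top hash word based on target magnitude, and replaces it with one flat loop. With four lanes hashed per iteration and fulltest() only reached on pre-filter hits, the ladder bought essentially nothing; presumably simplicity won. The resulting structure, condensed from the new code:

do
{
   /* write n..n+3 into the four 64-bit lanes of the nonce vector */
   *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
       _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

   x11gost_4way_hash( hash, vdata );
   pdata[19] = n;

   for ( int i = 0; i < 4; i++ )
      if ( (hash+(i<<3))[7] <= Htarg
           && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
         pdata[19] = n + i;
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
   n += 4;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
]
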
&phi_ctx.cube, 512, 16, 32 ); sph_fugue512_init( &phi_ctx.fugue ); sph_gost512_init( &phi_ctx.gost ); -#ifdef NO_AES_NI - sph_echo512_init( &phi_ctx.echo ); -#else +#ifdef __AES__ init_echo( &phi_ctx.echo, 512 ); +#else + sph_echo512_init( &phi_ctx.echo ); #endif } @@ -64,9 +61,6 @@ void phi1612_hash(void *output, const void *input) sph_skein512( &ctx.skein, input + 64, 16 ); sph_skein512_close( &ctx.skein, hash ); -// sph_skein512( &ctx.skein, input, 80 ); -// sph_skein512_close( &ctx.skein, (void*)hash ); - sph_jh512( &ctx.jh, (const void*)hash, 64 ); sph_jh512_close( &ctx.jh, (void*)hash ); @@ -78,12 +72,12 @@ void phi1612_hash(void *output, const void *input) sph_gost512( &ctx.gost, hash, 64 ); sph_gost512_close( &ctx.gost, hash ); -#ifdef NO_AES_NI - sph_echo512( &ctx.echo, hash, 64 ); - sph_echo512_close( &ctx.echo, hash ); -#else +#ifdef __AES__ update_final_echo ( &ctx.echo, (BitSequence *)hash, (const BitSequence *)hash, 512 ); +#else + sph_echo512( &ctx.echo, hash, 64 ); + sph_echo512_close( &ctx.echo, hash ); #endif memcpy(output, hash, 32); diff --git a/algo/x13/x13-4way.c b/algo/x13/x13-4way.c index 2173f01..6fac9d8 100644 --- a/algo/x13/x13-4way.c +++ b/algo/x13/x13-4way.c @@ -315,7 +315,7 @@ int scanhash_x13_8way( struct work *work, uint32_t max_nonce, pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( ( hash+(i<<3) )[7] < Htarg + if ( ( hash+(i<<3) )[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { pdata[19] = n+i; diff --git a/algo/x13/x13bcd-4way.c b/algo/x13/x13bcd-4way.c index 01fb8a6..a6c7d19 100644 --- a/algo/x13/x13bcd-4way.c +++ b/algo/x13/x13bcd-4way.c @@ -527,7 +527,7 @@ int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce, mm256_bswap32_intrlv80_4x64( vdata, pdata ); blake512_4way_init( &x13bcd_ctx_mid ); - blake512_4way( &x13bcd_ctx_mid, vdata, 64 ); + blake512_4way_update( &x13bcd_ctx_mid, vdata, 64 ); do { *noncev = mm256_intrlv_blend_32( mm256_bswap_32( diff --git a/algo/x13/x13sm3-4way.c b/algo/x13/x13sm3-4way.c index 9cafa76..e71ce78 100644 --- a/algo/x13/x13sm3-4way.c +++ b/algo/x13/x13sm3-4way.c @@ -227,7 +227,7 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce, mm256_bswap32_intrlv80_4x64( vdata, pdata ); blake512_4way_init( &x13sm3_ctx_mid ); - blake512_4way( &x13sm3_ctx_mid, vdata, 64 ); + blake512_4way_update( &x13sm3_ctx_mid, vdata, 64 ); for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) diff --git a/algo/x13/x13sm3.c b/algo/x13/x13sm3.c index 21d6aaf..4bde68e 100644 --- a/algo/x13/x13sm3.c +++ b/algo/x13/x13sm3.c @@ -121,12 +121,12 @@ void x13sm3_hash(void *output, const void *input) (const BitSequence *)hash, 512 ); //11---echo--- -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#else +#ifdef __AES__ update_final_echo ( &ctx.echo, (BitSequence *)hash, (const BitSequence *)hash, 512 ); +#else + sph_echo512(&ctx.echo, hash, 64); + sph_echo512_close(&ctx.echo, hash); #endif uint32_t sm3_hash[32] __attribute__ ((aligned (32))); diff --git a/algo/x14/polytimos.c b/algo/x14/polytimos.c index 447359e..b282ac8 100644 --- a/algo/x14/polytimos.c +++ b/algo/x14/polytimos.c @@ -1,29 +1,27 @@ #include "polytimos-gate.h" - #include #include #include #include - #include "algo/skein/sph_skein.h" #include "algo/echo/sph_echo.h" #include "algo/fugue//sph_fugue.h" #include "algo/luffa/luffa_for_sse2.h" #include "algo/shabal/sph_shabal.h" #include "algo/gost/sph_gost.h" -#ifndef NO_AES_NI +#ifdef __AES__ #include "algo/echo/aes_ni/hash_api.h" #endif typedef struct { 
sph_skein512_context skein; - sph_shabal512_context shabal; -#ifdef NO_AES_NI - sph_echo512_context echo; + sph_shabal512_context shabal; +#ifdef __AES__ + hashState_echo echo; #else - hashState_echo echo; + sph_echo512_context echo; #endif - hashState_luffa luffa; + hashState_luffa luffa; sph_fugue512_context fugue; sph_gost512_context gost; } poly_ctx_holder; @@ -33,15 +31,15 @@ poly_ctx_holder poly_ctx; void init_polytimos_ctx() { sph_skein512_init(&poly_ctx.skein); - sph_shabal512_init(&poly_ctx.shabal); -#ifdef NO_AES_NI - sph_echo512_init(&poly_ctx.echo); + sph_shabal512_init(&poly_ctx.shabal); +#ifdef __AES__ + init_echo( &poly_ctx.echo, 512 ); #else - init_echo( &poly_ctx.echo, 512 ); + sph_echo512_init(&poly_ctx.echo); #endif - init_luffa( &poly_ctx.luffa, 512 ); - sph_fugue512_init(&poly_ctx.fugue); - sph_gost512_init(&poly_ctx.gost); + init_luffa( &poly_ctx.luffa, 512 ); + sph_fugue512_init(&poly_ctx.fugue); + sph_gost512_init(&poly_ctx.gost); } void polytimos_hash(void *output, const void *input) @@ -56,12 +54,12 @@ void polytimos_hash(void *output, const void *input) sph_shabal512(&ctx.shabal, hashA, 64); sph_shabal512_close(&ctx.shabal, hashA); -#ifdef NO_AES_NI +#ifdef __AES__ + update_final_echo ( &ctx.echo, (BitSequence *)hashA, + (const BitSequence *)hashA, 512 ); +#else sph_echo512(&ctx.echo, hashA, 64); sph_echo512_close(&ctx.echo, hashA); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hashA, - (const BitSequence *)hashA, 512 ); #endif update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA, diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c index 70d7c06..42f8f43 100644 --- a/algo/x14/x14-4way.c +++ b/algo/x14/x14-4way.c @@ -317,7 +317,7 @@ int scanhash_x14_8way( struct work *work, uint32_t max_nonce, uint32_t *hash7 = &(hash[7<<3]); for ( int lane = 0; lane < 8; lane++ ) - if ( hash7[ lane ] < Htarg ) + if ( hash7[ lane ] <= Htarg ) { uint32_t lane_hash[8] __attribute__ ((aligned (64))); extr_lane_8x32( lane_hash, hash, lane, 256 ); @@ -526,7 +526,7 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce, uint32_t *hash7 = &(hash[7<<2]); for ( int lane = 0; lane < 4; lane++ ) - if ( hash7[ lane ] < Htarg ) + if ( hash7[ lane ] <= Htarg ) { uint32_t lane_hash[8]; extr_lane_4x32( lane_hash, hash, lane, 256 ); diff --git a/algo/x14/x14.c b/algo/x14/x14.c index 0ba16f5..401b084 100644 --- a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -88,11 +88,9 @@ void x14hash(void *output, const void *input) sph_bmw512_close( &ctx.bmw, hash ); #if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash, (const char*)hash, 512 ); #else - sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, hash, 64 ); sph_groestl512_close( &ctx.groestl, hash ); #endif @@ -118,12 +116,12 @@ void x14hash(void *output, const void *input) update_final_sd( &ctx.simd, (BitSequence *)hash, (const BitSequence *)hash, 512 ); -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#else +#if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, (const BitSequence *)hash, 512 ); +#else + sph_echo512(&ctx.echo, hash, 64); + sph_echo512_close(&ctx.echo, hash); #endif sph_hamsi512(&ctx.hamsi, hash, 64); diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c index 4af9499..f9feb07 100644 --- a/algo/x15/x15-4way.c +++ b/algo/x15/x15-4way.c @@ -360,7 +360,7 @@ int scanhash_x15_8way( struct work *work, uint32_t max_nonce, pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( ( hash+(i<<3) )[7] < 
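
[Note on the pattern running from timetravel through polytimos above: every #ifdef NO_AES_NI block is rewritten as #ifdef __AES__ with the branches swapped, so the AES-NI implementation is selected positively by the compiler-defined __AES__ macro (set by -maes or -march=native) rather than negatively by a build-system flag. The recurring shape, with wrapper macros invented here purely to show the two branches side by side:

#ifdef __AES__
   #include "algo/echo/aes_ni/hash_api.h"
   typedef hashState_echo echo512_ctx;
   #define echo512_reset( c )  init_echo( (c), 512 )
   #define echo512_hash( c, out, in, len ) \
      update_final_echo( (c), (BitSequence*)(out), \
                         (const BitSequence*)(in), (len)*8 )
#else
   #include "algo/echo/sph_echo.h"
   typedef sph_echo512_context echo512_ctx;
   #define echo512_reset( c )  sph_echo512_init( c )
   #define echo512_hash( c, out, in, len ) \
      do { sph_echo512( (c), (in), (len) ); \
           sph_echo512_close( (c), (out) ); } while (0)
#endif
]
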
Htarg ) + if ( ( hash+(i<<3) )[7] <= Htarg ) if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { pdata[19] = n+i; @@ -588,7 +588,7 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce, pdata[19] = n; for ( int i = 0; i < 4; i++ ) - if ( ( hash+(i<<3) )[7] < Htarg ) + if ( ( hash+(i<<3) )[7] <= Htarg ) if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) { pdata[19] = n+i; diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 1eca8a7..73f15fd 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -65,6 +65,7 @@ union _x16r_8way_context_overlay typedef union _x16r_8way_context_overlay x16r_8way_context_overlay; + void x16r_8way_hash( void* output, const void* input ) { uint32_t vhash[24*8] __attribute__ ((aligned (128))); @@ -98,18 +99,16 @@ void x16r_8way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_8way_init( &ctx.blake ); if ( i == 0 ) - blake512_8way_update( &ctx.blake, input, size ); + blake512_8way_full( &ctx.blake, vhash, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - blake512_8way_update( &ctx.blake, vhash, size ); + blake512_8way_full( &ctx.blake, vhash, vhash, size ); } - blake512_8way_close( &ctx.blake, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7, vhash ); break; case BMW: bmw512_8way_init( &ctx.bmw ); @@ -128,40 +127,22 @@ void x16r_8way_hash( void* output, const void* input ) case GROESTL: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + groestl512_4way_full( &ctx.groestl, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + groestl512_4way_full( &ctx.groestl, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (const char*)in4, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - (const char*)in5, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (const char*)in6, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (const char*)in7, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)in4, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash5, 
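
[Note on the x14.c hunk above: the per-call init_groestl() / sph_groestl512_init() preceding each one-shot hash is dropped. This is safe on the assumption, consistent with the context-holder structs visible in phi1612 and tribus in this same patch, that x14hash begins by memcpy'ing a file-scope holder initialized once at startup, so every call already starts from fresh states. A hypothetical miniature of that pattern:

#include <string.h>

typedef struct { int state; } stub_ctx;             /* stand-in for hash states */
typedef struct { stub_ctx groestl, echo; } holder_t;

static holder_t x14_holder;                         /* initialized exactly once */

void init_holder( void )
{
   x14_holder.groestl.state = 0;                    /* real code: init_groestl() */
   x14_holder.echo.state    = 0;                    /* real code: init_echo()    */
}

void hash_one( void *out, const void *in )
{
   holder_t ctx;
   memcpy( &ctx, &x14_holder, sizeof ctx );         /* fresh copies, no re-init */
   (void)out; (void)in;                             /* pipeline then runs on ctx */
}
]
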
(char*)in5, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)in6, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 ); #endif - break; + break; case SKEIN: skein512_8way_init( &ctx.skein ); if ( i == 0 ) @@ -206,33 +187,27 @@ void x16r_8way_hash( void* output, const void* input ) break; case LUFFA: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, size ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, size); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case CUBEHASH: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); + cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); + cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case SHAVITE: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size ); + shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size ); + shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else sph_shavite512_init( &ctx.shavite ); @@ -260,54 +235,42 @@ void x16r_8way_hash( void* output, const void* input ) sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); #endif - break; + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence 
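
[Note on the x16r hunks above, and the x17/xevan ones later in this patch: the init/update/close triples collapse into one-shot helpers such as blake512_8way_full, groestl512_full, luffa512_4way_full, cube_4way_full, simd512_4way_full, shavite512_4way_full and echo_full. Beyond shrinking the call sites, a fused entry point lets the implementation skip streaming bookkeeping when the whole message is in hand. Note the length conventions the new calls show: the serial groestl512_full still takes a bit count (size<<3) while groestl512_4way_full takes bytes. Treated purely as a wrapper, the 4-way variant reduces to this sketch (context type name assumed from these files; the real helper may fuse the init away entirely):

#include <stddef.h>

void groestl512_4way_full_sketch( groestl512_4way_context *ctx, void *out,
                                  void *in, size_t byte_len )
{
   groestl512_4way_init( ctx, 64 );                          /* 64-byte digest */
   groestl512_4way_update_close( ctx, out, in, byte_len << 3 );  /* takes bits */
}
]
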
*)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash4, - (const BitSequence*)in4, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash5, - (const BitSequence*)in5, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash6, - (const BitSequence*)in6, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash7, - (const BitSequence*)in7, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)in4, size ); + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)in5, size ); + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const BitSequence *)in6, size ); + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)in7, size ); #endif - break; + break; case HAMSI: intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); @@ -317,7 +280,7 @@ void x16r_8way_hash( void* output, const void* input ) hamsi512_8way_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - break; + break; case FUGUE: sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, in0, size ); @@ -380,13 +343,18 @@ void x16r_8way_hash( void* output, const void* input ) sph_whirlpool_close( &ctx.whirlpool, hash7 ); break; case SHA_512: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, size ); + if ( i == 0 ) + sha512_8way_update( &ctx.sha512, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + sha512_8way_update( &ctx.sha512, vhash, size ); + } sha512_8way_close( &ctx.sha512, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); + hash7, vhash ); break; } size = 64; @@ -431,7 +399,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder ); s_ntime = ntime; if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); + applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } do @@ -505,15 +473,13 @@ void x16r_4way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way_update( &ctx.blake, input, size ); + blake512_4way_full( &ctx.blake, vhash, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way_update( &ctx.blake, vhash, size ); + blake512_4way_full( &ctx.blake, vhash, vhash, size ); } - blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case BMW: @@ -529,18 +495,10 @@ void x16r_4way_hash( void* output, const void* input ) dintrlv_4x64_512( hash0, hash1, hash2, 
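
[Note on the BLAKE and SHA-512 cases in x16r above: both gain an i == 0 fast path. For the first function in the chain the message is the original 80-byte header, which the caller already supplies in interleaved form, so re-interleaving the per-lane buffers would be pure overhead. The shape of the pattern as the 8-way SHA-512 case now reads:

sha512_8way_init( &ctx.sha512 );
if ( i == 0 )            /* first stage: input is already 8x64 interleaved */
   sha512_8way_update( &ctx.sha512, input, size );
else                     /* later stages: repack the lane buffers first */
{
   intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 );
   sha512_8way_update( &ctx.sha512, vhash, size );
}
sha512_8way_close( &ctx.sha512, vhash );
]
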
hash3, vhash ); break; case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); break; case SKEIN: skein512_4way_init( &ctx.skein ); @@ -580,12 +538,10 @@ void x16r_4way_hash( void* output, const void* input ) break; case LUFFA: intrlv_2x128( vhash, in0, in1, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); dintrlv_2x128_512( hash0, hash1, vhash ); intrlv_2x128( vhash, in2, in3, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); dintrlv_2x128_512( hash2, hash3, vhash ); break; case CUBEHASH: @@ -618,27 +574,21 @@ void x16r_4way_hash( void* output, const void* input ) break; case SIMD: intrlv_2x128( vhash, in0, in1, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_2way_full( &ctx.simd, vhash, vhash, size ); dintrlv_2x128_512( hash0, hash1, vhash ); intrlv_2x128( vhash, in2, in3, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_2way_full( &ctx.simd, vhash, vhash, size ); dintrlv_2x128_512( hash2, hash3, vhash ); break; case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); break; case HAMSI: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); @@ -727,7 +677,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder ); s_ntime = ntime; if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); + applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } do diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 8f7e339..1dc9cee 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -39,9 +39,13 @@ #include #endif +#if defined(X21S_8WAY) || defined(X21S_4WAY) + static __thread uint32_t s_ntime = UINT32_MAX; 
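
[Note on the x21s-4way.c hunk that begins just above: the file-scope thread-locals are fenced behind the same feature tests that gate the parallel implementations, presumably so builds that compile neither path do not carry, or warn about, unused definitions:

#if defined(X21S_8WAY) || defined(X21S_4WAY)

static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };

#endif
]
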
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; +#endif + #if defined (X21S_8WAY) static __thread uint64_t* x21s_8way_matrix; diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 83d4712..a08ad6a 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -72,27 +72,19 @@ void x17_8way_hash( void *state, const void *input ) uint64_t hash7[8] __attribute__ ((aligned (64))); x17_8way_context_overlay ctx; - // 1 Blake - blake512_8way_init( &ctx.blake ); - blake512_8way_update( &ctx.blake, input, 80 ); - blake512_8way_close( &ctx.blake, vhash ); + blake512_8way_full( &ctx.blake, vhash, input, 80 ); - // 2 Bmw bmw512_8way_init( &ctx.bmw ); bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); - // 3 Groestl - #if defined(__VAES__) rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); - + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 ); + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); #else @@ -100,65 +92,44 @@ void x17_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); #endif - // 4 Skein parallel 4 way 64 bit skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); - // 5 JH jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, 64 ); jh512_8way_close( &ctx.jh, vhash ); - // 6 Keccak keccak512_8way_init( &ctx.keccak ); keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - // 7 Luffa - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); - luffa_4way_init( &ctx.luffa, 512 
); - luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 ); + luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 ); - // 8 Cubehash - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - - // 9 Shavite + cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); + cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 ); #else @@ -195,20 +166,13 @@ void x17_8way_hash( void *state, const void *input ) #endif - // 10 Simd - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); - - // 11 Echo + simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 ); + simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 ); #if defined(__VAES__) - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); @@ -217,36 +181,27 @@ void x17_8way_hash( void *state, const void *input ) dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash4, - (const BitSequence *) hash4, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash5, - (const BitSequence *) hash5, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash6, - (const BitSequence *) hash6, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash7, - (const BitSequence *) hash7, 512 ); - + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)hash0, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)hash1, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)hash2, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)hash3, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)hash4, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)hash5, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const BitSequence *)hash6, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)hash7, 64 ); + 
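
[Note on the VAES branches above: Groestl, SHAvite, SIMD and ECHO each process four lanes at once from 512-bit registers holding 128 bits per lane per row; intrlv_4x128 / dintrlv_4x128 convert between that layout and four linear buffers. A plain-C model of the forward conversion, with the layout inferred from how the calls pair up (the real routines use vector loads and stores):

#include <stdint.h>

/* 128-bit row r of lane k lands at vector row r, slot k.
   bitlen is the per-lane message length in bits.          */
void intrlv_4x128_model( uint64_t *dst, const uint64_t *s0, const uint64_t *s1,
                         const uint64_t *s2, const uint64_t *s3, int bitlen )
{
   const uint64_t *src[4] = { s0, s1, s2, s3 };
   for ( int r = 0; r < bitlen / 128; r++ )
      for ( int k = 0; k < 4; k++ )
      {
         dst[ (r*4 + k)*2     ] = src[k][ r*2     ];
         dst[ (r*4 + k)*2 + 1 ] = src[k][ r*2 + 1 ];
      }
}
]
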
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); -#endif - // 12 Hamsi +#endif hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); @@ -255,7 +210,6 @@ void x17_8way_hash( void *state, const void *input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - // 13 Fugue serial sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); @@ -281,7 +235,6 @@ void x17_8way_hash( void *state, const void *input ) sph_fugue512( &ctx.fugue, hash7, 64 ); sph_fugue512_close( &ctx.fugue, hash7 ); - // 14 Shabal, parallel 8 way 32 bit intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -292,7 +245,6 @@ void x17_8way_hash( void *state, const void *input ) dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - // 15 Whirlpool serial sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash0, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); @@ -318,7 +270,6 @@ void x17_8way_hash( void *state, const void *input ) sph_whirlpool( &ctx.whirlpool, hash7, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash7 ); - // 16 SHA512 parallel 64 bit intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -326,7 +277,6 @@ void x17_8way_hash( void *state, const void *input ) sha512_8way_update( &ctx.sha512, vhash, 64 ); sha512_8way_close( &ctx.sha512, vhash ); - // 17 Haval parallel 32 bit rintrlv_8x64_8x32( vhashA, vhash, 512 ); haval256_5_8way_init( &ctx.haval ); @@ -410,91 +360,59 @@ void x17_4way_hash( void *state, const void *input ) uint64_t hash3[8] __attribute__ ((aligned (64))); x17_4way_context_overlay ctx; - // 1 Blake parallel 4 way 64 bit - blake512_4way_init( &ctx.blake ); - blake512_4way_update( &ctx.blake, input, 80 ); - blake512_4way_close( &ctx.blake, vhash ); + blake512_4way_full( &ctx.blake, vhash, input, 80 ); - // 2 Bmw bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); - // Serialize dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - // 3 Groestl - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - // Parallellize intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - // 4 Skein parallel 4 way 64 bit skein512_4way_init( &ctx.skein ); skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); - // 5 JH jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); - // 6 Keccak keccak512_4way_init( &ctx.keccak ); keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); - // 7 Luffa parallel 2 way 128 bit rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( 
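
[Note on a signature detail in the echo_full and cube_4way_full calls above: both carry an explicit 512 between output and input. ECHO and CubeHash are parameterized by digest size, so the one-shot helpers fold the size argument of init_echo / cubehashInit into the call, and the final length argument switches from bits to bytes. From the lines this patch replaces, the serial call expands as:

/* new one-shot form: out, digest bits, in, input bytes */
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
           (const BitSequence *)hash0, 64 );

/* replaces the deleted pair, where the final argument was in bits */
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
                   (const BitSequence *)hash0, 512 );
]
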
&ctx.luffa, vhashA, vhashA, 64 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 ); + luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 ); - // 8 Cubehash - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 ); - cube_2way_init( &ctx.cube, 512, 16, 32 ); - cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); + cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); - // 9 Shavite - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); - shavite512_2way_init( &ctx.shavite ); - shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 ); - // 10 Simd - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); + simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); - // 11 Echo serial - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)hash0, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)hash1, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)hash2, 64 ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)hash3, 64 ); - // 12 Hamsi parallel 4 way 64 bit intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); @@ -503,7 +421,6 @@ void x17_4way_hash( void *state, const void *input ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - // 13 Fugue serial sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); @@ -517,7 +434,6 @@ void x17_4way_hash( void *state, const void *input ) sph_fugue512( &ctx.fugue, hash3, 64 ); sph_fugue512_close( &ctx.fugue, hash3 ); - // 14 Shabal, parallel 4 way 32 bit intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); @@ -526,7 +442,6 @@ void x17_4way_hash( void *state, const void *input ) dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - // 15 Whirlpool serial sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, hash0, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); @@ -540,14 +455,12 @@ void x17_4way_hash( void *state, const void *input ) sph_whirlpool( &ctx.whirlpool, hash3, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); - // 16 SHA512 parallel 64 bit intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); sha512_4way_init( &ctx.sha512 ); sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); - // 17 Haval parallel 32 bit 
rintrlv_4x64_4x32( vhashB, vhash, 512 ); haval256_5_4way_init( &ctx.haval ); diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 5880d3f..4ac3dd4 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -11,9 +11,8 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cube-hash-2way.h" -#include "algo/shavite/shavite-hash-2way.h" #include "algo/shavite/sph_shavite.h" -#include "algo/cubehash/cubehash_sse2.h" +#include "algo/shavite/shavite-hash-2way.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -74,9 +73,7 @@ void xevan_8way_hash( void *output, const void *input ) const int dataLen = 128; xevan_8way_context_overlay ctx __attribute__ ((aligned (64))); - blake512_8way_init( &ctx.blake ); - blake512_8way_update( &ctx.blake, input, 80 ); - blake512_8way_close( &ctx.blake, vhash ); + blake512_8way_full( &ctx.blake, vhash, input, 80 ); memset( &vhash[8<<3], 0, 64<<3 ); bmw512_8way_init( &ctx.bmw ); @@ -87,10 +84,8 @@ void xevan_8way_hash( void *output, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, dataLen<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, dataLen<<3 ); + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, dataLen ); + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, dataLen ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); @@ -99,30 +94,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, - dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, dataLen<<3 ); intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -143,22 +122,16 @@ void xevan_8way_hash( void *output, const void *input ) rintrlv_8x64_4x128( 
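
[Note on the x17 functions above: with the per-stage numbering comments stripped, the code now leans on the interleave calls to document the data flow. The 8-way pipeline moves through four layouts: 8x64 for the 64-bit-word algorithms (blake, bmw, skein, jh, keccak, hamsi, sha512), two 4x128 halves for the 128-bit-lane stages (luffa, cube, shavite, simd, echo), 8x32 for the 32-bit ones (shabal, haval), and plain linear buffers for the serial sph stages (fugue, whirlpool). The rintrlv_* helpers convert directly between vector layouts with no serial round trip. Condensed from the hunks above:

blake512_8way_full( &ctx.blake, vhash, input, 80 );   /* 8x64 lanes           */
/* ... bmw, skein, jh, keccak remain 8x64 ... */
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );     /* -> two 4x128 halves  */
luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 );
luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 );
/* ... cube, shavite, simd, echo stay 4x128 ... */
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );     /* back to 8x64         */
/* hamsi 8x64; fugue serial; shabal 8x32; whirlpool serial;
   sha512 8x64; haval 8x32                                                    */
]
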
vhashA, vhashB, vhash, dataLen<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen ); + luffa512_4way_full( &ctx.luffa, vhashA, vhashA, dataLen ); + luffa512_4way_full( &ctx.luffa, vhashB, vhashB, dataLen ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); + cube_4way_full( &ctx.cube, vhashA, 512, vhashA, dataLen ); + cube_4way_full( &ctx.cube, vhashB, 512, vhashB, dataLen ); #if defined(__VAES__) - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, dataLen ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, dataLen ); + shavite512_4way_full( &ctx.shavite, vhashA, vhashA, dataLen ); + shavite512_4way_full( &ctx.shavite, vhashB, vhashB, dataLen ); #else @@ -195,17 +168,13 @@ void xevan_8way_hash( void *output, const void *input ) #endif - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); + simd512_4way_full( &ctx.simd, vhashA, vhashA, dataLen ); + simd512_4way_full( &ctx.simd, vhashB, vhashB, dataLen ); #if defined(__VAES__) - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashA, vhashA, dataLen<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhashB, vhashB, dataLen<<3 ); + echo_4way_full( &ctx.echo, vhashA, 512, vhashA, dataLen ); + echo_4way_full( &ctx.echo, vhashB, 512, vhashB, dataLen ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); @@ -214,31 +183,23 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const BitSequence *) hash0, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash4, - (const BitSequence *) hash4, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash5, - (const BitSequence *) hash5, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash6, - (const BitSequence *) hash6, dataLen<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo( &ctx.echo, (BitSequence *)hash7, - (const BitSequence *) hash7, dataLen<<3 ); - + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)hash0, dataLen ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)hash1, dataLen ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)hash2, dataLen ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)hash3, dataLen ); + echo_full( &ctx.echo, 
(BitSequence *)hash4, 512, + (const BitSequence *)hash4, dataLen ); + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)hash5, dataLen ); + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const BitSequence *)hash6, dataLen ); + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)hash7, dataLen ); + intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -328,9 +289,7 @@ void xevan_8way_hash( void *output, const void *input ) memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 ); - blake512_8way_init( &ctx.blake ); - blake512_8way_update( &ctx.blake, vhash, dataLen ); - blake512_8way_close(&ctx.blake, vhash); + blake512_8way_full( &ctx.blake, vhash, vhash, dataLen ); bmw512_8way_init( &ctx.bmw ); bmw512_8way_update( &ctx.bmw, vhash, dataLen ); @@ -340,10 +299,8 @@ void xevan_8way_hash( void *output, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, dataLen<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, dataLen<<3 ); + groestl512_4way_full( &ctx.groestl, vhashA, vhashA, dataLen ); + groestl512_4way_full( &ctx.groestl, vhashB, vhashB, dataLen ); rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); @@ -352,30 +309,14 @@ void xevan_8way_hash( void *output, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, - dataLen<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, - dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, dataLen<<3 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, dataLen<<3 ); intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); @@ -396,22 +337,16 @@ void xevan_8way_hash( void *output, const void *input ) rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhashB, 
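
[Note on xevan's defining quirk, visible in the memsets above: every stage digests a fixed dataLen = 128 bytes, so the 64-byte output of the first blake pass, and later the 32-byte digest that seeds the second pass, are zero-extended before hashing continues. In 8-way interleaved uint64_t form a per-lane byte offset B starts at element index B (eight lanes of 8-byte words), so padding all eight lanes is a single contiguous memset:

/* first pass: zero the lanes' bytes 64..127 after the 64-byte blake output */
blake512_8way_full( &ctx.blake, vhash, input, 80 );
memset( &vhash[ 8<<3 ], 0, 64<<3 );

/* second pass: the 32-byte intermediate digest, zero-extended the same way */
memset( &vhash[ 4<<3 ], 0, (dataLen - 32) << 3 );
]
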
 vhashB, dataLen );
+   luffa512_4way_full( &ctx.luffa, vhashA, vhashA, dataLen );
+   luffa512_4way_full( &ctx.luffa, vhashB, vhashB, dataLen );

-   cube_4way_init( &ctx.cube, 512, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
-   cube_4way_init( &ctx.cube, 512, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
+   cube_4way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
+   cube_4way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );

 #if defined(__VAES__)

-   shavite512_4way_init( &ctx.shavite );
-   shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
-   shavite512_4way_init( &ctx.shavite );
-   shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
+   shavite512_4way_full( &ctx.shavite, vhashA, vhashA, dataLen );
+   shavite512_4way_full( &ctx.shavite, vhashB, vhashB, dataLen );

 #else
@@ -448,17 +383,13 @@ void xevan_8way_hash( void *output, const void *input )
 #endif

-   simd_4way_init( &ctx.simd, 512 );
-   simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
-   simd_4way_init( &ctx.simd, 512 );
-   simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
+   simd512_4way_full( &ctx.simd, vhashA, vhashA, dataLen );
+   simd512_4way_full( &ctx.simd, vhashB, vhashB, dataLen );

 #if defined(__VAES__)

-   echo_4way_init( &ctx.echo, 512 );
-   echo_4way_update_close( &ctx.echo, vhashA, vhashA, dataLen<<3 );
-   echo_4way_init( &ctx.echo, 512 );
-   echo_4way_update_close( &ctx.echo, vhashB, vhashB, dataLen<<3 );
+   echo_4way_full( &ctx.echo, vhashA, 512, vhashA, dataLen );
+   echo_4way_full( &ctx.echo, vhashB, 512, vhashB, dataLen );

    rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 );

@@ -467,30 +398,22 @@ void xevan_8way_hash( void *output, const void *input )
    dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
    dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );

-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                      (const BitSequence *) hash0, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                      (const BitSequence *) hash1, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                      (const BitSequence *) hash2, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                      (const BitSequence *) hash3, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash4,
-                      (const BitSequence *) hash4, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash5,
-                      (const BitSequence *) hash5, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash6,
-                      (const BitSequence *) hash6, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash7,
-                      (const BitSequence *) hash7, dataLen<<3 );
+   echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+              (const BitSequence *)hash0, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+              (const BitSequence *)hash1, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+              (const BitSequence *)hash2, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+              (const BitSequence *)hash3, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash4, 512,
+              (const BitSequence *)hash4, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash5, 512,
+              (const BitSequence *)hash5, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash6, 512,
+              (const BitSequence *)hash6, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash7, 512,
+              (const BitSequence *)hash7, dataLen );

    intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                 hash7, dataLen<<3 );

@@ -657,9 +580,7 @@ void xevan_4way_hash( void *output, const void *input )

    // parallel 4 way

-   blake512_4way_init( &ctx.blake );
-   blake512_4way_update( &ctx.blake, input, 80 );
-   blake512_4way_close(&ctx.blake, vhash);
+   blake512_4way_full( &ctx.blake, vhash, input, 80 );
    memset( &vhash[8<<2], 0, 64<<2 );

    bmw512_4way_init( &ctx.bmw );
@@ -669,18 +590,10 @@ void xevan_4way_hash( void *output, const void *input )
    // Serial
    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
-                             dataLen<<3 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
-                             dataLen<<3 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
-                             dataLen<<3 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
-                             dataLen<<3 );
+   groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
+   groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 );
+   groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
+   groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );

    // Parallel 4way
    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -699,15 +612,11 @@ void xevan_4way_hash( void *output, const void *input )

    rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );

-   luffa_2way_init( &ctx.luffa, 512 );
-   luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
-   luffa_2way_init( &ctx.luffa, 512 );
-   luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
+   luffa512_2way_full( &ctx.luffa, vhashA, vhashA, dataLen );
+   luffa512_2way_full( &ctx.luffa, vhashB, vhashB, dataLen );

-   cube_2way_init( &ctx.cube, 512, 16, 32 );
-   cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
-   cube_2way_init( &ctx.cube, 512, 16, 32 );
-   cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
+   cube_2way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
+   cube_2way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );

    shavite512_2way_init( &ctx.shavite );
    shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
@@ -722,18 +631,15 @@ void xevan_4way_hash( void *output, const void *input )
    dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
    dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );

-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                      (const BitSequence *) hash0, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                      (const BitSequence *) hash1, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                      (const BitSequence *) hash2, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                      (const BitSequence *) hash3, dataLen<<3 );
+   echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+              (const BitSequence *)hash0, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+              (const BitSequence *)hash1, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+              (const BitSequence *)hash2, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+              (const BitSequence *)hash3, dataLen );
+
+   // Parallel
    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

@@ -805,18 +711,10 @@ void xevan_4way_hash( void *output, const void *input )

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
-                             dataLen<<3 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
-                             dataLen<<3 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
-                             dataLen<<3 );
-   init_groestl( &ctx.groestl, 64 );
-   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
-                             dataLen<<3 );
+   groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
+   groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 );
+   groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
+   groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );

    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

@@ -834,15 +732,11 @@ void xevan_4way_hash( void *output, const void *input )

    rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );

-   luffa_2way_init( &ctx.luffa, 512 );
-   luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
-   luffa_2way_init( &ctx.luffa, 512 );
-   luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
+   luffa512_2way_full( &ctx.luffa, vhashA, vhashA, dataLen );
+   luffa512_2way_full( &ctx.luffa, vhashB, vhashB, dataLen );

-   cube_2way_init( &ctx.cube, 512, 16, 32 );
-   cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
-   cube_2way_init( &ctx.cube, 512, 16, 32 );
-   cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
+   cube_2way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
+   cube_2way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );

    shavite512_2way_init( &ctx.shavite );
    shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
@@ -857,18 +751,14 @@ void xevan_4way_hash( void *output, const void *input )
    dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
    dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );

-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash0,
-                      (const BitSequence *) hash0, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash1,
-                      (const BitSequence *) hash1, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash2,
-                      (const BitSequence *) hash2, dataLen<<3 );
-   init_echo( &ctx.echo, 512 );
-   update_final_echo( &ctx.echo, (BitSequence *)hash3,
-                      (const BitSequence *) hash3, dataLen<<3 );
+   echo_full( &ctx.echo, (BitSequence *)hash0, 512,
+              (const BitSequence *)hash0, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash1, 512,
+              (const BitSequence *)hash1, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash2, 512,
+              (const BitSequence *)hash2, dataLen );
+   echo_full( &ctx.echo, (BitSequence *)hash3, 512,
+              (const BitSequence *)hash3, dataLen );

    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

@@ -934,7 +824,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
    uint32_t *hash7 = &(hash[7<<2]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;
    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
    const uint32_t Htarg = ptarget[7];
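The xevan rewrite above collapses every init/update/close triple into a single
*_full call, so each algorithm in the chain costs one call per vector lane
instead of three. A minimal sketch of what such a wrapper looks like, assuming
the existing init and update-and-final primitives; the name is hypothetical and
the real definitions live in each algorithm's hash source:

   // Sketch of a one-shot wrapper in the style of the *_full calls above.
   // Signature inferred from the call sites in this patch.
   static void groestl512_full_sketch( hashState_groestl *ctx, char *output,
                                       const char *input, int databitlen )
   {
      init_groestl( ctx, 64 );   // 64 byte (512 bit) digest
      update_and_final_groestl( ctx, output, input, databitlen );
   }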
diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c
index 5d912b0..f2ad598 100644
--- a/algo/x22/x22i-4way.c
+++ b/algo/x22/x22i-4way.c
@@ -167,10 +167,10 @@ void x22i_8way_hash( void *output, const void *input )

 #if defined(__VAES__)

-    shavite512_4way_init( &ctx.shavite );
-    shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
-    shavite512_4way_init( &ctx.shavite );
-    shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
+   shavite512_4way_init( &ctx.shavite );
+   shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
+   shavite512_4way_init( &ctx.shavite );
+   shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );

 #else

@@ -214,12 +214,12 @@ void x22i_8way_hash( void *output, const void *input )

 #if defined(__VAES__)

-    echo_4way_init( &ctx.echo, 512 );
-    echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
-    echo_4way_init( &ctx.echo, 512 );
-    echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
+   echo_4way_init( &ctx.echo, 512 );
+   echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
+   echo_4way_init( &ctx.echo, 512 );
+   echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );

-    rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
+   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

 #else

@@ -398,16 +398,16 @@ void x22i_8way_hash( void *output, const void *input )
    memset( hash7, 0, 64 );

    intrlv_2x256( vhash, hashA0, hashA1, 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
    dintrlv_2x256( hash0, hash1, vhash, 256 );
    intrlv_2x256( vhash, hashA2, hashA3, 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
    dintrlv_2x256( hash2, hash3, vhash, 256 );
    intrlv_2x256( vhash, hashA4, hashA5, 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
    dintrlv_2x256( hash4, hash5, vhash, 256 );
    intrlv_2x256( vhash, hashA6, hashA7, 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
    dintrlv_2x256( hash6, hash7, vhash, 256 );

    sph_gost512_init( &ctx.gost );

diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c
index e02fba8..7144288 100644
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -408,16 +408,16 @@ void x25x_8way_hash( void *output, const void *input )
    sph_tiger_close(&ctx.tiger, (void*) hash7[18]);

    intrlv_2x256( vhash, hash0[18], hash1[18], 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
    dintrlv_2x256( hash0[19], hash1[19], vhash, 256 );
    intrlv_2x256( vhash, hash2[18], hash3[18], 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
    dintrlv_2x256( hash2[19], hash3[19], vhash, 256 );
    intrlv_2x256( vhash, hash4[18], hash5[18], 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
    dintrlv_2x256( hash4[19], hash5[19], vhash, 256 );
    intrlv_2x256( vhash, hash6[18], hash7[18], 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
    dintrlv_2x256( hash6[19], hash7[19], vhash, 256 );

    sph_gost512_init(&ctx.gost);

diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c
index fb39d2e..770db33 100644
--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -401,7 +401,7 @@ int scanhash_yescrypt( struct work *work, uint32_t max_nonce,
    do {
       be32enc(&endiandata[19], n);
       yescrypt_hash((char*) endiandata, (char*) vhash, 80);
-      if (vhash[7] < Htarg && fulltest(vhash, ptarget )
+      if (vhash[7] <= Htarg && fulltest(vhash, ptarget )
          && !opt_benchmark )
      {
          pdata[19] = n;

diff --git a/algo/yespower/yespower-blake2b.c b/algo/yespower/yespower-blake2b.c
index 9c58b1c..8bd70fa 100644
--- a/algo/yespower/yespower-blake2b.c
+++ b/algo/yespower/yespower-blake2b.c
@@ -615,7 +615,7 @@ static volatile uint64_t Smask2var = Smask2;
 /* 64-bit without AVX. This relies on out-of-order execution and register
  * renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g.,
  * it runs great on Haswell. */
-#warning "Note: using x86-64 inline assembly for pwxform. That's great."
+//#warning "Note: using x86-64 inline assembly for pwxform. That's great."
 #undef MAYBE_MEMORY_BARRIER
 #define MAYBE_MEMORY_BARRIER \
 	__asm__("" : : : "memory");

diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c
index 7384869..f9ee5d4 100644
--- a/algo/yespower/yespower-gate.c
+++ b/algo/yespower/yespower-gate.c
@@ -55,7 +55,7 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
    do {
       be32enc(&endiandata[19], n);
       yespower_hash((char*) endiandata, (char*) vhash, 80);
-      if ( vhash[7] < Htarg && fulltest( vhash, ptarget )
+      if ( vhash[7] <= Htarg && fulltest( vhash, ptarget )
         && !opt_benchmark )
      {
         pdata[19] = n;

diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c
index b6f76ec..470a4e5 100644
--- a/algo/yespower/yespower-opt.c
+++ b/algo/yespower/yespower-opt.c
@@ -529,7 +529,7 @@ static volatile uint64_t Smask2var = Smask2;
 /* 64-bit without AVX. This relies on out-of-order execution and register
  * renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g.,
  * it runs great on Haswell. */
-#warning "Note: using x86-64 inline assembly for pwxform. That's great."
+//#warning "Note: using x86-64 inline assembly for pwxform. That's great."
 #undef MAYBE_MEMORY_BARRIER
 #define MAYBE_MEMORY_BARRIER \
 	__asm__("" : : : "memory");
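The yescrypt and yespower hunks above fix an off-by-one in the nonce scan: the
share test is hash <= target, so a hash whose most significant 32-bit word
exactly equals Htarg can still be valid and must reach fulltest() rather than
being screened out by <. The screen-then-verify pattern, as a sketch
(fulltest() performs the full 256-bit comparison; the submit path is omitted):

   // vhash[7] is the most significant word of the 256 bit hash.
   // The cheap pre-screen must use <=, otherwise a share whose top word
   // ties the target is thrown away before the full comparison runs.
   if ( vhash[7] <= Htarg && fulltest( vhash, ptarget ) && !opt_benchmark )
   {
      pdata[19] = n;   // record the winning nonce for submission
   }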
diff --git a/build-allarch.sh b/build-allarch.sh
index ea69c63..b6c8df9 100755
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -44,23 +44,23 @@ mv cpuminer.exe cpuminer-aes-sse42.exe
 strip -s cpuminer
 mv cpuminer cpuminer-aes-sse42

-make clean || echo clean
-rm -f config.status
-CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
-make -j 16
-strip -s cpuminer.exe
-mv cpuminer.exe cpuminer-sse42.exe
-strip -s cpuminer
-mv cpuminer cpuminer-sse42
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
+#make -j 16
+#strip -s cpuminer.exe
+#mv cpuminer.exe cpuminer-sse42.exe
+#strip -s cpuminer
+#mv cpuminer cpuminer-sse42

-make clean || echo clean
-rm -f config.status
-CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
-make -j 16
-strip -s cpuminer.exe
-mv cpuminer.exe cpuminer-ssse3.exe
-strip -s cpuminer
-mv cpuminer cpuminer-ssse3
+#make clean || echo clean
+#rm -f config.status
+#CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
+#make -j 16
+#strip -s cpuminer.exe
+#mv cpuminer.exe cpuminer-ssse3.exe
+#strip -s cpuminer
+#mv cpuminer cpuminer-ssse3

 make clean || echo clean
 rm -f config.status

diff --git a/clean-all.sh b/clean-all.sh
index 48a233e..29ca81e 100755
--- a/clean-all.sh
+++ b/clean-all.sh
@@ -3,8 +3,8 @@
 # imake clean and rm all the targetted executables.
 # tips to users.

-rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen > /dev/null
+rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen > /dev/null

-rm cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-aes-avx.exe cpuminer-aes-sse42.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-sse2.exe cpuminer-zen.exe > /dev/null
+rm cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-aes-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe > /dev/null

 make distclean > /dev/null

diff --git a/configure b/configure
index 2f8e071..27d9cea 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.2.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.5.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.11.2'
-PACKAGE_STRING='cpuminer-opt 3.11.2'
+PACKAGE_VERSION='3.11.5'
+PACKAGE_STRING='cpuminer-opt 3.11.5'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.11.2 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.11.5 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.11.2:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.11.5:";;
   esac
   cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 3.11.2
+cpuminer-opt configure 3.11.5
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.11.2, which was
+It was created by cpuminer-opt $as_me 3.11.5, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

   $ $0 $@

@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
  PACKAGE='cpuminer-opt'
- VERSION='3.11.2'
+ VERSION='3.11.5'

 cat >>confdefs.h <<_ACEOF

@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.11.2, which was
+This file was extended by cpuminer-opt $as_me 3.11.5, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

   CONFIG_FILES    = $CONFIG_FILES

@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.11.2
+cpuminer-opt config.status 3.11.5
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"

diff --git a/configure.ac b/configure.ac
index c2741c6..0b82bfd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.11.2])
+AC_INIT([cpuminer-opt], [3.11.5])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM

diff --git a/cpu-miner.c b/cpu-miner.c
index 3b29a12..6501f87 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -157,6 +157,7 @@ bool opt_hash_meter = false;
 uint32_t submitted_share_count= 0;
 uint32_t accepted_share_count = 0;
 uint32_t rejected_share_count = 0;
+uint32_t stale_share_count = 0;
 uint32_t solved_block_count = 0;
 double *thr_hashrates;
 double global_hashrate = 0;
@@ -869,6 +870,7 @@ static struct timeval five_min_start;
 static double latency_sum = 0.;
 static uint64_t submit_sum = 0;
 static uint64_t accept_sum = 0;
+static uint64_t stale_sum = 0;
 static uint64_t reject_sum = 0;
 static double norm_diff_sum = 0.;
 static uint32_t last_block_height = 0;
@@ -883,6 +885,7 @@ static uint32_t hi_temp = 0;

 struct share_stats_t
 {
+   int share_count;
    struct timeval submit_time;
    double net_diff;
    double share_diff;
@@ -892,7 +895,7 @@ struct share_stats_t
 };

 #define s_stats_size 8
-static struct share_stats_t share_stats[ s_stats_size ] = {0};
+static struct share_stats_t share_stats[ s_stats_size ] = {{0}};
 static int s_get_ptr = 0, s_put_ptr = 0;
 static struct timeval last_submit_time = {0};

@@ -921,6 +924,7 @@ void report_summary_log( bool force )
    uint64_t submits = submit_sum;  submit_sum = 0;
    uint64_t accepts = accept_sum;  accept_sum = 0;
    uint64_t rejects = reject_sum;  reject_sum = 0;
+   uint64_t stales = stale_sum;  stale_sum = 0;
//   int latency = latency_sum;  latency_sum = 0;
    memcpy( &start_time, &five_min_start, sizeof start_time );
    memcpy( &five_min_start, &now, sizeof now );
@@ -976,7 +980,11 @@
                 submits, submitted_share_count );
    applog2( LOG_INFO,"Accepted %6d %6d",
                 accepts, accepted_share_count );
-   applog2( LOG_INFO,"Rejected %6d %6d",
+   if ( stale_share_count )
+      applog2( LOG_INFO,"Stale %6d %6d",
+                 stales, stale_share_count );
+   if ( rejected_share_count )
+      applog2( LOG_INFO,"Rejected %6d %6d",
                 rejects, rejected_share_count );
    if ( solved_block_count )
       applog2( LOG_INFO,"Blocks solved %6d",
@@ -1012,8 +1020,10 @@ static int share_result( int result, struct work *null_work,
    struct share_stats_t my_stats = {0};
    struct timeval ack_time, latency_tv, et;
    const char *sres = NULL;
+   char job_id[48];
    bool solved = false;
-
+   bool stale = false;
+   char *acol = NULL, *bcol = NULL, *scol = NULL, *rcol = NULL;

    // Mutex while we grab a snapshot of the stats.
    pthread_mutex_lock( &stats_lock );
@@ -1047,7 +1057,7 @@ static int share_result( int result, struct work *null_work,
                      my_stats.net_diff * 100.;

    // check result
-   if ( result )
+   if ( likely( result ) )
    {
       accepted_share_count++;
       if ( ( my_stats.net_diff > 0. ) && ( my_stats.share_diff >= net_diff ) )
@@ -1057,13 +1067,17 @@ static int share_result( int result, struct work *null_work,
       }
    }
    else
-      rejected_share_count++;
-/*
-   result ? accepted_share_count++ : rejected_share_count++;
-   solved = result && (my_stats.net_diff > 0.0 )
-            && ( my_stats.share_diff >= net_diff );
-   solved_block_count += solved ? 1 : 0 ;
-*/
+   {
+      if ( reason && strstr( reason, "Invalid job id" ) )
+      {
+         stale = true;
+         stale_share_count++;
+      }
+      else
+         rejected_share_count++;
+   }
+
+   // update global counters for summary report

    pthread_mutex_lock( &stats_lock );

@@ -1071,37 +1085,83 @@ static int share_result( int result, struct work *null_work,
       hashrate += thr_hashrates[i];
    global_hashrate = hashrate;

-   if ( result )
+   if ( likely( result ) )
    {
       accept_sum++;
       norm_diff_sum += my_stats.target_diff;
    }
    else
-      reject_sum++;
+   {
+      if ( stale )
+         stale_sum++;
+      else
+         reject_sum++;
+   }
    submit_sum++;
    latency_sum += latency;

    pthread_mutex_unlock( &stats_lock );

-   if ( use_colors )
-      sres = solved ? ( CL_MAG "BLOCK SOLVED" CL_WHT )
-                    : ( result ? ( CL_GRN "Accepted" CL_WHT )
-                               : ( CL_RED "Rejected" CL_WHT ) );
-   else // monochrome
-      sres = solved ? "BLOCK SOLVED" : ( result ? "Accepted" : "Rejected" );
+   sprintf( job_id, "job %s", my_stats.job_id );
+   bcol = acol = scol = rcol = "\0";

-   applog( LOG_NOTICE, "%s, %.3f secs (%dms), A/R/B: %d/%d/%d",
-           sres, share_time, latency, accepted_share_count,
-           rejected_share_count, solved_block_count );
+   if ( use_colors )
+   {
+      if ( likely( result ) )
+      {
+         if ( unlikely( solved ) )
+         {
+            sres = CL_MAG "BLOCK SOLVED" CL_WHT;
+            bcol = CL_MAG;
+         }
+         else
+            sres = CL_GRN "Accepted" CL_WHT;
+            acol = CL_GRN;
+      }
+/*
+      if ( unlikely( solved ) )
+      {
+         sres = CL_MAG "BLOCK SOLVED" CL_WHT;
+         bcol = CL_MAG;
+         acol = CL_GRN;
+      }
+      else if ( likely( result ) )
+      {
+         sres = CL_GRN "Accepted" CL_WHT;
+         acol = CL_GRN;
+      }
+*/
+      else if ( stale )
+      {
+         sres = CL_YL2 "Stale share" CL_WHT;
+         scol = CL_YL2;
+         sprintf( job_id, "%sjob %s%s", CL_YL2, my_stats.job_id, CL_N );
+      }
+      else
+      {
+         sres = CL_RED "Rejected" CL_WHT;
+         rcol = CL_RED;
+      }
+   }
+   else // monochrome
+      sres = solved ? "BLOCK SOLVED"
+                    : ( result ? "Accepted"
+                               : stale ? "Stale share" : "Rejected" );
+
+   applog( LOG_NOTICE, "%d: %s, %.3f secs (%dms), %sA:%d" CL_WHT " %sS:%d" CL_WHT " %sR:%d" CL_WHT " %sB:%d" CL_WHT,
+           my_stats.share_count, sres, share_time, latency, acol,
+           accepted_share_count, scol, stale_share_count, rcol,
+           rejected_share_count, bcol, solved_block_count );

    if ( have_stratum && !opt_quiet )
-      applog2( LOG_INFO, "Share diff %.3g (%5f%%), block %d, job %s",
+      applog2( LOG_INFO, "Share diff %.3g (%5f%%), block %d, %s",
               my_stats.share_diff, share_ratio, stratum.block_height,
-              my_stats.job_id );
+              job_id );

-   if ( reason )
+   if ( unlikely( reason && !result ) )
    {
-      applog( LOG_WARNING, "Reject reason: %s", reason );
+      if ( !opt_quiet && !stale )
+         applog( LOG_WARNING, "Reject reason: %s", reason );

       if ( opt_debug )
       {
@@ -1122,7 +1182,7 @@ static int share_result( int result, struct work *null_work,
          applog2( LOG_INFO, "Target: %s...", str3 );
       }

-      if ( opt_reset_on_stale && strstr( reason, "Invalid job id" ) )
+      if ( unlikely( opt_reset_on_stale && stale ) )
         stratum_need_reset = true;
    }
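share_result() now separates stale shares from hard rejects by matching the
pool's reject reason, so the stale counter and the yellow "Stale share" log
only fire for expired jobs. The decision reduces to a small classifier,
sketched here with a hypothetical enum and helper (the patch itself uses the
stale flag and counters shown above):

   #include <string.h>

   typedef enum { SHARE_ACCEPTED, SHARE_STALE, SHARE_REJECTED } share_class_t;

   // Stratum pools reject stale work with an "Invalid job id" reason
   // string; any other reason counts as a true reject.
   static share_class_t classify_share( int result, const char *reason )
   {
      if ( result )
         return SHARE_ACCEPTED;
      if ( reason && strstr( reason, "Invalid job id" ) )
         return SHARE_STALE;   // job expired before the share arrived
      return SHARE_REJECTED;
   }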
@@ -1760,6 +1820,7 @@ void work_set_target_ratio( struct work* work, uint32_t* hash )
 // it can overflow the queue and overwrite stats for a share.

    pthread_mutex_lock( &stats_lock );

+   share_stats[ s_put_ptr ].share_count = submitted_share_count;
    gettimeofday( &share_stats[ s_put_ptr ].submit_time, NULL );
    share_stats[ s_put_ptr ].share_diff = work->sharediff;
    share_stats[ s_put_ptr ].net_diff = net_diff;
@@ -1780,12 +1841,13 @@ bool submit_solution( struct work *work, void *hash,
       submitted_share_count++;
       work_set_target_ratio( work, hash );
       if ( !opt_quiet )
-         applog( LOG_BLUE, "Share %d submitted by thread %d",
-                 submitted_share_count, thr->id );
+         applog( LOG_NOTICE, "%d: submitted by thread %d, job %s",
+                 submitted_share_count, thr->id, work->job_id );
       return true;
    }
    else
-      applog( LOG_WARNING, "Failed to submit share." );
+      applog( LOG_WARNING, "%d: failed to submit share.",
+              submitted_share_count );
    return false;
 }

@@ -1797,12 +1859,13 @@ bool submit_lane_solution( struct work *work, void *hash,
       submitted_share_count++;
       work_set_target_ratio( work, hash );
       if ( !opt_quiet )
-         applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d",
-                 submitted_share_count, thr->id, lane );
+         applog( LOG_NOTICE, "%d: submitted by thread %d, lane %d, job %s",
+                 submitted_share_count, thr->id, lane, work->job_id );
       return true;
    }
    else
-      applog( LOG_WARNING, "Failed to submit share." );
+      applog( LOG_WARNING, "%d: failed to submit share.",
+              submitted_share_count );
    return false;
 }

@@ -1925,18 +1988,13 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
 {
    uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );

-// the job_id check doesn't work as intended, it's a char pointer!
-// For stratum the pointers can be dereferenced and the strings compared,
-// benchmark not, getwork & gbt unsure.
-//  || ( have_straum && strcmp( work->job_id, g_work->job_id ) ) ) )
-// or
-//  || ( !benchmark && strcmp( work->job_id, g_work->job_id ) ) ) )
-// For now leave it as is, it seems stable.
-// strtoul seems to work.
-   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
-     && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
-         || strtoul( work->job_id, NULL, 16 )
-         != strtoul( g_work->job_id, NULL, 16 ) ) )
+   bool force_new_work = work->job_id ? strtoul( work->job_id, NULL, 16 )
+                                     != strtoul( g_work->job_id, NULL, 16 )
+                                      : true;
+
+   if ( force_new_work || *nonceptr >= *end_nonce_ptr
+      || ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
+         && clean_job ) )
    {
      work_free( work );
     work_copy( work, g_work );
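The std_get_new_work() rewrite above makes a changed job id the primary
trigger for picking up new work. Job ids are hex strings, so strtoul() gives
a cheap numeric comparison and sidesteps the char-pointer pitfalls described
in the deleted comment block; a missing job_id (benchmark, getwork) always
forces a refresh. The test in isolation, as a sketch with a hypothetical
helper name:

   #include <stdbool.h>
   #include <stdlib.h>

   // True when the thread's current work is out of date relative to g_work.
   static bool work_is_stale( const struct work *work,
                              const struct work *g_work )
   {
      if ( !work->job_id )   // benchmark or getwork: always refresh
         return true;
      return strtoul( work->job_id, NULL, 16 )
          != strtoul( g_work->job_id, NULL, 16 );
   }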
@@ -2092,7 +2150,7 @@ static void *miner_thread( void *userdata )
    }

    // wait for stratum to send first job
-   if ( have_stratum ) while ( !stratum.job.job_id ) sleep(1);
+   if ( have_stratum ) while ( !g_work.job_id ) sleep(1);

    while (1)
    {
@@ -2211,13 +2269,15 @@ static void *miner_thread( void *userdata )
          // If unsubmiited nonce(s) found, submit now.
          if ( nonce_found && !opt_benchmark )
          {
+            applog( LOG_WARNING, "BUG: See RELEASE_NOTES for reporting bugs. Algo = %s.",
+                    algo_names[ opt_algo ] );
             if ( !submit_work( mythr, &work ) )
             {
                applog( LOG_WARNING, "Failed to submit share." );
                break;
             }
             if ( !opt_quiet )
-               applog( LOG_BLUE, "Share %d submitted by thread %d.",
+               applog( LOG_NOTICE, "%d: submitted by thread %d.",
                        accepted_share_count + rejected_share_count + 1,
                        mythr->id );

@@ -2764,25 +2824,6 @@ static void *stratum_thread(void *userdata )
 // check if this redundant
          stratum_disconnect( &stratum );
       }
-/*
-      if ( !stratum_socket_full( &stratum, opt_timeout ) )
-      {
-         stratum_errors++;
-         applog(LOG_ERR, "Stratum connection timeout");
-         s = NULL;
-      }
-      else
-         s = stratum_recv_line(&stratum);
-      if ( !s )
-      {
-         stratum_disconnect(&stratum);
-         applog(LOG_WARNING, "Stratum connection interrupted");
-         continue;
-      }
-      if (!stratum_handle_method(&stratum, s))
-         stratum_handle_response(s);
-      free(s);
-*/
    }  // loop
 out:
    return NULL;

diff --git a/miner.h b/miner.h
index 4f60344..c7137ea 100644
--- a/miner.h
+++ b/miner.h
@@ -6,18 +6,6 @@
 #define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION
 #define MAX_CPUS 16

-//#ifndef NO_AES_NI
- #ifndef __AES__
-   #define NO_AES_NI
- #endif
-//#endif
-
-//#if defined(FOUR_WAY) && defined(__AVX2__)
-// keep this until all algos remove reference to HASH_4WAY
-//#if defined(__AVX2__)
-//  #define HASH_4WAY
-//#endif
-
 #ifdef _MSC_VER
 #undef USE_ASM  /* to fix */

diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h
index 64b8d7b..4ad8df4 100644
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -2857,6 +2857,8 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
 //#define mm256_intrlv_blend_64( hi, lo ) _mm256_blend_epi32( hi, lo, 0x33 )
 #define mm256_intrlv_blend_32( hi, lo ) _mm256_blend_epi32( hi, lo, 0x55 )

+// change to _mm256_blend_epi32
+//
 // Select lanes of 32 byte hash from 2 sources according to control mask.
 // macro due to 256 bit value arg.
 #define mm256_blend_hash_4x64( dst, a, b, mask ) \

diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h
index c4839a9..a74cb83 100644
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -129,8 +129,8 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
 }

 // Equivalent of set1, broadcast 64 bit constant to all 64 bit elements.
-#define m512_const1_256( i ) _mm512_broadcast_i64x4( i )
-#define m512_const1_128( i ) _mm512_broadcast_i64x2( i )
+#define m512_const1_256( v ) _mm512_broadcast_i64x4( v )
+#define m512_const1_128( v ) _mm512_broadcast_i64x2( v )
 #define m512_const1_64( i )  _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
 #define m512_const1_32( i )  _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
 #define m512_const1_16( i )  _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
@@ -547,8 +547,6 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 // Rotate elements from 2 512 bit vectors in place, source arguments
 // are overwritten.
-// These can all be done with 2 permutex2var instructions but they are
-// slower than either xor or alignr and require AVX512VBMI.
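The deleted note referred to the classic three-XOR exchange used by the swap
macros that follow: it needs no temporary register and no AVX512VBMI, which is
why it was kept over a permutex2var pair. The same identity in scalar form, as
a sketch:

   // After three XOR steps a holds the old b and b holds the old a,
   // with no temporary register.
   uint64_t a = 0x1111, b = 0x2222;
   a ^= b;   // a = a0 ^ b0
   b ^= a;   // b = b0 ^ (a0 ^ b0) = a0
   a ^= b;   // a = (a0 ^ b0) ^ a0 = b0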
 #define mm512_swap1024_512(v1, v2) \
    v1 = _mm512_xor_si512(v1, v2); \

diff --git a/sysinfos.c b/sysinfos.c
index 704f25c..1bc633c 100644
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -41,7 +41,7 @@
    "/sys/devices/platform/coretemp.0/hwmon/hwmon1/temp1_input"

 #define HWMON_PATH3 \
-   "/sys/class/hwmon/hwmon0/temp1_input"
+   "/sys/devices/platform/coretemp.0/hwmon/hwmon2/temp1_input"

 #define HWMON_PATH \
    "/sys/class/hwmon/hwmon2/temp1_input"

@@ -84,21 +84,6 @@ static inline float linux_cputemp(int core)
    if (!fd)
       fd = fopen(HWMON_PATH, "r");

-   if (!fd)
-//      fd = fopen(HWMON_ALT1, "r");
-
-// if (!fd)
-      fd = fopen(HWMON_ALT2, "r");
-
-   if (!fd)
-      fd = fopen(HWMON_ALT3, "r");
-
-   if (!fd)
-      fd = fopen(HWMON_ALT4, "r");
-
-   if (!fd)
-      fd = fopen(HWMON_ALT5, "r");
-
    if (!fd)
       return tc;
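The sysinfos.c change above drops the chain of HWMON_ALT fallbacks and points
the remaining paths at the coretemp platform node; generic
/sys/class/hwmon/hwmonN numbering varies between kernels and drivers, while
the coretemp path is stable. Reading such a node reduces to parsing one
integer in millidegrees Celsius, sketched here with a hypothetical helper,
not the patch's linux_cputemp():

   #include <stdio.h>

   // Read a hwmon temperature node; the file holds millidegrees Celsius.
   // Returns degrees C, or 0.0 if the node is missing or unreadable.
   static float read_hwmon_temp( const char *path )
   {
      float tc = 0.0;
      int milli = 0;
      FILE *fd = fopen( path, "r" );
      if ( !fd )
         return tc;
      if ( fscanf( fd, "%d", &milli ) == 1 )
         tc = (float)milli / 1000.0;
      fclose( fd );
      return tc;
   }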