diff --git a/INSTALL_LINUX b/INSTALL_LINUX index 24927b4..5bfef57 100644 --- a/INSTALL_LINUX +++ b/INSTALL_LINUX @@ -1,4 +1,6 @@ +These instructions may be out of date, see the Wiki for the latest... +https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source 1. Requirements: --------------- diff --git a/INSTALL_WINDOWS b/INSTALL_WINDOWS index b61f091..4a3b908 100644 --- a/INSTALL_WINDOWS +++ b/INSTALL_WINDOWS @@ -1,6 +1,6 @@ Instructions for compiling cpuminer-opt for Windows. -Thwaw intructions nay be out of date. Please consult the wiki for +These intructions are out of date. Please consult the wiki for the latest: https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source diff --git a/README.md b/README.md index df33123..c4532bd 100644 --- a/README.md +++ b/README.md @@ -74,53 +74,50 @@ Supported Algorithms argon2d250 argon2d-crds, Credits (CRDS) argon2d500 argon2d-dyn, Dynamic (DYN) argon2d4096 argon2d-uis, Unitus, (UIS) - axiom Shabal-256 MemoHash - blake Blake-256 (SFR) - blake2b Blake2b 256 - blake2s Blake-2 S + blake Blake-256 + blake2b Blake2-512 + blake2s Blake2-256 blakecoin blake256r8 bmw BMW 256 bmw512 BMW 512 - c11 Chaincoin + c11 decred deep Deepcoin (DCN) dmd-gr Diamond-Groestl groestl Groestl coin hex x16r-hex - hmq1725 Espers + hmq1725 hodl Hodlcoin jha Jackpotcoin keccak Maxcoin keccakc Creative coin lbry LBC, LBRY Credits - luffa Luffa - lyra2h Hppcoin + lyra2h lyra2re lyra2 lyra2rev2 lyra2v2 lyra2rev3 lyrav2v3 lyra2z - lyra2z330 Lyra2 330 rows, Zoin (ZOI) - m7m Magi (XMG) - minotaur Ringcoin (RNG) + lyra2z330 + m7m + minotaur + minotaurx myr-gr Myriad-Groestl neoscrypt NeoScrypt(128, 2, 1) nist5 Nist5 pentablake Pentablake phi1612 phi - phi2 Luxcoin (LUX) - phi2-lux identical to phi2 - pluck Pluck:128 (Supcoin) + phi2 polytimos Ninja power2b MicroBitcoin (MBC) quark Quark qubit Qubit scrypt scrypt(1024, 1, 1) (default) scrypt:N scrypt(N, 1, 1) + scryptn2 scrypt(1048576, 1, 1) sha256d Double SHA-256 - sha256q Quad SHA-256, Pyrite (PYE) - sha256t Triple SHA-256, Onecoin (OC) + sha256q Quad SHA-256 + sha256t Triple SHA-256 sha3d Double keccak256 (BSHA3) - shavite3 Shavite3 skein Skein+Sha (Skeincoin) skein2 Double Skein (Woodcoin) skunk Signatum (SIGT) @@ -136,17 +133,17 @@ Supported Algorithms x11 Dash x11evo Revolvercoin x11gost sib (SibCoin) - x12 Galaxie Cash (GCH) - x13 X13 + x12 + x13 x13bcd bcd x13sm3 hsr (Hshare) - x14 X14 - x15 X15 + x14 + x15 x16r x16rv2 - x16rt Gincoin (GIN) - x16rt-veil Veil (VEIL) - x16s Pigeoncoin (PGN) + x16rt + x16rt-veil veil + x16s x17 x21s x22i diff --git a/README.txt b/README.txt index 3698a64..5527f40 100644 --- a/README.txt +++ b/README.txt @@ -73,7 +73,6 @@ third party packages. They often will work and may be used instead of the included version of the files. - If you like this software feel free to donate: BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 58971b5..fa8da1b 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,12 @@ If not what makes it happen or not happen? Change Log ---------- +v3.21.0 + +Added minotaurx algo for stratum only. +Blake256 & sha256 prehash optimised to ignore zero-padded data for AVX2 & AVX512. +Other small improvements. + v3.20.3 Faster c11 algo: AVX512 6%, AVX2 4%, AVX2+VAES 15%. @@ -98,12 +104,9 @@ v3.19.8 #370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid url protocol specifier for requesting a secure stratum connection. 
- The full url, including the protocol, is now displayed in the stratum connect log and the periodic summary log. - Small optimizations to Cubehash, AVX2 & AVX512. - Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512. v3.19.7 diff --git a/algo-gate-api.c b/algo-gate-api.c index 9591c09..73ec286 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -327,6 +327,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_LYRA2Z330: rc = register_lyra2z330_algo ( gate ); break; case ALGO_M7M: rc = register_m7m_algo ( gate ); break; case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break; + case ALGO_MINOTAURX: rc = register_minotaur_algo ( gate ); break; case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break; case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break; case ALGO_NIST5: rc = register_nist5_algo ( gate ); break; diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index 8ad1119..0801c9b 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -115,7 +115,7 @@ void blake256_8way_close(void *cc, void *dst); void blake256_8way_update_le(void *cc, const void *data, size_t len); void blake256_8way_close_le(void *cc, void *dst); void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, - const void *data ); + void *data ); void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, const void *midhash, const void *data ); @@ -178,7 +178,7 @@ void blake256_16way_close(void *cc, void *dst); void blake256_16way_update_le(void *cc, const void *data, size_t len); void blake256_16way_close_le(void *cc, void *dst); void blake256_16way_round0_prehash_le( void *midstate, const void *midhash, - const void *data ); + void *data ); void blake256_16way_final_rounds_le( void *final_hash, const void *midstate, const void *midhash, const void *data ); diff --git a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c index c352bef..722b58f 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -668,6 +668,258 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ } +// Short cut message expansion when the message data is known to be zero. +// M[ 5:12, 14 ] are zero padded for the second block of 80 byte data. + +#define G256_8WAY_ALT( a, b, c, d, m0, m1 ) \ +{ \ + a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m0 ); \ + d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \ + c = _mm256_add_epi32( c, d ); \ + b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \ + a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m1 ); \ + d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \ + c = _mm256_add_epi32( c, d ); \ + b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \ +} + +// Message expansion optimized for each round. 
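For reference, a minimal scalar sketch of the G step these _ALT macros implement (plain uint32_t and hypothetical helper names, not the AVX2 code itself): each half of G folds a "message word XOR round constant" term into the state, so wherever the message word is known to be zero that term collapses to the bare constant. That is why G256_8WAY_ALT takes two already-mixed operands, and why the per-round macros below pass either _mm256_xor_si256( Mi, CSj ) or just _mm256_set1_epi32( CSj ).

#include <stdint.h>

static inline uint32_t ror32( uint32_t x, int n )
{  return ( x >> n ) | ( x << ( 32 - n ) );  }

/* One BLAKE-256 column/diagonal step.  mx0 and mx1 are the two
   "message ^ constant" terms selected by the sigma permutation. */
static inline void g256( uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                         uint32_t mx0, uint32_t mx1 )
{
   *a += *b + mx0;   *d = ror32( *d ^ *a, 16 );
   *c += *d;         *b = ror32( *b ^ *c, 12 );
   *a += *b + mx1;   *d = ror32( *d ^ *a,  8 );
   *c += *d;         *b = ror32( *b ^ *c,  7 );
}

/* With M5 known to be zero, the second operand of the third G call in
   round 0 is just CS4, matching
   G256_8WAY_ALT( V2, V6, VA, VE, M4 ^ CS5, CS4 ) below:
      g256( &v2, &v6, &va, &ve, m4 ^ CS5, 0 ^ CS4 );   // 0 ^ CS4 == CS4
*/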
+#define ROUND256_8WAY_0 \ +{ \ + G256_8WAY_ALT( V0, V4, V8, VC, \ + _mm256_xor_si256( M0, _mm256_set1_epi32( CS1 ) ), \ + _mm256_xor_si256( M1, _mm256_set1_epi32( CS0 ) ) ); \ + G256_8WAY_ALT( V1, V5, V9, VD, \ + _mm256_xor_si256( M2, _mm256_set1_epi32( CS3 ) ), \ + _mm256_xor_si256( M3, _mm256_set1_epi32( CS2 ) ) ); \ + G256_8WAY_ALT( V2, V6, VA, VE, \ + _mm256_xor_si256( M4, _mm256_set1_epi32( CS5 ) ), \ + _mm256_set1_epi32( CS4 ) ); \ + G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CS7 ) , \ + _mm256_set1_epi32( CS6 ) ); \ + G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS9 ) , \ + _mm256_set1_epi32( CS8 ) ); \ + G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSB ) , \ + _mm256_set1_epi32( CSA ) ); \ + G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CSD ) , \ + _mm256_xor_si256( MD, _mm256_set1_epi32( CSC ) ) ); \ + G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CSF ) , \ + _mm256_xor_si256( MF, _mm256_set1_epi32( CSE ) ) ); \ +} + +#define ROUND256_8WAY_1 \ +{ \ + G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CSA ) , \ + _mm256_set1_epi32( CSE ) ); \ + G256_8WAY_ALT( V1, V5, V9, VD, \ + _mm256_xor_si256( M4, _mm256_set1_epi32( CS8 ) ), \ + _mm256_set1_epi32( CS4 ) ); \ + G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CSF ) , \ + _mm256_xor_si256( MF, _mm256_set1_epi32( CS9 ) ) ); \ + G256_8WAY_ALT( V3, V7, VB, VF, \ + _mm256_xor_si256( MD, _mm256_set1_epi32( CS6 ) ), \ + _mm256_set1_epi32( CSD ) ); \ + G256_8WAY_ALT( V0, V5, VA, VF, \ + _mm256_xor_si256( M1, _mm256_set1_epi32( CSC ) ), \ + _mm256_set1_epi32( CS1 ) ); \ + G256_8WAY_ALT( V1, V6, VB, VC, \ + _mm256_xor_si256( M0, _mm256_set1_epi32( CS2 ) ), \ + _mm256_xor_si256( M2, _mm256_set1_epi32( CS0 ) ) ); \ + G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS7 ) , \ + _mm256_set1_epi32( CSB ) ); \ + G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS3 ) , \ + _mm256_xor_si256( M3, _mm256_set1_epi32( CS5 ) ) ); \ +} + +#define ROUND256_8WAY_2 \ +{ \ + G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS8 ) , \ + _mm256_set1_epi32( CSB ) ); \ + G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS0 ) , \ + _mm256_xor_si256( M0, _mm256_set1_epi32( CSC ) ) ); \ + G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS2 ) , \ + _mm256_xor_si256( M2, _mm256_set1_epi32( CS5 ) ) ); \ + G256_8WAY_ALT( V3, V7, VB, VF, \ + _mm256_xor_si256( MF, _mm256_set1_epi32( CSD ) ), \ + _mm256_xor_si256( MD, _mm256_set1_epi32( CSF ) ) ); \ + G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CSE ) , \ + _mm256_set1_epi32( CSA ) ); \ + G256_8WAY_ALT( V1, V6, VB, VC, \ + _mm256_xor_si256( M3, _mm256_set1_epi32( CS6 ) ), \ + _mm256_set1_epi32( CS3 ) ); \ + G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS1 ) , \ + _mm256_xor_si256( M1, _mm256_set1_epi32( CS7 ) ) ); \ + G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS4 ) , \ + _mm256_xor_si256( M4, _mm256_set1_epi32( CS9 ) ) ); \ +} + +#define ROUND256_8WAY_3 \ +{ \ + G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS9 ) , \ + _mm256_set1_epi32( CS7 ) ); \ + G256_8WAY_ALT( V1, V5, V9, VD, \ + _mm256_xor_si256( M3, _mm256_set1_epi32( CS1 ) ), \ + _mm256_xor_si256( M1, _mm256_set1_epi32( CS3 ) ) ); \ + G256_8WAY_ALT( V2, V6, VA, VE, \ + _mm256_xor_si256( MD, _mm256_set1_epi32( CSC ) ), \ + _mm256_set1_epi32( CSD ) ); \ + G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CSE ) , \ + _mm256_set1_epi32( CSB ) ); \ + G256_8WAY_ALT( V0, V5, VA, VF, \ + _mm256_xor_si256( M2, _mm256_set1_epi32( CS6 ) ), \ + _mm256_set1_epi32( CS2 ) ); \ + G256_8WAY_ALT( V1, V6, VB, VC, 
_mm256_set1_epi32( CSA ) , \ + _mm256_set1_epi32( CS5 ) ); \ + G256_8WAY_ALT( V2, V7, V8, VD, \ + _mm256_xor_si256( M4, _mm256_set1_epi32( CS0 ) ), \ + _mm256_xor_si256( M0, _mm256_set1_epi32( CS4 ) ) ); \ + G256_8WAY_ALT( V3, V4, V9, VE, \ + _mm256_xor_si256( MF, _mm256_set1_epi32( CS8 ) ), \ + _mm256_set1_epi32( CSF ) ); \ +} + +#define ROUND256_8WAY_4 \ +{ \ + G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS0 ) , \ + _mm256_xor_si256( M0, _mm256_set1_epi32( CS9 ) ) ); \ + G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS7 ) , \ + _mm256_set1_epi32( CS5 ) ); \ + G256_8WAY_ALT( V2, V6, VA, VE, \ + _mm256_xor_si256( M2, _mm256_set1_epi32( CS4 ) ), \ + _mm256_xor_si256( M4, _mm256_set1_epi32( CS2 ) ) ); \ + G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CSF ) , \ + _mm256_xor_si256( MF, _mm256_set1_epi32( CSA ) ) ); \ + G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS1 ) , \ + _mm256_xor_si256( M1, _mm256_set1_epi32( CSE ) ) ); \ + G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSC ) , \ + _mm256_set1_epi32( CSB ) ); \ + G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS8 ) , \ + _mm256_set1_epi32( CS6 ) ); \ + G256_8WAY_ALT( V3, V4, V9, VE, \ + _mm256_xor_si256( M3, _mm256_set1_epi32( CSD ) ), \ + _mm256_xor_si256( MD, _mm256_set1_epi32( CS3 ) ) ); \ +} + +#define ROUND256_8WAY_5 \ +{ \ + G256_8WAY_ALT( V0, V4, V8, VC, \ + _mm256_xor_si256( M2, _mm256_set1_epi32( CSC ) ), \ + _mm256_set1_epi32( CS2 ) ); \ + G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CSA ) , \ + _mm256_set1_epi32( CS6 ) ); \ + G256_8WAY_ALT( V2, V6, VA, VE, \ + _mm256_xor_si256( M0, _mm256_set1_epi32( CSB ) ), \ + _mm256_set1_epi32( CS0 ) ); \ + G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CS3 ) , \ + _mm256_xor_si256( M3, _mm256_set1_epi32( CS8 ) ) ); \ + G256_8WAY_ALT( V0, V5, VA, VF, \ + _mm256_xor_si256( M4, _mm256_set1_epi32( CSD ) ), \ + _mm256_xor_si256( MD, _mm256_set1_epi32( CS4 ) ) ); \ + G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CS5 ) , \ + _mm256_set1_epi32( CS7 ) ); \ + G256_8WAY_ALT( V2, V7, V8, VD, \ + _mm256_xor_si256( MF, _mm256_set1_epi32( CSE ) ), \ + _mm256_set1_epi32( CSF ) ); \ + G256_8WAY_ALT( V3, V4, V9, VE, \ + _mm256_xor_si256( M1, _mm256_set1_epi32( CS9 ) ), \ + _mm256_set1_epi32( CS1 ) ); \ +} + +#define ROUND256_8WAY_6 \ +{ \ + G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS5 ) , \ + _mm256_set1_epi32( CSC ) ); \ + G256_8WAY_ALT( V1, V5, V9, VD, \ + _mm256_xor_si256( M1, _mm256_set1_epi32( CSF ) ), \ + _mm256_xor_si256( MF, _mm256_set1_epi32( CS1 ) ) ); \ + G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CSD ) , \ + _mm256_xor_si256( MD, _mm256_set1_epi32( CSE ) ) );\ + G256_8WAY_ALT( V3, V7, VB, VF, \ + _mm256_xor_si256( M4, _mm256_set1_epi32( CSA ) ), \ + _mm256_set1_epi32( CS4 ) ); \ + G256_8WAY_ALT( V0, V5, VA, VF, \ + _mm256_xor_si256( M0, _mm256_set1_epi32( CS7 ) ), \ + _mm256_set1_epi32( CS0 ) ); \ + G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CS3 ) , \ + _mm256_xor_si256( M3, _mm256_set1_epi32( CS6 ) ) ); \ + G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS2 ) , \ + _mm256_xor_si256( M2, _mm256_set1_epi32( CS9 ) ) ); \ + G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CSB ) , \ + _mm256_set1_epi32( CS8 ) ); \ +} + +#define ROUND256_8WAY_7 \ +{ \ + G256_8WAY_ALT( V0, V4, V8, VC, \ + _mm256_xor_si256( MD, _mm256_set1_epi32( CSB ) ), \ + _mm256_set1_epi32( CSD ) ); \ + G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CSE ) , \ + _mm256_set1_epi32( CS7 ) ); \ + G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS1 ) , \ + 
_mm256_xor_si256( M1, _mm256_set1_epi32( CSC ) ) ); \ + G256_8WAY_ALT( V3, V7, VB, VF, \ + _mm256_xor_si256( M3, _mm256_set1_epi32( CS9 ) ), \ + _mm256_set1_epi32( CS3 ) ); \ + G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS0 ) , \ + _mm256_xor_si256( M0, _mm256_set1_epi32( CS5 ) ) ); \ + G256_8WAY_ALT( V1, V6, VB, VC, \ + _mm256_xor_si256( MF, _mm256_set1_epi32( CS4 ) ), \ + _mm256_xor_si256( M4, _mm256_set1_epi32( CSF ) ) ); \ + G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS6 ) , \ + _mm256_set1_epi32( CS8 ) ); \ + G256_8WAY_ALT( V3, V4, V9, VE, \ + _mm256_xor_si256( M2, _mm256_set1_epi32( CSA ) ), \ + _mm256_set1_epi32( CS2 ) ); \ +} + +#define ROUND256_8WAY_8 \ +{ \ + G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CSF ), \ + _mm256_xor_si256( MF, _mm256_set1_epi32( CS6 ) ) ); \ + G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS9 ) , \ + _mm256_set1_epi32( CSE ) ); \ + G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS3 ) , \ + _mm256_xor_si256( M3, _mm256_set1_epi32( CSB ) ) ); \ + G256_8WAY_ALT( V3, V7, VB, VF, \ + _mm256_xor_si256( M0, _mm256_set1_epi32( CS8 ) ), \ + _mm256_set1_epi32( CS0 ) ); \ + G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS2 ) , \ + _mm256_xor_si256( M2, _mm256_set1_epi32( CSC ) ) ); \ + G256_8WAY_ALT( V1, V6, VB, VC, \ + _mm256_xor_si256( MD, _mm256_set1_epi32( CS7 ) ), \ + _mm256_set1_epi32( CSD ) ); \ + G256_8WAY_ALT( V2, V7, V8, VD, \ + _mm256_xor_si256( M1, _mm256_set1_epi32( CS4 ) ), \ + _mm256_xor_si256( M4, _mm256_set1_epi32( CS1 ) ) ); \ + G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS5 ) , \ + _mm256_set1_epi32( CSA ) ); \ +} + +#define ROUND256_8WAY_9 \ +{ \ + G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS2 ) , \ + _mm256_xor_si256( M2, _mm256_set1_epi32( CSA ) ) ); \ + G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS4 ) , \ + _mm256_xor_si256( M4, _mm256_set1_epi32( CS8 ) ) ); \ + G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS6 ) , \ + _mm256_set1_epi32( CS7 ) ); \ + G256_8WAY_ALT( V3, V7, VB, VF, \ + _mm256_xor_si256( M1, _mm256_set1_epi32( CS5 ) ), \ + _mm256_set1_epi32( CS1 ) ); \ + G256_8WAY_ALT( V0, V5, VA, VF, \ + _mm256_xor_si256( MF, _mm256_set1_epi32( CSB ) ), \ + _mm256_set1_epi32( CSF ) ); \ + G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSE ) , \ + _mm256_set1_epi32( CS9 ) ); \ + G256_8WAY_ALT( V2, V7, V8, VD, \ + _mm256_xor_si256( M3, _mm256_set1_epi32( CSC ) ), \ + _mm256_set1_epi32( CS3 ) ); \ + G256_8WAY_ALT( V3, V4, V9, VE, \ + _mm256_xor_si256( MD, _mm256_set1_epi32( CS0 ) ), \ + _mm256_xor_si256( M0, _mm256_set1_epi32( CSD ) ) ); \ +} + + #define DECL_STATE32_8WAY \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ sph_u32 T0, T1; @@ -834,9 +1086,9 @@ do { \ } void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, - const void *data ) + void *data ) { - const __m256i *M = (const __m256i*)data; + __m256i *M = (__m256i*)data; __m256i *V = (__m256i*)midstate; const __m256i *H = (const __m256i*)midhash; @@ -857,6 +1109,17 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, V[14] = m256_const1_32( CS6 ); V[15] = m256_const1_32( CS7 ); +// M[ 0:3 ] contain new message data including unique nonces in M[ 3]. +// M[ 5:12, 14 ] are always zero and not needed or used. +// M[ 4], M[ 13], M[15] are constant and are initialized here. +// M[ 5] is a special case, used as a cache for (M[13] ^ CSC). 
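The layout being described is the second 64-byte block of the 80-byte work header after little-endian interleaving. A plain one-lane reference (hypothetical hdr/nonce names; the vectored code broadcasts each word across all lanes) shows why only M[ 0:3 ] ever change:

#include <stdint.h>

/* One scalar lane: only words 0-3 carry live data (word 3 is the nonce);
   everything else is fixed padding/length.  This is why M[ 4], M[13] and
   M[15] are set once here and M[ 5:12], M[14] are never loaded at all. */
static void build_second_block( uint32_t block2[16],
                                const uint32_t hdr[20], uint32_t nonce )
{
   block2[ 0] = hdr[16];        /* M[ 0..2]  last header words       */
   block2[ 1] = hdr[17];
   block2[ 2] = hdr[18];
   block2[ 3] = nonce;          /* M[ 3]     nonce                   */
   block2[ 4] = 0x80000000;     /* M[ 4]     padding                 */
   for ( int i = 5; i <= 12; i++ ) block2[i] = 0;   /* M[ 5..12] zero */
   block2[13] = 1;              /* M[13]     padding                 */
   block2[14] = 0;              /* M[14]     zero                    */
   block2[15] = 80 * 8;         /* M[15]     length in bits (640)    */
}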
+ + M[ 4] = m256_const1_32( 0x80000000 ); + M[13] = m256_one_32; + M[15] = m256_const1_32( 80*8 ); + + M[ 5] =_mm256_xor_si256( M[13], _mm256_set1_epi32( CSC ) ); + // G0 GS_8WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] ); @@ -868,21 +1131,45 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 ); V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] ); - // G2,G3 - GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] ); - GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] ); + // G2 + // GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] ); + V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ), + _mm256_xor_si256( _mm256_set1_epi32( CS5 ), M[ 4] ) ); + V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 2] ) ); + V[10] = _mm256_add_epi32( V[10], V[14] ); + V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 12 ); + V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ), + _mm256_set1_epi32( CS4 ) ); + V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 2] ), 8 ); + V[10] = _mm256_add_epi32( V[10], V[14] ); + V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 7 ); + + // G3 + // GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] ); + V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ), + _mm256_set1_epi32( CS7 ) ); + V[15] = mm256_swap32_16( _mm256_xor_si256( V[15], V[ 3] ) ); + V[11] = _mm256_add_epi32( V[11], V[15] ); + V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 12 ); + V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ), + _mm256_set1_epi32( CS6 ) ); + V[15] = mm256_ror_32( _mm256_xor_si256( V[15], V[ 3] ), 8 ); + V[11] = _mm256_add_epi32( V[11], V[15] ); + V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 7 ); // G4 - V[ 0] = _mm256_add_epi32( V[ 0], - _mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) ); + V[ 0] = _mm256_add_epi32( V[ 0], _mm256_set1_epi32( CS9 ) ); + + // G5 + // GS_8WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC ); // G6 V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 7] ), - _mm256_xor_si256( _mm256_set1_epi32( CSD ), M[12] ) ); + _mm256_set1_epi32( CSD ) ); // G7 V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ), - _mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) ); + _mm256_set1_epi32( CSF ) ); V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) ); V[ 3] = _mm256_add_epi32( V[ 3], _mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) ); @@ -893,47 +1180,40 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, { __m256i *H = (__m256i*)final_hash; const __m256i *h = (const __m256i*)midhash; - const __m256i *v= (const __m256i*)midstate; __m256i V0, V1, V2, V3, V4, V5, V6, V7; __m256i V8, V9, VA, VB, VC, VD, VE, VF; - __m256i M0, M1, M2, M3, M4, M5, M6, M7; - __m256i M8, M9, MA, MB, MC, MD, ME, MF; + __m256i M0, M1, M2, M3, M4, MD, MF; + __m256i MDxorCSC; - V0 = v[ 0]; - V1 = v[ 1]; - V2 = v[ 2]; - V3 = v[ 3]; - V4 = v[ 4]; - V5 = v[ 5]; - V6 = v[ 6]; - V7 = v[ 7]; - V8 = v[ 8]; - V9 = v[ 9]; - VA = v[10]; - VB = v[11]; - VC = v[12]; - VD = v[13]; - VE = v[14]; - VF = v[15]; + V0 = _mm256_load_si256( (__m256i*)midstate + 0 ); + V1 = _mm256_load_si256( (__m256i*)midstate + 1 ); + V2 = _mm256_load_si256( (__m256i*)midstate + 2 ); + V3 = _mm256_load_si256( (__m256i*)midstate + 3 ); + V4 = _mm256_load_si256( (__m256i*)midstate + 4 ); + V5 = _mm256_load_si256( (__m256i*)midstate + 5 ); + V6 = _mm256_load_si256( (__m256i*)midstate + 6 ); + V7 = _mm256_load_si256( 
(__m256i*)midstate + 7 ); + V8 = _mm256_load_si256( (__m256i*)midstate + 8 ); + V9 = _mm256_load_si256( (__m256i*)midstate + 9 ); + VA = _mm256_load_si256( (__m256i*)midstate + 10 ); + VB = _mm256_load_si256( (__m256i*)midstate + 11 ); + VC = _mm256_load_si256( (__m256i*)midstate + 12 ); + VD = _mm256_load_si256( (__m256i*)midstate + 13 ); + VE = _mm256_load_si256( (__m256i*)midstate + 14 ); + VF = _mm256_load_si256( (__m256i*)midstate + 15 ); - M0 = casti_m256i( data, 0 ); - M1 = casti_m256i( data, 1 ); - M2 = casti_m256i( data, 2 ); - M3 = casti_m256i( data, 3 ); - M4 = casti_m256i( data, 4 ); - M5 = casti_m256i( data, 5 ); - M6 = casti_m256i( data, 6 ); - M7 = casti_m256i( data, 7 ); - M8 = casti_m256i( data, 8 ); - M9 = casti_m256i( data, 9 ); - MA = casti_m256i( data, 10 ); - MB = casti_m256i( data, 11 ); - MC = casti_m256i( data, 12 ); - MD = casti_m256i( data, 13 ); - ME = casti_m256i( data, 14 ); - MF = casti_m256i( data, 15 ); - - // Finish round 0 + M0 = _mm256_load_si256( (__m256i*)data + 0 ); + M1 = _mm256_load_si256( (__m256i*)data + 1 ); + M2 = _mm256_load_si256( (__m256i*)data + 2 ); + M3 = _mm256_load_si256( (__m256i*)data + 3 ); + M4 = _mm256_load_si256( (__m256i*)data + 4 ); + // M5 to MC & ME zero padding & optimised out. + MD = _mm256_load_si256( (__m256i*)data + 13 ); + MF = _mm256_load_si256( (__m256i*)data + 15 ); + // precalculated MD^CSC, used in round0 G6. + MDxorCSC = _mm256_load_si256( (__m256i*)data + 5 ); + + // Finish round 0 with nonce in M3 // G1 V1 = _mm256_add_epi32( V1, _mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) ); @@ -947,20 +1227,29 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, VA = _mm256_add_epi32( VA, VF ); V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 ); V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5, - _mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) ); + _mm256_set1_epi32( CS8 ) ) ); VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) ); VA = _mm256_add_epi32( VA, VF ); V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 ); // G5 - GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC ); + // GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC ); + V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ), + _mm256_set1_epi32( CSB ) ); + VC = mm256_swap32_16( _mm256_xor_si256( VC, V1 ) ); + VB = _mm256_add_epi32( VB, VC ); + V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 12 ); + V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ), + _mm256_set1_epi32( CSA ) ); + VC = mm256_ror_32( _mm256_xor_si256( VC, V1 ), 8 ); + VB = _mm256_add_epi32( VB, VC ); + V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 7 ); // G6 VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) ); V8 = _mm256_add_epi32( V8, VD ); V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 ); - V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ), - _mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) ); + V2 = _mm256_add_epi32( V2, _mm256_add_epi32( V7, MDxorCSC ) ); VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) ); V8 = _mm256_add_epi32( V8, VD ); V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 ); @@ -974,19 +1263,19 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 ); // Remaining rounds - ROUND_S_8WAY( 1 ); - ROUND_S_8WAY( 2 ); - ROUND_S_8WAY( 3 ); - ROUND_S_8WAY( 4 ); - ROUND_S_8WAY( 5 ); - ROUND_S_8WAY( 6 ); - ROUND_S_8WAY( 7 ); - ROUND_S_8WAY( 8 ); - ROUND_S_8WAY( 9 ); - ROUND_S_8WAY( 0 ); - ROUND_S_8WAY( 1 ); - ROUND_S_8WAY( 2 ); - ROUND_S_8WAY( 3 ); + ROUND256_8WAY_1; + ROUND256_8WAY_2; + ROUND256_8WAY_3; 
+ ROUND256_8WAY_4; + ROUND256_8WAY_5; + ROUND256_8WAY_6; + ROUND256_8WAY_7; + ROUND256_8WAY_8; + ROUND256_8WAY_9; + ROUND256_8WAY_0; + ROUND256_8WAY_1; + ROUND256_8WAY_2; + ROUND256_8WAY_3; const __m256i shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, @@ -1010,6 +1299,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, // // Blake-256 16 way AVX512 +// Generic with full inline message expansion #define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \ { \ a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \ @@ -1036,6 +1326,257 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ } +// Short cut message expansion when the message data is known to be zero. +// M[ 5:12, 14 ] are zero padded for the second block of 80 byte data. + +#define G256_16WAY_ALT( a, b, c, d, m0, m1 ) \ +{ \ + a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m0 ); \ + d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \ + c = _mm512_add_epi32( c, d ); \ + b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \ + a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m1 ); \ + d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \ + c = _mm512_add_epi32( c, d ); \ + b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \ +} + +// Message expansion optimized for each round. +#define ROUND256_16WAY_0 \ +{ \ + G256_16WAY_ALT( V0, V4, V8, VC, \ + _mm512_xor_si512( M0, _mm512_set1_epi32( CS1 ) ), \ + _mm512_xor_si512( M1, _mm512_set1_epi32( CS0 ) ) ); \ + G256_16WAY_ALT( V1, V5, V9, VD, \ + _mm512_xor_si512( M2, _mm512_set1_epi32( CS3 ) ), \ + _mm512_xor_si512( M3, _mm512_set1_epi32( CS2 ) ) ); \ + G256_16WAY_ALT( V2, V6, VA, VE, \ + _mm512_xor_si512( M4, _mm512_set1_epi32( CS5 ) ), \ + _mm512_set1_epi32( CS4 ) ); \ + G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CS7 ) , \ + _mm512_set1_epi32( CS6 ) ); \ + G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS9 ) , \ + _mm512_set1_epi32( CS8 ) ); \ + G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSB ) , \ + _mm512_set1_epi32( CSA ) ); \ + G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CSD ) , \ + _mm512_xor_si512( MD, _mm512_set1_epi32( CSC ) ) ); \ + G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CSF ) , \ + _mm512_xor_si512( MF, _mm512_set1_epi32( CSE ) ) ); \ +} + +#define ROUND256_16WAY_1 \ +{ \ + G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CSA ) , \ + _mm512_set1_epi32( CSE ) ); \ + G256_16WAY_ALT( V1, V5, V9, VD, \ + _mm512_xor_si512( M4, _mm512_set1_epi32( CS8 ) ), \ + _mm512_set1_epi32( CS4 ) ); \ + G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CSF ) , \ + _mm512_xor_si512( MF, _mm512_set1_epi32( CS9 ) ) ); \ + G256_16WAY_ALT( V3, V7, VB, VF, \ + _mm512_xor_si512( MD, _mm512_set1_epi32( CS6 ) ), \ + _mm512_set1_epi32( CSD ) ); \ + G256_16WAY_ALT( V0, V5, VA, VF, \ + _mm512_xor_si512( M1, _mm512_set1_epi32( CSC ) ), \ + _mm512_set1_epi32( CS1 ) ); \ + G256_16WAY_ALT( V1, V6, VB, VC, \ + _mm512_xor_si512( M0, _mm512_set1_epi32( CS2 ) ), \ + _mm512_xor_si512( M2, _mm512_set1_epi32( CS0 ) ) ); \ + G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS7 ) , \ + _mm512_set1_epi32( CSB ) ); \ + G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS3 ) , \ + _mm512_xor_si512( M3, _mm512_set1_epi32( CS5 ) ) ); \ +} + +#define ROUND256_16WAY_2 \ +{ \ + G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS8 ) , \ + _mm512_set1_epi32( CSB ) ); \ + G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS0 ) , \ + _mm512_xor_si512( M0, 
_mm512_set1_epi32( CSC ) ) ); \ + G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS2 ) , \ + _mm512_xor_si512( M2, _mm512_set1_epi32( CS5 ) ) ); \ + G256_16WAY_ALT( V3, V7, VB, VF, \ + _mm512_xor_si512( MF, _mm512_set1_epi32( CSD ) ), \ + _mm512_xor_si512( MD, _mm512_set1_epi32( CSF ) ) ); \ + G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CSE ) , \ + _mm512_set1_epi32( CSA ) ); \ + G256_16WAY_ALT( V1, V6, VB, VC, \ + _mm512_xor_si512( M3, _mm512_set1_epi32( CS6 ) ), \ + _mm512_set1_epi32( CS3 ) ); \ + G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS1 ) , \ + _mm512_xor_si512( M1, _mm512_set1_epi32( CS7 ) ) ); \ + G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS4 ) , \ + _mm512_xor_si512( M4, _mm512_set1_epi32( CS9 ) ) ); \ +} + +#define ROUND256_16WAY_3 \ +{ \ + G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS9 ) , \ + _mm512_set1_epi32( CS7 ) ); \ + G256_16WAY_ALT( V1, V5, V9, VD, \ + _mm512_xor_si512( M3, _mm512_set1_epi32( CS1 ) ), \ + _mm512_xor_si512( M1, _mm512_set1_epi32( CS3 ) ) ); \ + G256_16WAY_ALT( V2, V6, VA, VE, \ + _mm512_xor_si512( MD, _mm512_set1_epi32( CSC ) ), \ + _mm512_set1_epi32( CSD ) ); \ + G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CSE ) , \ + _mm512_set1_epi32( CSB ) ); \ + G256_16WAY_ALT( V0, V5, VA, VF, \ + _mm512_xor_si512( M2, _mm512_set1_epi32( CS6 ) ), \ + _mm512_set1_epi32( CS2 ) ); \ + G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSA ) , \ + _mm512_set1_epi32( CS5 ) ); \ + G256_16WAY_ALT( V2, V7, V8, VD, \ + _mm512_xor_si512( M4, _mm512_set1_epi32( CS0 ) ), \ + _mm512_xor_si512( M0, _mm512_set1_epi32( CS4 ) ) ); \ + G256_16WAY_ALT( V3, V4, V9, VE, \ + _mm512_xor_si512( MF, _mm512_set1_epi32( CS8 ) ), \ + _mm512_set1_epi32( CSF ) ); \ +} + +#define ROUND256_16WAY_4 \ +{ \ + G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS0 ) , \ + _mm512_xor_si512( M0, _mm512_set1_epi32( CS9 ) ) ); \ + G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS7 ) , \ + _mm512_set1_epi32( CS5 ) ); \ + G256_16WAY_ALT( V2, V6, VA, VE, \ + _mm512_xor_si512( M2, _mm512_set1_epi32( CS4 ) ), \ + _mm512_xor_si512( M4, _mm512_set1_epi32( CS2 ) ) ); \ + G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CSF ) , \ + _mm512_xor_si512( MF, _mm512_set1_epi32( CSA ) ) ); \ + G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS1 ) , \ + _mm512_xor_si512( M1, _mm512_set1_epi32( CSE ) ) ); \ + G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSC ) , \ + _mm512_set1_epi32( CSB ) ); \ + G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS8 ) , \ + _mm512_set1_epi32( CS6 ) ); \ + G256_16WAY_ALT( V3, V4, V9, VE, \ + _mm512_xor_si512( M3, _mm512_set1_epi32( CSD ) ), \ + _mm512_xor_si512( MD, _mm512_set1_epi32( CS3 ) ) ); \ +} + +#define ROUND256_16WAY_5 \ +{ \ + G256_16WAY_ALT( V0, V4, V8, VC, \ + _mm512_xor_si512( M2, _mm512_set1_epi32( CSC ) ), \ + _mm512_set1_epi32( CS2 ) ); \ + G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CSA ) , \ + _mm512_set1_epi32( CS6 ) ); \ + G256_16WAY_ALT( V2, V6, VA, VE, \ + _mm512_xor_si512( M0, _mm512_set1_epi32( CSB ) ), \ + _mm512_set1_epi32( CS0 ) ); \ + G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CS3 ) , \ + _mm512_xor_si512( M3, _mm512_set1_epi32( CS8 ) ) ); \ + G256_16WAY_ALT( V0, V5, VA, VF, \ + _mm512_xor_si512( M4, _mm512_set1_epi32( CSD ) ), \ + _mm512_xor_si512( MD, _mm512_set1_epi32( CS4 ) ) ); \ + G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CS5 ) , \ + _mm512_set1_epi32( CS7 ) ); \ + G256_16WAY_ALT( V2, V7, V8, VD, \ + _mm512_xor_si512( MF, _mm512_set1_epi32( CSE ) ), \ + _mm512_set1_epi32( 
CSF ) ); \ + G256_16WAY_ALT( V3, V4, V9, VE, \ + _mm512_xor_si512( M1, _mm512_set1_epi32( CS9 ) ), \ + _mm512_set1_epi32( CS1 ) ); \ +} + +#define ROUND256_16WAY_6 \ +{ \ + G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS5 ) , \ + _mm512_set1_epi32( CSC ) ); \ + G256_16WAY_ALT( V1, V5, V9, VD, \ + _mm512_xor_si512( M1, _mm512_set1_epi32( CSF ) ), \ + _mm512_xor_si512( MF, _mm512_set1_epi32( CS1 ) ) ); \ + G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CSD ) , \ + _mm512_xor_si512( MD, _mm512_set1_epi32( CSE ) ) );\ + G256_16WAY_ALT( V3, V7, VB, VF, \ + _mm512_xor_si512( M4, _mm512_set1_epi32( CSA ) ), \ + _mm512_set1_epi32( CS4 ) ); \ + G256_16WAY_ALT( V0, V5, VA, VF, \ + _mm512_xor_si512( M0, _mm512_set1_epi32( CS7 ) ), \ + _mm512_set1_epi32( CS0 ) ); \ + G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CS3 ) , \ + _mm512_xor_si512( M3, _mm512_set1_epi32( CS6 ) ) ); \ + G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS2 ) , \ + _mm512_xor_si512( M2, _mm512_set1_epi32( CS9 ) ) ); \ + G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CSB ) , \ + _mm512_set1_epi32( CS8 ) ); \ +} + +#define ROUND256_16WAY_7 \ +{ \ + G256_16WAY_ALT( V0, V4, V8, VC, \ + _mm512_xor_si512( MD, _mm512_set1_epi32( CSB ) ), \ + _mm512_set1_epi32( CSD ) ); \ + G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CSE ) , \ + _mm512_set1_epi32( CS7 ) ); \ + G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS1 ) , \ + _mm512_xor_si512( M1, _mm512_set1_epi32( CSC ) ) ); \ + G256_16WAY_ALT( V3, V7, VB, VF, \ + _mm512_xor_si512( M3, _mm512_set1_epi32( CS9 ) ), \ + _mm512_set1_epi32( CS3 ) ); \ + G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS0 ) , \ + _mm512_xor_si512( M0, _mm512_set1_epi32( CS5 ) ) ); \ + G256_16WAY_ALT( V1, V6, VB, VC, \ + _mm512_xor_si512( MF, _mm512_set1_epi32( CS4 ) ), \ + _mm512_xor_si512( M4, _mm512_set1_epi32( CSF ) ) ); \ + G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS6 ) , \ + _mm512_set1_epi32( CS8 ) ); \ + G256_16WAY_ALT( V3, V4, V9, VE, \ + _mm512_xor_si512( M2, _mm512_set1_epi32( CSA ) ), \ + _mm512_set1_epi32( CS2 ) ); \ +} + +#define ROUND256_16WAY_8 \ +{ \ + G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CSF ), \ + _mm512_xor_si512( MF, _mm512_set1_epi32( CS6 ) ) ); \ + G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS9 ) , \ + _mm512_set1_epi32( CSE ) ); \ + G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS3 ) , \ + _mm512_xor_si512( M3, _mm512_set1_epi32( CSB ) ) ); \ + G256_16WAY_ALT( V3, V7, VB, VF, \ + _mm512_xor_si512( M0, _mm512_set1_epi32( CS8 ) ), \ + _mm512_set1_epi32( CS0 ) ); \ + G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS2 ) , \ + _mm512_xor_si512( M2, _mm512_set1_epi32( CSC ) ) ); \ + G256_16WAY_ALT( V1, V6, VB, VC, \ + _mm512_xor_si512( MD, _mm512_set1_epi32( CS7 ) ), \ + _mm512_set1_epi32( CSD ) ); \ + G256_16WAY_ALT( V2, V7, V8, VD, \ + _mm512_xor_si512( M1, _mm512_set1_epi32( CS4 ) ), \ + _mm512_xor_si512( M4, _mm512_set1_epi32( CS1 ) ) ); \ + G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS5 ) , \ + _mm512_set1_epi32( CSA ) ); \ +} + +#define ROUND256_16WAY_9 \ +{ \ + G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS2 ) , \ + _mm512_xor_si512( M2, _mm512_set1_epi32( CSA ) ) ); \ + G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS4 ) , \ + _mm512_xor_si512( M4, _mm512_set1_epi32( CS8 ) ) ); \ + G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS6 ) , \ + _mm512_set1_epi32( CS7 ) ); \ + G256_16WAY_ALT( V3, V7, VB, VF, \ + _mm512_xor_si512( M1, _mm512_set1_epi32( CS5 ) ), \ + _mm512_set1_epi32( CS1 ) ); \ + 
G256_16WAY_ALT( V0, V5, VA, VF, \ + _mm512_xor_si512( MF, _mm512_set1_epi32( CSB ) ), \ + _mm512_set1_epi32( CSF ) ); \ + G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSE ) , \ + _mm512_set1_epi32( CS9 ) ); \ + G256_16WAY_ALT( V2, V7, V8, VD, \ + _mm512_xor_si512( M3, _mm512_set1_epi32( CSC ) ), \ + _mm512_set1_epi32( CS3 ) ); \ + G256_16WAY_ALT( V3, V4, V9, VE, \ + _mm512_xor_si512( MD, _mm512_set1_epi32( CS0 ) ), \ + _mm512_xor_si512( M0, _mm512_set1_epi32( CSD ) ) ); \ +} + #define DECL_STATE32_16WAY \ __m512i H0, H1, H2, H3, H4, H5, H6, H7; \ sph_u32 T0, T1; @@ -1208,9 +1749,9 @@ do { \ // second part is run for each nonce using the precalculated midstate and the // hash from the first block. void blake256_16way_round0_prehash_le( void *midstate, const void *midhash, - const void *data ) + void *data ) { - const __m512i *M = (const __m512i*)data; + __m512i *M = (__m512i*)data; __m512i *V = (__m512i*)midstate; const __m512i *H = (const __m512i*)midhash; @@ -1231,10 +1772,21 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash, V[14] = m512_const1_32( CS6 ); V[15] = m512_const1_32( CS7 ); +// M[ 0:3 ] contain new message data including unique nonces in M[ 3]. +// M[ 5:12, 14 ] are always zero and not needed or used, except M[5] as noted. +// M[ 4], M[ 13], M[15] are constant and are initialized here. +// M[ 5] is a special case, used as a cache for (M[13] ^ CSC). + + M[ 4] = m512_const1_32( 0x80000000 ); + M[13] = m512_one_32; + M[15] = m512_const1_32( 80*8 ); + + M[ 5] =_mm512_xor_si512( M[13], _mm512_set1_epi32( CSC ) ); + // G0 GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] ); - // G1, nonce is in M[3] + // G1 // GS_16WAY( M[ 2], M[ 3], CS2, CS3, V1, V5, V9, VD ); V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ), _mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) ); @@ -1243,14 +1795,35 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash, V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 ); V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] ); - // G2,G3 - GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] ); - GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] ); + // G2 + // GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] ); + V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 6] ), + _mm512_xor_si512( _mm512_set1_epi32( CS5 ), M[ 4] ) ); + V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 2] ), 16 ); + V[10] = _mm512_add_epi32( V[10], V[14] ); + V[ 6] = mm512_ror_32( _mm512_xor_si512( V[ 6], V[10] ), 12 ); + V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 6] ), + _mm512_set1_epi32( CS4 ) ); + V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 2] ), 8 ); + V[10] = _mm512_add_epi32( V[10], V[14] ); \ + V[ 6] = mm512_ror_32( _mm512_xor_si512( V[ 6], V[10] ), 7 ); + + // G3 + // GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] ); + V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 7] ), + _mm512_set1_epi32( CS7 ) ); + V[15] = mm512_ror_32( _mm512_xor_si512( V[15], V[ 3] ), 16 ); + V[11] = _mm512_add_epi32( V[11], V[15] ); + V[ 7] = mm512_ror_32( _mm512_xor_si512( V[ 7], V[11] ), 12 ); + V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 7] ), + _mm512_set1_epi32( CS6 ) ); + V[15] = mm512_ror_32( _mm512_xor_si512( V[15], V[ 3] ), 8 ); + V[11] = _mm512_add_epi32( V[11], V[15] ); \ + V[ 7] = mm512_ror_32( _mm512_xor_si512( V[ 7], V[11] ), 7 ); // G4 // GS_16WAY( M[ 8], M[ 9], CS8, CS9, V0, V5, VA, VF ); - V[ 0] = _mm512_add_epi32( V[ 0], - 
_mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) ); + V[ 0] = _mm512_add_epi32( V[ 0], _mm512_set1_epi32( CS9 ) ); // G5 // GS_16WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC ); @@ -1258,11 +1831,11 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash, // G6 // GS_16WAY( M[12], M[13], CSC, CSD, V2, V7, V8, VD ); V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 7] ), - _mm512_xor_si512( _mm512_set1_epi32( CSD ), M[12] ) ); + _mm512_set1_epi32( CSD ) ); // G7 // GS_16WAY( M[14], M[15], CSE, CSF, V3, V4, V9, VE ); V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 4] ), - _mm512_xor_si512( _mm512_set1_epi32( CSF ), M[14] ) ); + _mm512_set1_epi32( CSF ) ); V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 3] ), 16 ); V[ 3] = _mm512_add_epi32( V[ 3], _mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) ); @@ -1273,45 +1846,38 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate, { __m512i *H = (__m512i*)final_hash; const __m512i *h = (const __m512i*)midhash; - const __m512i *v= (const __m512i*)midstate; __m512i V0, V1, V2, V3, V4, V5, V6, V7; __m512i V8, V9, VA, VB, VC, VD, VE, VF; - __m512i M0, M1, M2, M3, M4, M5, M6, M7; - __m512i M8, M9, MA, MB, MC, MD, ME, MF; + __m512i M0, M1, M2, M3, M4, MD, MF; + __m512i MDxorCSC; - V0 = v[ 0]; - V1 = v[ 1]; - V2 = v[ 2]; - V3 = v[ 3]; - V4 = v[ 4]; - V5 = v[ 5]; - V6 = v[ 6]; - V7 = v[ 7]; - V8 = v[ 8]; - V9 = v[ 9]; - VA = v[10]; - VB = v[11]; - VC = v[12]; - VD = v[13]; - VE = v[14]; - VF = v[15]; + V0 = _mm512_load_si512( (__m512i*)midstate + 0 ); + V1 = _mm512_load_si512( (__m512i*)midstate + 1 ); + V2 = _mm512_load_si512( (__m512i*)midstate + 2 ); + V3 = _mm512_load_si512( (__m512i*)midstate + 3 ); + V4 = _mm512_load_si512( (__m512i*)midstate + 4 ); + V5 = _mm512_load_si512( (__m512i*)midstate + 5 ); + V6 = _mm512_load_si512( (__m512i*)midstate + 6 ); + V7 = _mm512_load_si512( (__m512i*)midstate + 7 ); + V8 = _mm512_load_si512( (__m512i*)midstate + 8 ); + V9 = _mm512_load_si512( (__m512i*)midstate + 9 ); + VA = _mm512_load_si512( (__m512i*)midstate + 10 ); + VB = _mm512_load_si512( (__m512i*)midstate + 11 ); + VC = _mm512_load_si512( (__m512i*)midstate + 12 ); + VD = _mm512_load_si512( (__m512i*)midstate + 13 ); + VE = _mm512_load_si512( (__m512i*)midstate + 14 ); + VF = _mm512_load_si512( (__m512i*)midstate + 15 ); - M0 = casti_m512i( data, 0 ); - M1 = casti_m512i( data, 1 ); - M2 = casti_m512i( data, 2 ); - M3 = casti_m512i( data, 3 ); - M4 = casti_m512i( data, 4 ); - M5 = casti_m512i( data, 5 ); - M6 = casti_m512i( data, 6 ); - M7 = casti_m512i( data, 7 ); - M8 = casti_m512i( data, 8 ); - M9 = casti_m512i( data, 9 ); - MA = casti_m512i( data, 10 ); - MB = casti_m512i( data, 11 ); - MC = casti_m512i( data, 12 ); - MD = casti_m512i( data, 13 ); - ME = casti_m512i( data, 14 ); - MF = casti_m512i( data, 15 ); + M0 = _mm512_load_si512( (__m512i*)data + 0 ); + M1 = _mm512_load_si512( (__m512i*)data + 1 ); + M2 = _mm512_load_si512( (__m512i*)data + 2 ); + M3 = _mm512_load_si512( (__m512i*)data + 3 ); + M4 = _mm512_load_si512( (__m512i*)data + 4 ); + // M5 to MC & ME are zero padding and optimised out + MD = _mm512_load_si512( (__m512i*)data + 13 ); + MF = _mm512_load_si512( (__m512i*)data + 15 ); + // cache for precalculated MD^CSC, used in round0 G6. 
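Putting the two halves together: the caller hashes the first 64 header bytes once, runs the round-0 prehash once per job, and then only the nonce word and the final rounds are redone per batch. A condensed sketch of that calling pattern, following the scanhash code further down (names and the surrounding loop are illustrative, not a complete scanhash function):

#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"

/* Illustrative only: block0_hash is assumed to already hold the hash of
   the first 64 header bytes, broadcast across all 16 lanes, the way
   scanhash_allium_16way builds it from phash[0..7]. */
static void search_sketch( __m512i block0_hash[8], const uint32_t *pdata,
                           uint32_t first_nonce, uint32_t max_nonce )
{
   __m512i hash32[8], midstate[16], block_buf[16];
   uint32_t n = first_nonce;

   /* Once per job: fill the nonce-independent message words and run the
      partial round 0 without touching block_buf[3]. */
   block_buf[0] = _mm512_set1_epi32( pdata[16] );
   block_buf[1] = _mm512_set1_epi32( pdata[17] );
   block_buf[2] = _mm512_set1_epi32( pdata[18] );
   blake256_16way_round0_prehash_le( midstate, block0_hash, block_buf );

   /* Per batch of 16 nonces: only the nonce word and the remaining
      rounds are recomputed. */
   do
   {
      block_buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10,
                                       n+ 9, n+ 8, n+ 7, n+ 6, n+ 5, n+ 4,
                                       n+ 3, n+ 2, n+ 1, n );
      blake256_16way_final_rounds_le( hash32, midstate, block0_hash,
                                      block_buf );
      /* ... feed hash32 into the rest of the algo / target check ... */
      n += 16;
   } while ( n < max_nonce );
}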
+ MDxorCSC = _mm512_load_si512( (__m512i*)data + 5 ); // Finish round 0 with the nonce (M3) now available // G0 @@ -1336,21 +1902,30 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate, VA = _mm512_add_epi32( VA, VF ); V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 12 ); V0 = _mm512_add_epi32( V0, _mm512_add_epi32( V5, - _mm512_xor_si512( _mm512_set1_epi32( CS8 ), M9 ) ) ); + _mm512_set1_epi32( CS8 ) ) ); VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 8 ); VA = _mm512_add_epi32( VA, VF ); V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 ); // G5 - GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC ); + // GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC ); + V1 = _mm512_add_epi32( _mm512_add_epi32( V1, V6 ), + _mm512_set1_epi32( CSB ) ); + VC = mm512_ror_32( _mm512_xor_si512( VC, V1 ), 16 ); + VB = _mm512_add_epi32( VB, VC ); + V6 = mm512_ror_32( _mm512_xor_si512( V6, VB ), 12 ); + V1 = _mm512_add_epi32( _mm512_add_epi32( V1, V6 ), + _mm512_set1_epi32( CSA ) ); + VC = mm512_ror_32( _mm512_xor_si512( VC, V1 ), 8 ); + VB = _mm512_add_epi32( VB, VC ); + V6 = mm512_ror_32( _mm512_xor_si512( V6, VB ), 7 ); // G6 // GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD ); VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 16 ); V8 = _mm512_add_epi32( V8, VD ); V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 12 ); - V2 = _mm512_add_epi32( _mm512_add_epi32( V2, V7 ), - _mm512_xor_si512( _mm512_set1_epi32( CSC ), MD ) ); + V2 = _mm512_add_epi32( V2, _mm512_add_epi32( V7, MDxorCSC ) ); VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 8 ); V8 = _mm512_add_epi32( V8, VD ); V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 7 ); @@ -1364,20 +1939,20 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate, V9 = _mm512_add_epi32( V9, VE ); V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 7 ); - // Remaining rounds - ROUND_S_16WAY( 1 ); - ROUND_S_16WAY( 2 ); - ROUND_S_16WAY( 3 ); - ROUND_S_16WAY( 4 ); - ROUND_S_16WAY( 5 ); - ROUND_S_16WAY( 6 ); - ROUND_S_16WAY( 7 ); - ROUND_S_16WAY( 8 ); - ROUND_S_16WAY( 9 ); - ROUND_S_16WAY( 0 ); - ROUND_S_16WAY( 1 ); - ROUND_S_16WAY( 2 ); - ROUND_S_16WAY( 3 ); + // Remaining rounds, optimised + ROUND256_16WAY_1; + ROUND256_16WAY_2; + ROUND256_16WAY_3; + ROUND256_16WAY_4; + ROUND256_16WAY_5; + ROUND256_16WAY_6; + ROUND256_16WAY_7; + ROUND256_16WAY_8; + ROUND256_16WAY_9; + ROUND256_16WAY_0; + ROUND256_16WAY_1; + ROUND256_16WAY_2; + ROUND256_16WAY_3; // Byte swap final hash const __m512i shuf_bswap32 = diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c index 7fc7e01..7dcb825 100644 --- a/algo/blake/sph_blake2b.c +++ b/algo/blake/sph_blake2b.c @@ -103,16 +103,16 @@ const uint8_t *sigmaR = sigma[R]; \ BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \ BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \ - V2 = mm128_shufl2r_64( V[2], V[3] ); \ - V3 = mm128_shufl2r_64( V[3], V[2] ); \ - V6 = mm128_shufl2l_64( V[6], V[7] ); \ - V7 = mm128_shufl2l_64( V[7], V[6] ); \ + V2 = mm128_alignr_64( V[3], V[2] ); \ + V3 = mm128_alignr_64( V[2], V[3] ); \ + V6 = mm128_alignr_64( V[6], V[7] ); \ + V7 = mm128_alignr_64( V[7], V[6] ); \ BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \ BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \ - V[2] = mm128_shufl2l_64( V2, V3 ); \ - V[3] = mm128_shufl2l_64( V3, V2 ); \ - V[6] = mm128_shufl2r_64( V6, V7 ); \ - V[7] = mm128_shufl2r_64( V7, V6 ); \ + V[2] = mm128_alignr_64( V2, V3 ); \ + V[3] = mm128_alignr_64( V3, V2 ); \ + V[6] = mm128_alignr_64( V7, V6 ); \ + V[7] = mm128_alignr_64( V6, V7 ); \ } #else diff --git 
a/algo/jh/jh-hash-4way.c b/algo/jh/jh-hash-4way.c index 98a9da0..9b62444 100644 --- a/algo/jh/jh-hash-4way.c +++ b/algo/jh/jh-hash-4way.c @@ -49,12 +49,11 @@ extern "C"{ #define Sb_8W(x0, x1, x2, x3, c) \ do { \ - __m512i cc = _mm512_set1_epi64( c ); \ - x3 = mm512_not( x3 ); \ + const __m512i cc = _mm512_set1_epi64( c ); \ x0 = mm512_xorandnot( x0, x2, cc ); \ tmp = mm512_xorand( cc, x0, x1 ); \ - x0 = mm512_xorand( x0, x2, x3 ); \ - x3 = mm512_xorandnot( x3, x1, x2 ); \ + x0 = mm512_xorandnot( x0, x3, x2 ); \ + x3 = _mm512_ternarylogic_epi64( x3, x1, x2, 0x2d ); /* ~x3 ^ (~x1 & x2) */\ x1 = mm512_xorand( x1, x0, x2 ); \ x2 = mm512_xorandnot( x2, x3, x0 ); \ x0 = mm512_xoror( x0, x1, x3 ); \ @@ -79,7 +78,7 @@ do { \ #define Sb(x0, x1, x2, x3, c) \ do { \ - __m256i cc = _mm256_set1_epi64x( c ); \ + const __m256i cc = _mm256_set1_epi64x( c ); \ x3 = mm256_not( x3 ); \ x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \ tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \ diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index 528c4d5..6a90780 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -23,13 +23,26 @@ #include "simd-utils.h" #include "luffa_for_sse2.h" +#if defined(__SSE4_1__) + #define MULT2( a0, a1 ) do \ { \ - __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \ - a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \ - a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \ + __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \ + a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \ + a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \ } while(0) +#else + +#define MULT2( a0, a1 ) do \ +{ \ + __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \ + a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \ + a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \ +} while(0) + +#endif + #define STEP_PART(x,c,t)\ SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\ SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\ @@ -60,13 +73,13 @@ t = _mm_load_si128(&a0);\ a0 = _mm_or_si128(a0,a1);\ a2 = _mm_xor_si128(a2,a3);\ - a1 = _mm_andnot_si128(a1,ALLONE);\ + a1 = mm128_not( a1 );\ a0 = _mm_xor_si128(a0,a3);\ a3 = _mm_and_si128(a3,t);\ a1 = _mm_xor_si128(a1,a3);\ a3 = _mm_xor_si128(a3,a2);\ a2 = _mm_and_si128(a2,a0);\ - a0 = _mm_andnot_si128(a0,ALLONE);\ + a0 = mm128_not( a0 );\ a2 = _mm_xor_si128(a2,a1);\ a1 = _mm_or_si128(a1,a3);\ t = _mm_xor_si128(t,a1);\ @@ -242,17 +255,18 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = { __m128i CNS128[32]; -__m128i ALLONE; +#if !defined(__SSE4_1__) __m128i MASK; +#endif HashReturn init_luffa(hashState_luffa *state, int hashbitlen) { int i; state->hashbitlen = hashbitlen; +#if !defined(__SSE4_1__) /* set the lower 32 bits to '1' */ MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); - /* set all bits to '1' */ - ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); +#endif /* set the 32-bit round constant values to the 128-bit data field */ for ( i=0; i<32; i++ ) CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] ); @@ -352,10 +366,10 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, // Optimized for integrals of 16 bytes, good for 64 and 80 byte len int i; state->hashbitlen = hashbitlen; +#if !defined(__SSE4_1__) /* set the lower 32 
bits to '1' */ MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); - /* set all bits to '1' */ - ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); +#endif /* set the 32-bit round constant values to the 128-bit data field */ for ( i=0; i<32; i++ ) CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] ); diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index 63709a7..118deef 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -230,25 +230,13 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, block0_hash[7] = _mm512_set1_epi32( phash[7] ); // Build vectored second block, interleave last 16 bytes of data using - // unique nonces, add padding. + // unique nonces. block_buf[ 0] = _mm512_set1_epi32( pdata[16] ); block_buf[ 1] = _mm512_set1_epi32( pdata[17] ); block_buf[ 2] = _mm512_set1_epi32( pdata[18] ); block_buf[ 3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ); - block_buf[ 4] = m512_const1_32( 0x80000000 ); - block_buf[ 5] = - block_buf[ 6] = - block_buf[ 7] = - block_buf[ 8] = - block_buf[ 9] = - block_buf[10] = - block_buf[11] = - block_buf[12] = m512_zero; - block_buf[13] = m512_one_32; - block_buf[14] = m512_zero; - block_buf[15] = m512_const1_32( 80*8 ); // Partialy prehash second block without touching nonces in block_buf[3]. blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); @@ -425,24 +413,12 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, block0_hash[7] = _mm256_set1_epi32( phash[7] ); // Build vectored second block, interleave last 16 bytes of data using - // unique nonces and add padding. + // unique nonces. block_buf[ 0] = _mm256_set1_epi32( pdata[16] ); block_buf[ 1] = _mm256_set1_epi32( pdata[17] ); block_buf[ 2] = _mm256_set1_epi32( pdata[18] ); - block_buf[ 3] = - _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ); - block_buf[ 4] = m256_const1_32( 0x80000000 ); - block_buf[ 5] = - block_buf[ 6] = - block_buf[ 7] = - block_buf[ 8] = - block_buf[ 9] = - block_buf[10] = - block_buf[11] = - block_buf[12] = m256_zero; - block_buf[13] = m256_one_32; - block_buf[14] = m256_zero; - block_buf[15] = m256_const1_32( 80*8 ); + block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, + n+ 3, n+ 2, n+ 1, n ); // Partialy prehash second block without touching nonces blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index 995f71c..b50b071 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -120,25 +120,13 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce, block0_hash[7] = _mm512_set1_epi32( phash[7] ); // Build vectored second block, interleave last 16 bytes of data using - // unique nonces and add padding. + // unique nonces. 
block_buf[ 0] = _mm512_set1_epi32( pdata[16] ); block_buf[ 1] = _mm512_set1_epi32( pdata[17] ); block_buf[ 2] = _mm512_set1_epi32( pdata[18] ); block_buf[ 3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n ); - block_buf[ 4] = m512_const1_32( 0x80000000 ); - block_buf[ 5] = - block_buf[ 6] = - block_buf[ 7] = - block_buf[ 8] = - block_buf[ 9] = - block_buf[10] = - block_buf[11] = - block_buf[12] = m512_zero; - block_buf[13] = m512_one_32; - block_buf[14] = m512_zero; - block_buf[15] = m512_const1_32( 80*8 ); // Partialy prehash second block without touching nonces in block_buf[3]. blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); @@ -240,24 +228,12 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce, block0_hash[7] = _mm256_set1_epi32( phash[7] ); // Build vectored second block, interleave last 16 bytes of data using - // unique nonces and add padding. + // unique nonces. block_buf[ 0] = _mm256_set1_epi32( pdata[16] ); block_buf[ 1] = _mm256_set1_epi32( pdata[17] ); block_buf[ 2] = _mm256_set1_epi32( pdata[18] ); block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n ); - block_buf[ 4] = m256_const1_32( 0x80000000 ); - block_buf[ 5] = - block_buf[ 6] = - block_buf[ 7] = - block_buf[ 8] = - block_buf[ 9] = - block_buf[10] = - block_buf[11] = - block_buf[12] = m256_zero; - block_buf[13] = m256_one_32; - block_buf[14] = m256_zero; - block_buf[15] = m256_const1_32( 80*8 ); // Partialy prehash second block without touching nonces blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index dd96d79..de170f8 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -711,8 +711,11 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, { __m256i A, B, C, D, E, F, G, H; - X[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); - X[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); + // W[9:14] are zero, therefore X[9:13] are also zero and not needed. + // Except X[ 9] which is part of W[ 0] from the third group. 
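For reference, the scalar recurrence behind SSG2_0x/SSG2_1x and these X[] terms is the standard SHA-256 message schedule; a minimal sketch (plain uint32_t, hypothetical names) shows why a scheduled word whose inputs are known zero contributes nothing, which is what the pruned X[] terms below rely on when W[9:14] are zero:

#include <stdint.h>

static inline uint32_t ror32( uint32_t x, int n )
{  return ( x >> n ) | ( x << ( 32 - n ) );  }

/* sigma0 / sigma1, the scalar versions of SSG2_0x / SSG2_1x. */
static inline uint32_t ssg0( uint32_t x )
{  return ror32( x,  7 ) ^ ror32( x, 18 ) ^ ( x >>  3 );  }
static inline uint32_t ssg1( uint32_t x )
{  return ror32( x, 17 ) ^ ror32( x, 19 ) ^ ( x >> 10 );  }

/* Standard SHA-256 message expansion for rounds 16..63:
     W[t] = ssg1( W[t-2] ) + W[t-7] + ssg0( W[t-15] ) + W[t-16]
   When several of W[0..15] are fixed by padding, and some are zero, the
   constant part of each W[t] can be computed once per job (the X[] array)
   and only the terms that involve the nonce word are redone per nonce. */
static void sha256_expand( uint32_t W[64] )
{
   for ( int t = 16; t < 64; t++ )
      W[t] = ssg1( W[t-2] ) + W[t-7] + ssg0( W[t-15] ) + W[t-16];
}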
+ X[ 0] = _mm256_add_epi32( SSG2_0x( W[ 1] ), W[ 0] ); + X[ 1] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( W[15] ), + SSG2_0x( W[ 2] ) ), W[ 1] ); X[ 2] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 0] ), W[11] ), W[ 2] ); X[ 3] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 1] ), W[12] ), @@ -725,16 +728,12 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, W[ 6] ); X[ 7] = _mm256_add_epi32( _mm256_add_epi32( X[ 0], SSG2_0x( W[ 8] ) ), W[ 7] ); - X[ 8] = _mm256_add_epi32( _mm256_add_epi32( X[ 1], SSG2_0x( W[ 9] ) ), - W[ 8] ); - X[ 9] = _mm256_add_epi32( SSG2_0x( W[10] ), W[ 9] ); - X[10] = _mm256_add_epi32( SSG2_0x( W[11] ), W[10] ); - X[11] = _mm256_add_epi32( SSG2_0x( W[12] ), W[11] ); - X[12] = _mm256_add_epi32( SSG2_0x( W[13] ), W[12] ); - X[13] = _mm256_add_epi32( SSG2_0x( W[14] ), W[13] ); - X[14] = _mm256_add_epi32( SSG2_0x( W[15] ), W[14] ); + X[ 8] = _mm256_add_epi32( X[ 1], W[ 8] ); + X[14] = SSG2_0x( W[15] ); X[15] = _mm256_add_epi32( SSG2_0x( X[ 0] ), W[15] ); + X[ 9] = _mm256_add_epi32( SSG2_0x( X[ 1] ), X[ 0] ); + A = _mm256_load_si256( state_in ); B = _mm256_load_si256( state_in + 1 ); C = _mm256_load_si256( state_in + 2 ); @@ -779,10 +778,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, G = _mm256_load_si256( state_mid + 6 ); H = _mm256_load_si256( state_mid + 7 ); -// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); -// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); -// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - #if !defined(__AVX512VL__) __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H ); #endif @@ -810,23 +805,36 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, W[ 6] = _mm256_add_epi32( X[ 6], SSG2_1x( W[ 4] ) ); W[ 7] = _mm256_add_epi32( X[ 7], SSG2_1x( W[ 5] ) ); W[ 8] = _mm256_add_epi32( X[ 8], SSG2_1x( W[ 6] ) ); - W[ 9] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[ 7] ), - W[ 2] ) ); - W[10] = _mm256_add_epi32( X[10], _mm256_add_epi32( SSG2_1x( W[ 8] ), - W[ 3] ) ); - W[11] = _mm256_add_epi32( X[11], _mm256_add_epi32( SSG2_1x( W[ 9] ), - W[ 4] ) ); - W[12] = _mm256_add_epi32( X[12], _mm256_add_epi32( SSG2_1x( W[10] ), - W[ 5] ) ); - W[13] = _mm256_add_epi32( X[13], _mm256_add_epi32( SSG2_1x( W[11] ), - W[ 6] ) ); + W[ 9] = _mm256_add_epi32( SSG2_1x( W[ 7] ), W[ 2] ); + W[10] = _mm256_add_epi32( SSG2_1x( W[ 8] ), W[ 3] ); + W[11] = _mm256_add_epi32( SSG2_1x( W[ 9] ), W[ 4] ); + W[12] = _mm256_add_epi32( SSG2_1x( W[10] ), W[ 5] ); + W[13] = _mm256_add_epi32( SSG2_1x( W[11] ), W[ 6] ); W[14] = _mm256_add_epi32( X[14], _mm256_add_epi32( SSG2_1x( W[12] ), W[ 7] ) ); W[15] = _mm256_add_epi32( X[15], _mm256_add_epi32( SSG2_1x( W[13] ), W[ 8] ) ); SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); - SHA256x8_MSG_EXPANSION( W ); + + W[ 0] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[14] ), + W[ 9] ) ); + W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); + W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); + W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); + W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); + W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); + W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); + W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); + W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); + W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); + W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] ); + W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] ); + W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] ); + W[13] = SHA2x_MEXP( W[11], W[ 6], W[14], W[13] 
);
+   W[14] = SHA2x_MEXP( W[12], W[ 7], W[15], W[14] );
+   W[15] = SHA2x_MEXP( W[13], W[ 8], W[ 0], W[15] );
+
   SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
   SHA256x8_MSG_EXPANSION( W );
   SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -1201,9 +1209,13 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
 {
    __m512i A, B, C, D, E, F, G, H;

-   // precalculate constant part msg expansion for second iteration.
-   X[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
-   X[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
+   // X is pre-expanded constant part of msg for second group, rounds 16 to 31.
+   // W[9:14] are zero, therefore X[9:13] are also zero and not needed,
+   // except X[ 9], which is used to pre-expand part of W[ 0] from the third
+   // group, rounds 32 to 47.
+   X[ 0] = _mm512_add_epi32( SSG2_0x16( W[ 1] ), W[ 0] );
+   X[ 1] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( W[15] ),
+                             SSG2_0x16( W[ 2] ) ), W[ 1] );
    X[ 2] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 0] ), W[11] ),
                              W[ 2] );
    X[ 3] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 1] ), W[12] ),
@@ -1216,16 +1228,12 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
                              W[ 6] );
    X[ 7] = _mm512_add_epi32( _mm512_add_epi32( X[ 0], SSG2_0x16( W[ 8] ) ),
                              W[ 7] );
-   X[ 8] = _mm512_add_epi32( _mm512_add_epi32( X[ 1], SSG2_0x16( W[ 9] ) ),
-                             W[ 8] );
-   X[ 9] = _mm512_add_epi32( SSG2_0x16( W[10] ), W[ 9] );
-   X[10] = _mm512_add_epi32( SSG2_0x16( W[11] ), W[10] );
-   X[11] = _mm512_add_epi32( SSG2_0x16( W[12] ), W[11] );
-   X[12] = _mm512_add_epi32( SSG2_0x16( W[13] ), W[12] );
-   X[13] = _mm512_add_epi32( SSG2_0x16( W[14] ), W[13] );
-   X[14] = _mm512_add_epi32( SSG2_0x16( W[15] ), W[14] );
+   X[ 8] = _mm512_add_epi32( X[ 1], W[ 8] );
+   X[14] = SSG2_0x16( W[15] );
    X[15] = _mm512_add_epi32( SSG2_0x16( X[ 0] ), W[15] );
+   X[ 9] = _mm512_add_epi32( SSG2_0x16( X[ 1] ), X[ 0] );
+
    A = _mm512_load_si512( state_in );
    B = _mm512_load_si512( state_in + 1 );
    C = _mm512_load_si512( state_in + 2 );
@@ -1280,7 +1288,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
    SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
    SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );

-   // update precalculated msg expansion with new nonce: W[3].
+   // inject nonce, W[3], to complete msg expansion.
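The idea behind these hunks is easier to see in scalar form. The sketch below is illustrative only and not part of the patch; ssg0/ssg1 are the standard SHA-256 small sigmas, and the helper names prehash_X and inject_nonce are hypothetical. The schedule is W[i] = ssg1(W[i-2]) + W[i-7] + ssg0(W[i-15]) + W[i-16]; for the second block of an 80 byte header only W[3] (the nonce) changes per hash and the padding words are constant, so everything not touching W[3] is hoisted into X[] once per job and completed once per nonce.

#include <stdint.h>

static inline uint32_t ror32( uint32_t x, int c )
{  return ( x >> c ) | ( x << ( 32 - c ) );  }

static inline uint32_t ssg0( uint32_t x )   // SHA-256 sigma0
{  return ror32( x, 7 ) ^ ror32( x, 18 ) ^ ( x >> 3 );  }

static inline uint32_t ssg1( uint32_t x )   // SHA-256 sigma1
{  return ror32( x, 17 ) ^ ror32( x, 19 ) ^ ( x >> 10 );  }

// Once per job: terms that do not depend on the nonce W[3].
void prehash_X( uint32_t *X, const uint32_t *W )
{
   X[0] = ssg0( W[1] ) + W[0];                  // W16: ssg1(W14) and W9 are zero
   X[1] = ssg1( W[15] ) + ssg0( W[2] ) + W[1];  // W17: W10 is zero
   X[2] = ssg1( X[0] ) + W[11] + W[2];          // W18 less its ssg0(W3) term
}

// Once per nonce: add back only the terms that touch W[3].
void inject_nonce( uint32_t *W16, const uint32_t *X, uint32_t nonce )
{
   W16[0] = X[0];                   // W16 is fully precomputed
   W16[1] = X[1];                   // W17 too
   W16[2] = X[2] + ssg0( nonce );   // W18 completed with the nonce
}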
W[ 0] = X[ 0]; W[ 1] = X[ 1]; W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) ); @@ -1290,23 +1298,36 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, W[ 6] = _mm512_add_epi32( X[ 6], SSG2_1x16( W[ 4] ) ); W[ 7] = _mm512_add_epi32( X[ 7], SSG2_1x16( W[ 5] ) ); W[ 8] = _mm512_add_epi32( X[ 8], SSG2_1x16( W[ 6] ) ); - W[ 9] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[ 7] ), - W[ 2] ) ); - W[10] = _mm512_add_epi32( X[10], _mm512_add_epi32( SSG2_1x16( W[ 8] ), - W[ 3] ) ); - W[11] = _mm512_add_epi32( X[11], _mm512_add_epi32( SSG2_1x16( W[ 9] ), - W[ 4] ) ); - W[12] = _mm512_add_epi32( X[12], _mm512_add_epi32( SSG2_1x16( W[10] ), - W[ 5] ) ); - W[13] = _mm512_add_epi32( X[13], _mm512_add_epi32( SSG2_1x16( W[11] ), - W[ 6] ) ); + W[ 9] = _mm512_add_epi32( SSG2_1x16( W[ 7] ), W[ 2] ); + W[10] = _mm512_add_epi32( SSG2_1x16( W[ 8] ), W[ 3] ); + W[11] = _mm512_add_epi32( SSG2_1x16( W[ 9] ), W[ 4] ); + W[12] = _mm512_add_epi32( SSG2_1x16( W[10] ), W[ 5] ); + W[13] = _mm512_add_epi32( SSG2_1x16( W[11] ), W[ 6] ); W[14] = _mm512_add_epi32( X[14], _mm512_add_epi32( SSG2_1x16( W[12] ), W[ 7] ) ); W[15] = _mm512_add_epi32( X[15], _mm512_add_epi32( SSG2_1x16( W[13] ), W[ 8] ) ); SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); - SHA256x16_MSG_EXPANSION( W ); + + W[ 0] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[14] ), + W[ 9] ) ); + W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] ); + W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); + W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); + W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); + W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); + W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); + W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); + W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); + W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); + W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] ); + W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] ); + W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] ); + W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] ); + W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] ); + W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); SHA256x16_MSG_EXPANSION( W ); SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); @@ -1336,8 +1357,8 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, { __m512i A, B, C, D, E, F, G, H; __m512i W[16]; memcpy_512( W, data, 16 ); - // Value for H at round 60, before adding K, to produce valid final hash - //where H == 0. + // Value for H at round 60, before adding K, needed to produce valid final + // hash where H == 0. 
// H_ = -( H256[7] + K256[60] ); const __m512i H_ = m512_const1_32( 0x136032ED ); diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c index 856a07f..1e69bd5 100644 --- a/algo/simd/simd-hash-2way.c +++ b/algo/simd/simd-hash-2way.c @@ -383,11 +383,17 @@ static const m512_v16 FFT256_Twiddle4w[] = #define shufxor4w(x,s) _mm512_shuffle_epi32( x, XCAT( SHUFXOR_, s )) +#define REDUCE4w(x) \ + _mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \ + _mm512_srai_epi16( x, 8 ) ) + +/* #define REDUCE4w(x) \ _mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \ 0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) ) +*/ -#define EXTRA_REDUCE_S4w(x)\ +#define EXTRA_REDUCE_S4w(x) \ _mm512_sub_epi16( x, _mm512_and_si512( \ m512_const1_64( 0x0101010101010101 ), \ _mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \ @@ -400,8 +406,8 @@ static const m512_v16 FFT256_Twiddle4w[] = #define DO_REDUCE_FULL_S4w(i) \ do { \ - X(i) = REDUCE4w( X(i) ); \ - X(i) = EXTRA_REDUCE_S4w( X(i) ); \ + X(i) = REDUCE4w( X(i) ); \ + X(i) = EXTRA_REDUCE_S4w( X(i) ); \ } while(0) @@ -431,10 +437,6 @@ void fft64_4way( void *a ) // Unrolled decimation in frequency (DIF) radix-2 NTT. // Output data is in revbin_permuted order. - static const int w[] = {0, 2, 4, 6}; -// __m256i *Twiddle = (__m256i*)FFT64_Twiddle; - - // targetted #define BUTTERFLY_0( i,j ) \ do { \ @@ -443,25 +445,25 @@ do { \ X(i) = _mm512_sub_epi16( X(i), v ); \ } while(0) -#define BUTTERFLY_N( i,j,n ) \ +#define BUTTERFLY_N( i, j, w ) \ do { \ __m512i v = X(j); \ X(j) = _mm512_add_epi16( X(i), X(j) ); \ - X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w[n] ); \ + X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \ } while(0) BUTTERFLY_0( 0, 4 ); - BUTTERFLY_N( 1, 5, 1 ); - BUTTERFLY_N( 2, 6, 2 ); - BUTTERFLY_N( 3, 7, 3 ); + BUTTERFLY_N( 1, 5, 2 ); + BUTTERFLY_N( 2, 6, 4 ); + BUTTERFLY_N( 3, 7, 6 ); DO_REDUCE( 2 ); DO_REDUCE( 3 ); BUTTERFLY_0( 0, 2 ); BUTTERFLY_0( 4, 6 ); - BUTTERFLY_N( 1, 3, 2 ); - BUTTERFLY_N( 5, 7, 2 ); + BUTTERFLY_N( 1, 3, 4 ); + BUTTERFLY_N( 5, 7, 4 ); DO_REDUCE( 1 ); @@ -501,12 +503,11 @@ do { \ // Transpose the FFT state with a revbin order permutation // on the rows and the column. // This will make the full FFT_64 in order. 
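A note on the REDUCE4w change above: the mask 0x5555555555555555 in _mm512_maskz_mov_epi8 keeps only the even (low) byte of every 16 bit lane, which is exactly the old AND with 0x00ff but without materialising a constant vector. Either way the macro computes (x & 255) - (x >> 8), the usual partial reduction mod 257 used by the SIMD hash's NTT, since 256 is congruent to -1 mod 257. A scalar sanity check, illustrative only:

#include <assert.h>
#include <stdint.h>

// Partial reduction mod 257: x = 256*q + lo with q = x >> 8 (arithmetic
// shift assumed, as on all mainstream compilers) and lo = x & 0xff.
// Since 256 == -1 (mod 257), lo - q is congruent to x mod 257.
static int reduce257( int16_t x )
{  return ( x & 0xff ) - ( x >> 8 );  }

int main()
{
   for ( int32_t x = -32768; x < 32768; x++ )
   {
      int r = reduce257( (int16_t)x );
      assert( ( r - x ) % 257 == 0 );   // same residue class mod 257
   }
   return 0;
}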
-#define INTERLEAVE(i,j) \ +#define INTERLEAVE( i, j ) \ do { \ - __m512i t1= X(i); \ - __m512i t2= X(j); \ - X(i) = _mm512_unpacklo_epi16( t1, t2 ); \ - X(j) = _mm512_unpackhi_epi16( t1, t2 ); \ + __m512i u = X(j); \ + X(j) = _mm512_unpackhi_epi16( X(i), X(j) ); \ + X(i) = _mm512_unpacklo_epi16( X(i), u ); \ } while(0) INTERLEAVE( 1, 0 ); @@ -534,10 +535,10 @@ do { \ } while(0) -#define BUTTERFLY_N( i,j,n ) \ +#define BUTTERFLY_N( i, j, w ) \ do { \ __m512i u = X(j); \ - X(i) = _mm512_slli_epi16( X(i), w[n] ); \ + X(i) = _mm512_slli_epi16( X(i), w ); \ X(j) = _mm512_sub_epi16( X(j), X(i) ); \ X(i) = _mm512_add_epi16( u, X(i) ); \ } while(0) @@ -558,15 +559,15 @@ do { \ BUTTERFLY_0( 0, 2 ); BUTTERFLY_0( 4, 6 ); - BUTTERFLY_N( 1, 3, 2 ); - BUTTERFLY_N( 5, 7, 2 ); + BUTTERFLY_N( 1, 3, 4 ); + BUTTERFLY_N( 5, 7, 4 ); DO_REDUCE( 3 ); BUTTERFLY_0( 0, 4 ); - BUTTERFLY_N( 1, 5, 1 ); - BUTTERFLY_N( 2, 6, 2 ); - BUTTERFLY_N( 3, 7, 3 ); + BUTTERFLY_N( 1, 5, 2 ); + BUTTERFLY_N( 2, 6, 4 ); + BUTTERFLY_N( 3, 7, 6 ); DO_REDUCE_FULL_S4w( 0 ); DO_REDUCE_FULL_S4w( 1 ); @@ -599,7 +600,6 @@ void fft128_4way( void *a ) // Temp space to help for interleaving in the end __m512i B[8]; __m512i *A = (__m512i*) a; -// __m256i *Twiddle = (__m256i*)FFT128_Twiddle; /* Size-2 butterflies */ for ( i = 0; i<8; i++ ) @@ -633,7 +633,6 @@ void fft128_4way_msg( uint16_t *a, const uint8_t *x, int final ) __m512i *X = (__m512i*)x; __m512i *A = (__m512i*)a; -// __m256i *Twiddle = (__m256i*)FFT128_Twiddle; #define UNPACK( i ) \ do { \ @@ -686,7 +685,6 @@ void fft256_4way_msg( uint16_t *a, const uint8_t *x, int final ) __m512i *X = (__m512i*)x; __m512i *A = (__m512i*)a; -// __m256i *Twiddle = (__m256i*)FFT256_Twiddle; #define UNPACK( i ) \ do { \ @@ -776,109 +774,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft ) // We split the round function in two halfes // so as to insert some independent computations in between -// generic -#if 0 -#define SUM7_00 0 -#define SUM7_01 1 -#define SUM7_02 2 -#define SUM7_03 3 -#define SUM7_04 4 -#define SUM7_05 5 -#define SUM7_06 6 - -#define SUM7_10 1 -#define SUM7_11 2 -#define SUM7_12 3 -#define SUM7_13 4 -#define SUM7_14 5 -#define SUM7_15 6 -#define SUM7_16 0 - -#define SUM7_20 2 -#define SUM7_21 3 -#define SUM7_22 4 -#define SUM7_23 5 -#define SUM7_24 6 -#define SUM7_25 0 -#define SUM7_26 1 - -#define SUM7_30 3 -#define SUM7_31 4 -#define SUM7_32 5 -#define SUM7_33 6 -#define SUM7_34 0 -#define SUM7_35 1 -#define SUM7_36 2 - -#define SUM7_40 4 -#define SUM7_41 5 -#define SUM7_42 6 -#define SUM7_43 0 -#define SUM7_44 1 -#define SUM7_45 2 -#define SUM7_46 3 - -#define SUM7_50 5 -#define SUM7_51 6 -#define SUM7_52 0 -#define SUM7_53 1 -#define SUM7_54 2 -#define SUM7_55 3 -#define SUM7_56 4 - -#define SUM7_60 6 -#define SUM7_61 0 -#define SUM7_62 1 -#define SUM7_63 2 -#define SUM7_64 3 -#define SUM7_65 4 -#define SUM7_66 5 - -#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a) - -#define PERM_0(d,a) /* XOR 1 */ \ -do { \ - d##l = shufxor( a##l, 1 ); \ - d##h = shufxor( a##h, 1 ); \ - } while(0) - -#define PERM_1(d,a) /* XOR 6 */ \ -do { \ - d##l = shufxor( a##h, 2 ); \ - d##h = shufxor( a##l, 2 ); \ -} while(0) - -#define PERM_2(d,a) /* XOR 2 */ \ -do { \ - d##l = shufxor( a##l, 2 ); \ - d##h = shufxor( a##h, 2 ); \ -} while(0) - -#define PERM_3(d,a) /* XOR 3 */ \ -do { \ - d##l = shufxor( a##l, 3 ); \ - d##h = shufxor( a##h, 3 ); \ -} while(0) - -#define PERM_4(d,a) /* XOR 5 */ \ -do { \ - d##l = shufxor( a##h, 1 ); \ - d##h = shufxor( a##l, 1 ); \ -} 
while(0) - -#define PERM_5(d,a) /* XOR 7 */ \ -do { \ - d##l = shufxor( a##h, 3 ); \ - d##h = shufxor( a##l, 3 ); \ -} while(0) - -#define PERM_6(d,a) /* XOR 4 */ \ -do { \ - d##l = a##h; \ - d##h = a##l; \ -} while(0) -#endif - // targetted #define STEP_1_(a,b,c,d,w,fun,r,s,z) \ diff --git a/algo/x16/minotaur.c b/algo/x16/minotaur.c index c607c52..1216441 100644 --- a/algo/x16/minotaur.c +++ b/algo/x16/minotaur.c @@ -18,6 +18,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sph_sha2.h" +#include "algo/yespower/yespower.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" @@ -31,6 +32,9 @@ // Config #define MINOTAUR_ALGO_COUNT 16 +static const yespower_params_t minotaurx_yespower_params = + { YESPOWER_1_0, 2048, 8, "et in arcadia ego", 17 }; + typedef struct TortureNode TortureNode; typedef struct TortureGarden TortureGarden; @@ -59,20 +63,22 @@ struct TortureGarden sph_shabal512_context shabal; sph_whirlpool_context whirlpool; sph_sha512_context sha512; - - struct TortureNode { + struct TortureNode + { unsigned int algo; TortureNode *child[2]; } nodes[22]; } __attribute__ ((aligned (64))); // Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index -static void get_hash( void *output, const void *input, TortureGarden *garden, - unsigned int algo ) +static int get_hash( void *output, const void *input, TortureGarden *garden, + unsigned int algo, int thr_id ) { unsigned char hash[64] __attribute__ ((aligned (64))); + int rc = 1; - switch (algo) { + switch ( algo ) + { case 0: sph_blake512_init(&garden->blake); sph_blake512(&garden->blake, input, 64); @@ -97,14 +103,14 @@ static void get_hash( void *output, const void *input, TortureGarden *garden, sph_echo512(&garden->echo, input, 64); sph_echo512_close(&garden->echo, hash); #endif - break; + break; case 4: #if defined(__AES__) fugue512_full( &garden->fugue, hash, input, 64 ); #else sph_fugue512_full( &garden->fugue, hash, input, 64 ); #endif - break; + break; case 5: #if defined(__AES__) groestl512_full( &garden->groestl, (char*)hash, (char*)input, 512 ); @@ -113,7 +119,7 @@ static void get_hash( void *output, const void *input, TortureGarden *garden, sph_groestl512(&garden->groestl, input, 64); sph_groestl512_close(&garden->groestl, hash); #endif - break; + break; case 6: sph_hamsi512_init(&garden->hamsi); sph_hamsi512(&garden->hamsi, input, 64); @@ -164,16 +170,20 @@ static void get_hash( void *output, const void *input, TortureGarden *garden, sph_whirlpool(&garden->whirlpool, input, 64); sph_whirlpool_close(&garden->whirlpool, hash); break; + case 16: // minotaurx only, yespower hardcoded for last node + rc = yespower_tls( input, 64, &minotaurx_yespower_params, + (yespower_binary_t*)hash, thr_id ); } memcpy(output, hash, 64); + return rc; } static __thread TortureGarden garden; bool initialize_torture_garden() { - // Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete). + // Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete). 
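In other words the garden is a 22 node binary DAG: each node hashes the running 64 byte state with its assigned algorithm, and bit 0 of the last output byte selects the next child, so the route through the garden is data dependent. A stripped down sketch of the traversal, where hash_node is a hypothetical stand-in for get_hash above:

#include <stdint.h>

typedef struct Node { unsigned algo; struct Node *child[2]; } Node;

// hypothetical stand-in for get_hash(); returns 0 on failure, as the
// yespower path above can
extern int hash_node( uint8_t out[64], const uint8_t in[64], unsigned algo,
                      int thr_id );

static int walk_garden( uint8_t hash[64], Node *node, int thr_id )
{
   int rc = 1;
   while ( rc && node )
   {
      rc = hash_node( hash, hash, node->algo, thr_id );
      node = node->child[ hash[63] & 1 ];   // last byte's low bit picks child
   }
   return rc;
}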
garden.nodes[ 0].child[0] = &garden.nodes[ 1]; garden.nodes[ 0].child[1] = &garden.nodes[ 2]; @@ -219,7 +229,6 @@ bool initialize_torture_garden() garden.nodes[20].child[1] = &garden.nodes[21]; garden.nodes[21].child[0] = NULL; garden.nodes[21].child[1] = NULL; - return true; } @@ -227,38 +236,45 @@ bool initialize_torture_garden() int minotaur_hash( void *output, const void *input, int thr_id ) { unsigned char hash[64] __attribute__ ((aligned (64))); + int rc = 1; // Find initial sha512 hash sph_sha512_init( &garden.sha512 ); sph_sha512( &garden.sha512, input, 80 ); sph_sha512_close( &garden.sha512, hash ); - - // algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce - // if Hamsi is needed but only the first and last functions are - // currently known. Abort if either is Hamsi. - if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 ) - || ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) ) - return 0; + + if ( opt_algo != ALGO_MINOTAURX ) + { + // algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce + // if Hamsi is needed but only the first and last functions are + // currently known. Abort if either is Hamsi. + if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 ) + || ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) ) + return 0; + } // Assign algos to torture garden nodes based on initial hash for ( int i = 0; i < 22; i++ ) garden.nodes[i].algo = hash[i] % MINOTAUR_ALGO_COUNT; + // MinotaurX override algo for last node with yespower + if ( opt_algo == ALGO_MINOTAURX ) + garden.nodes[21].algo = MINOTAUR_ALGO_COUNT; + // Send the initial hash through the torture garden TortureNode *node = &garden.nodes[0]; - - while ( node ) + while ( rc && node ) { - get_hash( hash, hash, &garden, node->algo ); + rc = get_hash( hash, hash, &garden, node->algo, thr_id ); node = node->child[ hash[63] & 1 ]; } memcpy( output, hash, 32 ); - return 1; + return rc; } int scanhash_minotaur( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t edata[20] __attribute__((aligned(64))); uint32_t hash[8] __attribute__((aligned(64))); @@ -277,7 +293,7 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce, edata[19] = n; if ( likely( algo_gate.hash( hash, edata, thr_id ) ) ) { - if ( unlikely( valid_hash( hash, ptarget ) && !bench ) ) + if ( unlikely( valid_hash( hash, ptarget ) && !bench ) ) { pdata[19] = bswap_32( n ); submit_solution( work, hash, mythr ); @@ -291,12 +307,14 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce, return 0; } +// hash function has hooks for minotaurx bool register_minotaur_algo( algo_gate_t* gate ) { - gate->scanhash = (void*)&scanhash_minotaur; - gate->hash = (void*)&minotaur_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->scanhash = (void*)&scanhash_minotaur; + gate->hash = (void*)&minotaur_hash; gate->miner_thread_init = (void*)&initialize_torture_garden; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT; return true; }; diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index 5e725af..52b566b 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -1136,10 +1136,14 @@ int yespower(yespower_local_t *local, ctx.S0 = S; ctx.S1 = S + Swidth_to_Sbytes1( Swidth ); - // copy prehash, do tail - memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx ); - sha256_update( &sha256_ctx, src+64, srclen-64 
); - sha256_final( &sha256_ctx, sha256 ); + if ( srclen == 80 ) // assume 64 byte prehash was done + { + memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx ); + sha256_update( &sha256_ctx, src+64, srclen-64 ); + sha256_final( &sha256_ctx, sha256 ); + } + else + sha256_full( sha256, src, srclen ); if ( version == YESPOWER_0_5 ) { diff --git a/configure b/configure index 083de60..604cb61 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.3. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.21.0. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.20.3' -PACKAGE_STRING='cpuminer-opt 3.20.3' +PACKAGE_VERSION='3.21.0' +PACKAGE_STRING='cpuminer-opt 3.21.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.20.3 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.21.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.20.3:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.21.0:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.20.3 +cpuminer-opt configure 3.21.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.20.3, which was +It was created by cpuminer-opt $as_me 3.21.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.20.3' + VERSION='3.21.0' cat >>confdefs.h <<_ACEOF @@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.20.3, which was +This file was extended by cpuminer-opt $as_me 3.21.0, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6784,7 +6784,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.20.3 +cpuminer-opt config.status 3.21.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index d8005bb..6bda2c7 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.20.3]) +AC_INIT([cpuminer-opt], [3.21.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 7853d3d..5171196 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -131,10 +131,9 @@ bool opt_verify = false; static bool opt_stratum_keepalive = false; static struct timeval stratum_keepalive_timer; // Stratum typically times out in 5 minutes or 300 seconds -#define stratum_keepalive_timeout 180 // 3 minutes +#define stratum_keepalive_timeout 150 // 2.5 minutes static struct timeval stratum_reset_time; - // pk_buffer_size is used as a version selector by b58 code, therefore // it must be set correctly to work. const int pk_buffer_size_max = 26; @@ -2192,6 +2191,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) } // !quiet } // new diff/block +/* if ( new_job && !( opt_quiet || stratum_errors ) ) { int mismatch = submitted_share_count - ( accepted_share_count @@ -2202,6 +2202,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count ); } +*/ } static void *miner_thread( void *userdata ) @@ -2446,8 +2447,8 @@ static void *miner_thread( void *userdata ) { scale_hash_for_display( &hashrate, hr_units ); sprintf( hr, "%.2f", hashrate ); - applog( LOG_INFO, "CPU #%d: %s %sh/s", - thr_id, hr, hr_units ); + applog( LOG_INFO, "Thread %d, CPU %d: %s %sh/s", + thr_id, thread_affinity_map[ thr_id ], hr, hr_units ); } } @@ -2887,7 +2888,7 @@ static void *stratum_thread(void *userdata ) else timeval_subtract( &et, &now, &stratum_reset_time ); - if ( et.tv_sec > stratum_keepalive_timeout + 60 ) + if ( et.tv_sec > stratum_keepalive_timeout + 90 ) { applog( LOG_NOTICE, "No shares submitted, resetting stratum connection" ); stratum_need_reset = true; diff --git a/miner.h b/miner.h index 9d2329a..9096796 100644 --- a/miner.h +++ b/miner.h @@ -118,7 +118,7 @@ static inline bool is_windows(void) static inline uint32_t swab32(uint32_t v) { #ifdef WANT_BUILTIN_BSWAP - return __builtin_bswap32(v); + return __builtin_bswap32(v); #else return bswap_32(v); #endif @@ -559,6 +559,7 @@ enum algos { ALGO_LYRA2Z330, ALGO_M7M, ALGO_MINOTAUR, + ALGO_MINOTAURX, ALGO_MYR_GR, ALGO_NEOSCRYPT, ALGO_NIST5, @@ -652,6 +653,7 @@ static const char* const algo_names[] = { "lyra2z330", "m7m", "minotaur", + "minotaurx", "myr-gr", "neoscrypt", "nist5", @@ -813,6 +815,7 @@ Options:\n\ m7m Magi (XMG)\n\ myr-gr Myriad-Groestl\n\ minotaur\n\ + minotaurx\n\ neoscrypt NeoScrypt(128, 2, 1)\n\ nist5 Nist5\n\ pentablake 5 x blake512\n\ diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 6627a94..4b365c3 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -193,8 +193,17 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m ) // Basic operations without equivalent SIMD intrinsic // Bitwise not (~v) +#if defined(__AVX512VL__) + +static inline __m128i mm128_not( const __m128i v ) +{ return _mm_ternarylogic_epi64( v, v, v, 
1 ); } + +#else + #define mm128_not( v ) _mm_xor_si128( v, m128_neg1 ) +#endif + // Unary negation of elements (-v) #define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v ) #define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v ) @@ -439,7 +448,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) // // Limited 2 input shuffle, combines shuffle with blend. The destination low -// half is always taken from src a, and the high half from src b. +// half is always taken from v1, and the high half from v2. #define mm128_shuffle2_64( v1, v2, c ) \ _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \ _mm_castsi128_pd( v2 ), c ) ); @@ -600,9 +609,6 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) #endif // SSSE3 else SSE2 -// -// Rotate in place concatenated 128 bit vectors as one 256 bit vector. - // Swap 128 bit vectors. // This should be avoided, it's more efficient to switch references. #define mm128_swap256_128( v1, v2 ) \ @@ -611,61 +617,23 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) v1 = _mm_xor_si128( v1, v2 ); -// Two input shuffle-rotate. -// Concatenate v1 & v2 and byte rotate as a 256 bit vector. -// Function macros with two inputs and one output, inputs are preserved. -// Returns the high 128 bits, ie updated v1. +// alignr for 32 & 64 bit elements is only available with AVX512 but +// emulated here. Shift argument is not needed, it's always 1. +// Behaviour is otherwise consistent with Intel alignr intrinsics. #if defined(__SSSE3__) -#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) -#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) - -/* -#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 ) -#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 ) - -#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 ) -#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 ) - -#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 1 ) -#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 1 ) -*/ +#define mm128_alignr_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) +#define mm128_alignr_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 ) #else -#define mm128_shufl2r_64( v1, v2 ) \ - _mm_or_si128( _mm_srli_si128( v1, 8 ), \ - _mm_slli_si128( v2, 8 ) ) +#define mm128_alignr_64( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 8 ), \ + _mm_srli_si128( v2, 8 ) ) -#define mm128_shufl2l_64( v1, v2 ) \ - _mm_or_si128( _mm_slli_si128( v1, 8 ), \ - _mm_srli_si128( v2, 8 ) ) -/* -#define mm128_shufl2r_32( v1, v2 ) \ - _mm_or_si128( _mm_srli_si128( v1, 4 ), \ - _mm_slli_si128( v2, 12 ) ) +#define mm128_alignr_32( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 4 ), \ + _mm_srli_si128( v2, 4 ) ) -#define mm128_shufl2l_32( v1, v2 ) \ - _mm_or_si128( _mm_slli_si128( v1, 4 ), \ - _mm_srli_si128( v2, 12 ) ) - -#define mm128_shufl2r_16( v1, v2 ) \ - _mm_or_si128( _mm_srli_si128( v1, 2 ), \ - _mm_slli_si128( v2, 14 ) ) - -#define mm128_shufl2l_16( v1, v2 ) \ - _mm_or_si128( _mm_slli_si128( v1, 2 ), \ - _mm_srli_si128( v2, 14 ) ) - -#define mm128_shufl2r_8( v1, v2 ) \ - _mm_or_si128( _mm_srli_si128( v1, 1 ), \ - _mm_slli_si128( v2, 15 ) ) - -#define mm128_shufl2l_8( v1, v2 ) \ - _mm_or_si128( _mm_slli_si128( v1, 1 ), \ - _mm_srli_si128( v2, 15 ) ) -*/ #endif // Procedure macros with 2 inputs and 2 outputs, input args are overwritten. 
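The emulation above can be checked directly: _mm_alignr_epi8( v1, v2, 8 ) concatenates v1:v2 and shifts right by 8 bytes, leaving the high half of v2 in the low half of the result and the low half of v1 in the high half, which is exactly the OR of the two byte shifts in the SSE2 path. A small self test, illustrative only:

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>   // SSSE3

int main()
{
   const __m128i v1 = _mm_set_epi64x( 0x1111111111111111, 0x2222222222222222 );
   const __m128i v2 = _mm_set_epi64x( 0x3333333333333333, 0x4444444444444444 );

   __m128i a = _mm_alignr_epi8( v1, v2, 8 );              // SSSE3 form
   __m128i b = _mm_or_si128( _mm_slli_si128( v1, 8 ),     // SSE2 fallback
                             _mm_srli_si128( v2, 8 ) );

   uint64_t ra[2], rb[2];
   _mm_storeu_si128( (__m128i*)ra, a );
   _mm_storeu_si128( (__m128i*)rb, b );
   // both print 22222222222222223333333333333333
   printf( "%016llx%016llx\n%016llx%016llx\n",
           (unsigned long long)ra[1], (unsigned long long)ra[0],
           (unsigned long long)rb[1], (unsigned long long)rb[0] );
   return 0;
}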
@@ -689,50 +657,6 @@ do { \ v1 = t; \ } while(0) -/* -#define mm128_vror256_32( v1, v2 ) \ -do { \ - __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \ - v1 = _mm_alignr_epi8( v2, v1, 4 ); \ - v2 = t; \ -} while(0) - -#define mm128_vrol256_32( v1, v2 ) \ -do { \ - __m128i t = _mm_alignr_epi8( v1, v2, 12 ); \ - v2 = _mm_alignr_epi8( v2, v1, 12 ); \ - v1 = t; \ -} while(0) - -#define mm128_vror256_16( v1, v2 ) \ -do { \ - __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \ - v1 = _mm_alignr_epi8( v2, v1, 2 ); \ - v2 = t; \ -} while(0) - -#define mm128_vrol256_16( v1, v2 ) \ -do { \ - __m128i t = _mm_alignr_epi8( v1, v2, 14 ); \ - v2 = _mm_alignr_epi8( v2, v1, 14 ); \ - v1 = t; \ -} while(0) - -#define mm128_vror256_8( v1, v2 ) \ -do { \ - __m128i t = _mm_alignr_epi8( v1, v2, 1 ); \ - v1 = _mm_alignr_epi8( v2, v1, 1 ); \ - v2 = t; \ -} while(0) - -#define mm128_vrol256_8( v1, v2 ) \ -do { \ - __m128i t = _mm_alignr_epi8( v1, v2, 15 ); \ - v2 = _mm_alignr_epi8( v2, v1, 15 ); \ - v1 = t; \ -} while(0) -*/ - #else // SSE2 #define mm128_vror256_64( v1, v2 ) \ @@ -752,61 +676,7 @@ do { \ _mm_srli_si128( v1, 8 ) ); \ v1 = t; \ } while(0) -/* -#define mm128_vror256_32( v1, v2 ) \ -do { \ - __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \ - _mm_slli_si128( v2, 12 ) ); \ - v2 = _mm_or_si128( _mm_srli_si128( v2, 4 ), \ - _mm_slli_si128( v1, 12 ) ); \ - v1 = t; \ -} while(0) -#define mm128_vrol256_32( v1, v2 ) \ -do { \ - __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \ - _mm_srli_si128( v2, 12 ) ); \ - v2 = _mm_or_si128( _mm_slli_si128( v2, 4 ), \ - _mm_srli_si128( v1, 12 ) ); \ - v1 = t; \ -} while(0) - -#define mm128_vror256_16( v1, v2 ) \ -do { \ - __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \ - _mm_slli_si128( v2, 14 ) ); \ - v2 = _mm_or_si128( _mm_srli_si128( v2, 2 ), \ - _mm_slli_si128( v1, 14 ) ); \ - v1 = t; \ -} while(0) - -#define mm128_vrol256_16( v1, v2 ) \ -do { \ - __m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \ - _mm_srli_si128( v2, 14 ) ); \ - v2 = _mm_or_si128( _mm_slli_si128( v2, 2 ), \ - _mm_srli_si128( v1, 14 ) ); \ - v1 = t; \ -} while(0) - -#define mm128_vror256_8( v1, v2 ) \ -do { \ - __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \ - _mm_slli_si128( v2, 15 ) ); \ - v2 = _mm_or_si128( _mm_srli_si128( v2, 1 ), \ - _mm_slli_si128( v1, 15 ) ); \ - v1 = t; \ -} while(0) - -#define mm128_vrol256_8( v1, v2 ) \ -do { \ - __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \ - _mm_srli_si128( v2, 15 ) ); \ - v2 = _mm_or_si128( _mm_slli_si128( v2, 1 ), \ - _mm_srli_si128( v1, 15 ) ); \ - v1 = t; \ -} while(0) -*/ #endif // SSE4.1 else SSE2 #endif // __SSE2__ diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index d4da4f9..8b5ff40 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -15,14 +15,13 @@ // // "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit // lanes and data can't cross the 128 bit lane boundary. -// Some usage may have the index vector encoded as if full vector -// shuffles are supported. This has no side effects and would have the same -// results using either version. -// If the need arises and AVX512VL is available, 256 bit full vector shuffles -// can be implemented using the AVX512 zero-mask feature with a NULL mask. 
-// Using intrinsics it's simple: _mm256_maskz_shuffle_epi8( 0, v, c )
-// With asm it's a bit more complicated with the addition of the mask register
-// and zero tag: vpshufb ymm0{k0}{z}, ymm1, ymm2
+// Instructions that can move data across a 128 bit lane boundary incur a
+// performance penalty over those that can't.
+// Some usage of index vectors may be encoded as if full vector shuffles are
+// supported. This has no side effects and would have the same results using
+// either version.
+// If the need arises and AVX512VL is available, 256 bit full vector byte
+// shuffles can be implemented using the AVX512 mask feature with a NULL mask.

 #if defined(__AVX__)

@@ -141,7 +140,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //
 // Basic operations without SIMD equivalent

-// Bitwise not ( ~v )
 #if defined(__AVX512VL__)

 static inline __m256i mm256_not( const __m256i v )
diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h
index 7916bc4..3124587 100644
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -37,13 +37,21 @@
 // version of this specific instruction does not.
 //
 // New alignr instructions for epi64 and epi32 operate across the entire
-// vector. "_mm512_alignr_epi8" continues to be restricted to 128 bit lanes.
+// vector but are slower than epi8, which continues to be restricted to
+// 128 bit lanes.
 //
 // "_mm512_permutexvar_epi8" and "_mm512_permutex2var_epi8" require
 // AVX512-VBMI. The same instructions with larger elements don't have this
 // requirement. "_mm512_permutexvar_epi8" also performs the same operation
 // as "_mm512_shuffle_epi8" which only requires AVX512-BW.
 //
+// Two coding conventions are used to prevent macro argument side effects:
+// - if a macro arg is used in an expression it must be protected by
+//   parentheses to ensure an expression argument is evaluated first.
+// - if an argument is to be referenced multiple times a C inline function
+//   should be used instead of a macro to prevent an expression argument
+//   from being evaluated multiple times.
+//
 // There are 2 areas where overhead is a major concern: constants and
 // permutations.
 //
@@ -184,7 +192,6 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
 // Basic operations without SIMD equivalent

 // Bitwise NOT: ~x
-// #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
 static inline __m512i mm512_not( const __m512i x )
 {  return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
@@ -295,7 +302,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 #define mm512_nand( a, b ) \
    _mm512_ternarylogic_epi64( a, b, b, 0xef )

-
+/*
 // Diagonal blending
 // Blend 8 64 bit elements from 8 vectors
 #define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
@@ -313,6 +320,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
        _mm512_mask_blend_epi32( 0x3333, \
              _mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
              _mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
+*/

 /*
 //
@@ -374,6 +382,19 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 #define mm512_ror_32    _mm512_ror_epi32
 #define mm512_rol_32    _mm512_rol_epi32

+/*
+#if defined(__AVX512VBMI2__)
+
+// Use C inline function in case arg is coded as an expression.
+static inline __m512i mm512_ror_16( __m512i v, int c )
+{  return _mm512_shrdi_epi16( v, v, c ); }
+
+static inline __m512i mm512_rol_16( __m512i v, int c )
+{  return _mm512_shldi_epi16( v, v, c ); }
+
+#endif
+*/
+
 //
 // Reverse byte order of packed elements, vectorized endian conversion.
@@ -518,7 +539,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 //
 // Rotate elements within 256 bit lanes of 512 bit vector.
-// 128 bit lane shift is handled by bslli bsrli.

 // Swap hi & lo 128 bits in each 256 bit lane
 #define mm512_swap256_128( v )   _mm512_permutex_epi64( v, 0x4e )
@@ -623,22 +643,5 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 #define mm512_shuflr32_8( v )   _mm512_ror_epi32( v, 8 )
 #define mm512_shufll32_8( v )   _mm512_rol_epi32( v, 8 )

-/*
-// 2 input, 1 output
-// Concatenate { v1, v2 } then rotate right or left and return the high
-// 512 bits, ie rotated v1.
-#define mm512_shufl2r_256( v1, v2 )   _mm512_alignr_epi64( v2, v1, 4 )
-#define mm512_shufl2l_256( v1, v2 )   _mm512_alignr_epi64( v1, v2, 4 )
-
-#define mm512_shufl2r_128( v1, v2 )   _mm512_alignr_epi64( v2, v1, 2 )
-#define mm512_shufl2l_128( v1, v2 )   _mm512_alignr_epi64( v1, v2, 2 )
-
-#define mm512_shufl2r_64( v1, v2 )    _mm512_alignr_epi64( v2, v1, 1 )
-#define mm512_shufl2l_64( v1, v2 )    _mm512_alignr_epi64( v1, v2, 1 )
-
-#define mm512_shufl2r_32( v1, v2 )    _mm512_alignr_epi32( v2, v1, 1 )
-#define mm512_shufl2l_32( v1, v2 )    _mm512_alignr_epi32( v1, v2, 1 )
-*/
-
 #endif // AVX512
 #endif // SIMD_512_H__
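The two macro hygiene conventions documented in the simd-512.h header above deserve a concrete illustration. The toy code below is not from the source; it only shows why an argument referenced more than once inside a macro must not be an expression with side effects, and why the inline function form is safe:

#include <stdio.h>

// Unsafe when the argument has side effects: 'v' is expanded twice.
#define SQUARE_MACRO( v ) ( (v) * (v) )

// Safe: the argument expression is evaluated exactly once.
static inline int square_fn( int v ) {  return v * v;  }

static int calls = 0;
static int next( void ) {  return ++calls;  }

int main()
{
   int bad  = SQUARE_MACRO( next() );   // next() runs twice: 1 * 2 == 2
   calls = 0;
   int good = square_fn( next() );      // next() runs once:  1 * 1 == 1
   printf( "macro: %d  inline: %d\n", bad, good );
   return 0;
}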