This commit is contained in:
Jay D Dee
2022-12-21 13:09:14 -05:00
parent bd84f199fe
commit da7030faa8
24 changed files with 997 additions and 642 deletions

View File

@@ -1,4 +1,6 @@
These instructions may be out of date, see the Wiki for the latest...
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
1. Requirements:
---------------

View File

@@ -1,6 +1,6 @@
Instructions for compiling cpuminer-opt for Windows.
Thwaw intructions nay be out of date. Please consult the wiki for
These instructions are out of date. Please consult the wiki for
the latest:
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source

View File

@@ -74,53 +74,50 @@ Supported Algorithms
argon2d250 argon2d-crds, Credits (CRDS)
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
axiom Shabal-256 MemoHash
blake Blake-256 (SFR)
blake2b Blake2b 256
blake2s Blake-2 S
blake Blake-256
blake2b Blake2-512
blake2s Blake2-256
blakecoin blake256r8
bmw BMW 256
bmw512 BMW 512
c11 Chaincoin
c11
decred
deep Deepcoin (DCN)
dmd-gr Diamond-Groestl
groestl Groestl coin
hex x16r-hex
hmq1725 Espers
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
lbry LBC, LBRY Credits
luffa Luffa
lyra2h Hppcoin
lyra2h
lyra2re lyra2
lyra2rev2 lyra2v2
lyra2rev3 lyrav2v3
lyra2z
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
m7m Magi (XMG)
minotaur Ringcoin (RNG)
lyra2z330
m7m
minotaur
minotaurx
myr-gr Myriad-Groestl
neoscrypt NeoScrypt(128, 2, 1)
nist5 Nist5
pentablake Pentablake
phi1612 phi
phi2 Luxcoin (LUX)
phi2-lux identical to phi2
pluck Pluck:128 (Supcoin)
phi2
polytimos Ninja
power2b MicroBitcoin (MBC)
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256q Quad SHA-256, Pyrite (PYE)
sha256t Triple SHA-256, Onecoin (OC)
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -136,17 +133,17 @@ Supported Algorithms
x11 Dash
x11evo Revolvercoin
x11gost sib (SibCoin)
x12 Galaxie Cash (GCH)
x13 X13
x12
x13
x13bcd bcd
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x14
x15
x16r
x16rv2
x16rt Gincoin (GIN)
x16rt-veil Veil (VEIL)
x16s Pigeoncoin (PGN)
x16rt
x16rt-veil veil
x16s
x17
x21s
x22i

View File

@@ -73,7 +73,6 @@ third party packages. They often will work and may be used instead of the
included version of the files.
If you like this software feel free to donate:
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT

View File

@@ -65,6 +65,12 @@ If not what makes it happen or not happen?
Change Log
----------
v3.21.0
Added minotaurx algo for stratum only.
Blake256 & sha256 prehash optimised to ignore zero-padded data for AVX2 & AVX512.
Other small improvements.
v3.20.3
Faster c11 algo: AVX512 6%, AVX2 4%, AVX2+VAES 15%.
@@ -98,12 +104,9 @@ v3.19.8
#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
url protocol specifier for requesting a secure stratum connection.
The full url, including the protocol, is now displayed in the stratum connect
log and the periodic summary log.
Small optimizations to Cubehash, AVX2 & AVX512.
Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
v3.19.7

View File

@@ -327,6 +327,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_LYRA2Z330: rc = register_lyra2z330_algo ( gate ); break;
case ALGO_M7M: rc = register_m7m_algo ( gate ); break;
case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break;
case ALGO_MINOTAURX: rc = register_minotaur_algo ( gate ); break;
case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break;
case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break;
case ALGO_NIST5: rc = register_nist5_algo ( gate ); break;

View File

@@ -115,7 +115,7 @@ void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
@@ -178,7 +178,7 @@ void blake256_16way_close(void *cc, void *dst);
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );

View File

@@ -668,6 +668,258 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
}
// Shortcut message expansion when the message data is known to be zero.
// M[ 5:12, 14 ] are zero padding for the second block of 80-byte data.
#define G256_8WAY_ALT( a, b, c, d, m0, m1 ) \
{ \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m0 ); \
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m1 ); \
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
}
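A scalar reference sketch of what the _ALT shortcut exploits (illustrative only, not part of this commit; rotr32 and g256_alt are hypothetical names). BLAKE-256's quarter round adds (message ^ constant) operands selected by the per-round sigma permutation, so whenever the selected message word is known to be zero the operand reduces to the bare constant, which is exactly the pre-combined form G256_8WAY_ALT receives:
#include <stdint.h>
static inline uint32_t rotr32( uint32_t x, int r )
{  return ( x >> r ) | ( x << ( 32 - r ) );  }
// m0 = m[sigma(2i)] ^ CS[sigma(2i+1)], m1 = m[sigma(2i+1)] ^ CS[sigma(2i)];
// for a message word known to be zero the caller passes just the constant.
static inline void g256_alt( uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                             uint32_t m0, uint32_t m1 )
{
   *a += *b + m0;   *d = rotr32( *d ^ *a, 16 );
   *c += *d;        *b = rotr32( *b ^ *c, 12 );
   *a += *b + m1;   *d = rotr32( *d ^ *a,  8 );
   *c += *d;        *b = rotr32( *b ^ *c,  7 );
}
The ROUND256_8WAY_r macros below (and their 16-way twins later in this file) hard-code round r's sigma permutation: only M0:M4, MD and MF can be non-zero, so every other operand collapses to a plain _mm256_set1_epi32 constant.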
// Message expansion optimized for each round.
#define ROUND256_8WAY_0 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS1 ) ), \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS0 ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS3 ) ), \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS2 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS5 ) ), \
_mm256_set1_epi32( CS4 ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CS7 ) , \
_mm256_set1_epi32( CS6 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS9 ) , \
_mm256_set1_epi32( CS8 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSB ) , \
_mm256_set1_epi32( CSA ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CSD ) , \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CSF ) , \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSE ) ) ); \
}
#define ROUND256_8WAY_1 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CSA ) , \
_mm256_set1_epi32( CSE ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS8 ) ), \
_mm256_set1_epi32( CS4 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CSF ) , \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS9 ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS6 ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CS1 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS2 ) ), \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS0 ) ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS7 ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS5 ) ) ); \
}
#define ROUND256_8WAY_2 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS8 ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS0 ) , \
_mm256_xor_si256( M0, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS5 ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSD ) ), \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSF ) ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CSA ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS6 ) ), \
_mm256_set1_epi32( CS3 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS1 ) , \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS7 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS4 ) , \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS9 ) ) ); \
}
#define ROUND256_8WAY_3 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS9 ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS1 ) ), \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS3 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS6 ) ), \
_mm256_set1_epi32( CS2 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSA ) , \
_mm256_set1_epi32( CS5 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS0 ) ), \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS4 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS8 ) ), \
_mm256_set1_epi32( CSF ) ); \
}
#define ROUND256_8WAY_4 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS0 ) , \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS9 ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS7 ) , \
_mm256_set1_epi32( CS5 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS4 ) ), \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS2 ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CSF ) , \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSA ) ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS1 ) , \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSE ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSC ) , \
_mm256_set1_epi32( CSB ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS8 ) , \
_mm256_set1_epi32( CS6 ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CSD ) ), \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS3 ) ) ); \
}
#define ROUND256_8WAY_5 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CS2 ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CSA ) , \
_mm256_set1_epi32( CS6 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CSB ) ), \
_mm256_set1_epi32( CS0 ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS8 ) ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CSD ) ), \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS4 ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CS5 ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSE ) ), \
_mm256_set1_epi32( CSF ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS9 ) ), \
_mm256_set1_epi32( CS1 ) ); \
}
#define ROUND256_8WAY_6 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS5 ) , \
_mm256_set1_epi32( CSC ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSF ) ), \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS1 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CSD ) , \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSE ) ) );\
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M4, _mm256_set1_epi32( CSA ) ), \
_mm256_set1_epi32( CS4 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS7 ) ), \
_mm256_set1_epi32( CS0 ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS6 ) ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CS9 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CSB ) , \
_mm256_set1_epi32( CS8 ) ); \
}
#define ROUND256_8WAY_7 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CSB ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS1 ) , \
_mm256_xor_si256( M1, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CS9 ) ), \
_mm256_set1_epi32( CS3 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS0 ) , \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS5 ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS4 ) ), \
_mm256_xor_si256( M4, _mm256_set1_epi32( CSF ) ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, _mm256_set1_epi32( CS6 ) , \
_mm256_set1_epi32( CS8 ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSA ) ), \
_mm256_set1_epi32( CS2 ) ); \
}
#define ROUND256_8WAY_8 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CSF ), \
_mm256_xor_si256( MF, _mm256_set1_epi32( CS6 ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS9 ) , \
_mm256_set1_epi32( CSE ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS3 ) , \
_mm256_xor_si256( M3, _mm256_set1_epi32( CSB ) ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M0, _mm256_set1_epi32( CS8 ) ), \
_mm256_set1_epi32( CS0 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSC ) ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS7 ) ), \
_mm256_set1_epi32( CSD ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS4 ) ), \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS1 ) ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, _mm256_set1_epi32( CS5 ) , \
_mm256_set1_epi32( CSA ) ); \
}
#define ROUND256_8WAY_9 \
{ \
G256_8WAY_ALT( V0, V4, V8, VC, _mm256_set1_epi32( CS2 ) , \
_mm256_xor_si256( M2, _mm256_set1_epi32( CSA ) ) ); \
G256_8WAY_ALT( V1, V5, V9, VD, _mm256_set1_epi32( CS4 ) , \
_mm256_xor_si256( M4, _mm256_set1_epi32( CS8 ) ) ); \
G256_8WAY_ALT( V2, V6, VA, VE, _mm256_set1_epi32( CS6 ) , \
_mm256_set1_epi32( CS7 ) ); \
G256_8WAY_ALT( V3, V7, VB, VF, \
_mm256_xor_si256( M1, _mm256_set1_epi32( CS5 ) ), \
_mm256_set1_epi32( CS1 ) ); \
G256_8WAY_ALT( V0, V5, VA, VF, \
_mm256_xor_si256( MF, _mm256_set1_epi32( CSB ) ), \
_mm256_set1_epi32( CSF ) ); \
G256_8WAY_ALT( V1, V6, VB, VC, _mm256_set1_epi32( CSE ) , \
_mm256_set1_epi32( CS9 ) ); \
G256_8WAY_ALT( V2, V7, V8, VD, \
_mm256_xor_si256( M3, _mm256_set1_epi32( CSC ) ), \
_mm256_set1_epi32( CS3 ) ); \
G256_8WAY_ALT( V3, V4, V9, VE, \
_mm256_xor_si256( MD, _mm256_set1_epi32( CS0 ) ), \
_mm256_xor_si256( M0, _mm256_set1_epi32( CSD ) ) ); \
}
#define DECL_STATE32_8WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
sph_u32 T0, T1;
@@ -834,9 +1086,9 @@ do { \
}
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
void *data )
{
const __m256i *M = (const __m256i*)data;
__m256i *M = (__m256i*)data;
__m256i *V = (__m256i*)midstate;
const __m256i *H = (const __m256i*)midhash;
@@ -857,6 +1109,17 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
V[14] = m256_const1_32( CS6 );
V[15] = m256_const1_32( CS7 );
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
// M[ 5:12, 14 ] are always zero and not needed or used, except M[ 5] as noted.
// M[ 4], M[ 13], M[15] are constant and are initialized here.
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
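// For an 80-byte input the BLAKE-256 padding fills this block as follows:
// M[ 4] = 0x80000000 is the leading padding bit, M[13] = 1 is the closing
// marker bit, and M[15] = 80*8 is the low word of the 64-bit bit length
// (M[14], the high word, stays zero).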
M[ 4] = m256_const1_32( 0x80000000 );
M[13] = m256_one_32;
M[15] = m256_const1_32( 80*8 );
M[ 5] =_mm256_xor_si256( M[13], _mm256_set1_epi32( CSC ) );
// G0
GS_8WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
@@ -868,21 +1131,45 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G2
// GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
_mm256_xor_si256( _mm256_set1_epi32( CS5 ), M[ 4] ) );
V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 2] ) );
V[10] = _mm256_add_epi32( V[10], V[14] );
V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 12 );
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
_mm256_set1_epi32( CS4 ) );
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 2] ), 8 );
V[10] = _mm256_add_epi32( V[10], V[14] );
V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 7 );
// G3
// GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
_mm256_set1_epi32( CS7 ) );
V[15] = mm256_swap32_16( _mm256_xor_si256( V[15], V[ 3] ) );
V[11] = _mm256_add_epi32( V[11], V[15] );
V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 12 );
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
_mm256_set1_epi32( CS6 ) );
V[15] = mm256_ror_32( _mm256_xor_si256( V[15], V[ 3] ), 8 );
V[11] = _mm256_add_epi32( V[11], V[15] );
V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 7 );
// G4
V[ 0] = _mm256_add_epi32( V[ 0],
_mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) );
V[ 0] = _mm256_add_epi32( V[ 0], _mm256_set1_epi32( CS9 ) );
// G5
// GS_8WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
// G6
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 7] ),
_mm256_xor_si256( _mm256_set1_epi32( CSD ), M[12] ) );
_mm256_set1_epi32( CSD ) );
// G7
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
_mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
_mm256_set1_epi32( CSF ) );
V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) );
V[ 3] = _mm256_add_epi32( V[ 3],
_mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
@@ -893,47 +1180,40 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
{
__m256i *H = (__m256i*)final_hash;
const __m256i *h = (const __m256i*)midhash;
const __m256i *v= (const __m256i*)midstate;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
__m256i M0, M1, M2, M3, M4, MD, MF;
__m256i MDxorCSC;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
V0 = _mm256_load_si256( (__m256i*)midstate + 0 );
V1 = _mm256_load_si256( (__m256i*)midstate + 1 );
V2 = _mm256_load_si256( (__m256i*)midstate + 2 );
V3 = _mm256_load_si256( (__m256i*)midstate + 3 );
V4 = _mm256_load_si256( (__m256i*)midstate + 4 );
V5 = _mm256_load_si256( (__m256i*)midstate + 5 );
V6 = _mm256_load_si256( (__m256i*)midstate + 6 );
V7 = _mm256_load_si256( (__m256i*)midstate + 7 );
V8 = _mm256_load_si256( (__m256i*)midstate + 8 );
V9 = _mm256_load_si256( (__m256i*)midstate + 9 );
VA = _mm256_load_si256( (__m256i*)midstate + 10 );
VB = _mm256_load_si256( (__m256i*)midstate + 11 );
VC = _mm256_load_si256( (__m256i*)midstate + 12 );
VD = _mm256_load_si256( (__m256i*)midstate + 13 );
VE = _mm256_load_si256( (__m256i*)midstate + 14 );
VF = _mm256_load_si256( (__m256i*)midstate + 15 );
M0 = casti_m256i( data, 0 );
M1 = casti_m256i( data, 1 );
M2 = casti_m256i( data, 2 );
M3 = casti_m256i( data, 3 );
M4 = casti_m256i( data, 4 );
M5 = casti_m256i( data, 5 );
M6 = casti_m256i( data, 6 );
M7 = casti_m256i( data, 7 );
M8 = casti_m256i( data, 8 );
M9 = casti_m256i( data, 9 );
MA = casti_m256i( data, 10 );
MB = casti_m256i( data, 11 );
MC = casti_m256i( data, 12 );
MD = casti_m256i( data, 13 );
ME = casti_m256i( data, 14 );
MF = casti_m256i( data, 15 );
M0 = _mm256_load_si256( (__m256i*)data + 0 );
M1 = _mm256_load_si256( (__m256i*)data + 1 );
M2 = _mm256_load_si256( (__m256i*)data + 2 );
M3 = _mm256_load_si256( (__m256i*)data + 3 );
M4 = _mm256_load_si256( (__m256i*)data + 4 );
// M5 to MC & ME are zero padding and optimised out.
MD = _mm256_load_si256( (__m256i*)data + 13 );
MF = _mm256_load_si256( (__m256i*)data + 15 );
// precalculated MD^CSC, used in round0 G6.
MDxorCSC = _mm256_load_si256( (__m256i*)data + 5 );
// Finish round 0
// Finish round 0 with nonce in M3
// G1
V1 = _mm256_add_epi32( V1,
_mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
@@ -947,20 +1227,29 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
_mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
_mm256_set1_epi32( CS8 ) ) );
VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
// G5
GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
_mm256_set1_epi32( CSB ) );
VC = mm256_swap32_16( _mm256_xor_si256( VC, V1 ) );
VB = _mm256_add_epi32( VB, VC );
V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 12 );
V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
_mm256_set1_epi32( CSA ) );
VC = mm256_ror_32( _mm256_xor_si256( VC, V1 ), 8 );
VB = _mm256_add_epi32( VB, VC );
V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 7 );
// G6
VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
_mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
V2 = _mm256_add_epi32( V2, _mm256_add_epi32( V7, MDxorCSC ) );
VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
@@ -974,19 +1263,19 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
ROUND_S_8WAY( 4 );
ROUND_S_8WAY( 5 );
ROUND_S_8WAY( 6 );
ROUND_S_8WAY( 7 );
ROUND_S_8WAY( 8 );
ROUND_S_8WAY( 9 );
ROUND_S_8WAY( 0 );
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
ROUND256_8WAY_1;
ROUND256_8WAY_2;
ROUND256_8WAY_3;
ROUND256_8WAY_4;
ROUND256_8WAY_5;
ROUND256_8WAY_6;
ROUND256_8WAY_7;
ROUND256_8WAY_8;
ROUND256_8WAY_9;
ROUND256_8WAY_0;
ROUND256_8WAY_1;
ROUND256_8WAY_2;
ROUND256_8WAY_3;
const __m256i shuf_bswap32 =
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
@@ -1010,6 +1299,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
//
// Blake-256 16 way AVX512
// Generic with full inline message expansion
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
@@ -1036,6 +1326,257 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
}
// Shortcut message expansion when the message data is known to be zero.
// M[ 5:12, 14 ] are zero padding for the second block of 80-byte data.
#define G256_16WAY_ALT( a, b, c, d, m0, m1 ) \
{ \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m0 ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi32( c, d ); \
b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m1 ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
c = _mm512_add_epi32( c, d ); \
b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
}
// Message expansion optimized for each round.
#define ROUND256_16WAY_0 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS1 ) ), \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS0 ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS3 ) ), \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS2 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS5 ) ), \
_mm512_set1_epi32( CS4 ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CS7 ) , \
_mm512_set1_epi32( CS6 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS9 ) , \
_mm512_set1_epi32( CS8 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSB ) , \
_mm512_set1_epi32( CSA ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CSD ) , \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CSF ) , \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSE ) ) ); \
}
#define ROUND256_16WAY_1 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CSA ) , \
_mm512_set1_epi32( CSE ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS8 ) ), \
_mm512_set1_epi32( CS4 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CSF ) , \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS9 ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS6 ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CS1 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS2 ) ), \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS0 ) ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS7 ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS5 ) ) ); \
}
#define ROUND256_16WAY_2 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS8 ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS0 ) , \
_mm512_xor_si512( M0, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS5 ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSD ) ), \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSF ) ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CSA ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS6 ) ), \
_mm512_set1_epi32( CS3 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS1 ) , \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS7 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS4 ) , \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS9 ) ) ); \
}
#define ROUND256_16WAY_3 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS9 ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS1 ) ), \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS3 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS6 ) ), \
_mm512_set1_epi32( CS2 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSA ) , \
_mm512_set1_epi32( CS5 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS0 ) ), \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS4 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS8 ) ), \
_mm512_set1_epi32( CSF ) ); \
}
#define ROUND256_16WAY_4 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS0 ) , \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS9 ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS7 ) , \
_mm512_set1_epi32( CS5 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS4 ) ), \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS2 ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CSF ) , \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSA ) ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS1 ) , \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSE ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSC ) , \
_mm512_set1_epi32( CSB ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS8 ) , \
_mm512_set1_epi32( CS6 ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CSD ) ), \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS3 ) ) ); \
}
#define ROUND256_16WAY_5 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CS2 ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CSA ) , \
_mm512_set1_epi32( CS6 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CSB ) ), \
_mm512_set1_epi32( CS0 ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS8 ) ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CSD ) ), \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS4 ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CS5 ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSE ) ), \
_mm512_set1_epi32( CSF ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS9 ) ), \
_mm512_set1_epi32( CS1 ) ); \
}
#define ROUND256_16WAY_6 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS5 ) , \
_mm512_set1_epi32( CSC ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSF ) ), \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS1 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CSD ) , \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSE ) ) );\
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M4, _mm512_set1_epi32( CSA ) ), \
_mm512_set1_epi32( CS4 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS7 ) ), \
_mm512_set1_epi32( CS0 ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS6 ) ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CS9 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CSB ) , \
_mm512_set1_epi32( CS8 ) ); \
}
#define ROUND256_16WAY_7 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CSB ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS1 ) , \
_mm512_xor_si512( M1, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CS9 ) ), \
_mm512_set1_epi32( CS3 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS0 ) , \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS5 ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS4 ) ), \
_mm512_xor_si512( M4, _mm512_set1_epi32( CSF ) ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, _mm512_set1_epi32( CS6 ) , \
_mm512_set1_epi32( CS8 ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSA ) ), \
_mm512_set1_epi32( CS2 ) ); \
}
#define ROUND256_16WAY_8 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CSF ), \
_mm512_xor_si512( MF, _mm512_set1_epi32( CS6 ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS9 ) , \
_mm512_set1_epi32( CSE ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS3 ) , \
_mm512_xor_si512( M3, _mm512_set1_epi32( CSB ) ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M0, _mm512_set1_epi32( CS8 ) ), \
_mm512_set1_epi32( CS0 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSC ) ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS7 ) ), \
_mm512_set1_epi32( CSD ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS4 ) ), \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS1 ) ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, _mm512_set1_epi32( CS5 ) , \
_mm512_set1_epi32( CSA ) ); \
}
#define ROUND256_16WAY_9 \
{ \
G256_16WAY_ALT( V0, V4, V8, VC, _mm512_set1_epi32( CS2 ) , \
_mm512_xor_si512( M2, _mm512_set1_epi32( CSA ) ) ); \
G256_16WAY_ALT( V1, V5, V9, VD, _mm512_set1_epi32( CS4 ) , \
_mm512_xor_si512( M4, _mm512_set1_epi32( CS8 ) ) ); \
G256_16WAY_ALT( V2, V6, VA, VE, _mm512_set1_epi32( CS6 ) , \
_mm512_set1_epi32( CS7 ) ); \
G256_16WAY_ALT( V3, V7, VB, VF, \
_mm512_xor_si512( M1, _mm512_set1_epi32( CS5 ) ), \
_mm512_set1_epi32( CS1 ) ); \
G256_16WAY_ALT( V0, V5, VA, VF, \
_mm512_xor_si512( MF, _mm512_set1_epi32( CSB ) ), \
_mm512_set1_epi32( CSF ) ); \
G256_16WAY_ALT( V1, V6, VB, VC, _mm512_set1_epi32( CSE ) , \
_mm512_set1_epi32( CS9 ) ); \
G256_16WAY_ALT( V2, V7, V8, VD, \
_mm512_xor_si512( M3, _mm512_set1_epi32( CSC ) ), \
_mm512_set1_epi32( CS3 ) ); \
G256_16WAY_ALT( V3, V4, V9, VE, \
_mm512_xor_si512( MD, _mm512_set1_epi32( CS0 ) ), \
_mm512_xor_si512( M0, _mm512_set1_epi32( CSD ) ) ); \
}
#define DECL_STATE32_16WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
sph_u32 T0, T1;
@@ -1208,9 +1749,9 @@ do { \
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
void *data )
{
const __m512i *M = (const __m512i*)data;
__m512i *M = (__m512i*)data;
__m512i *V = (__m512i*)midstate;
const __m512i *H = (const __m512i*)midhash;
@@ -1231,10 +1772,21 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
V[14] = m512_const1_32( CS6 );
V[15] = m512_const1_32( CS7 );
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
// M[ 5:12, 14 ] are always zero and not needed or used, except M[5] as noted.
// M[ 4], M[ 13], M[15] are constant and are initialized here.
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
M[ 4] = m512_const1_32( 0x80000000 );
M[13] = m512_one_32;
M[15] = m512_const1_32( 80*8 );
M[ 5] =_mm512_xor_si512( M[13], _mm512_set1_epi32( CSC ) );
// G0
GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
// G1, nonce is in M[3]
// G1
// GS_16WAY( M[ 2], M[ 3], CS2, CS3, V1, V5, V9, VD );
V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ),
_mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) );
@@ -1243,14 +1795,35 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G2
// GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 6] ),
_mm512_xor_si512( _mm512_set1_epi32( CS5 ), M[ 4] ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 2] ), 16 );
V[10] = _mm512_add_epi32( V[10], V[14] );
V[ 6] = mm512_ror_32( _mm512_xor_si512( V[ 6], V[10] ), 12 );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 6] ),
_mm512_set1_epi32( CS4 ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 2] ), 8 );
V[10] = _mm512_add_epi32( V[10], V[14] );
V[ 6] = mm512_ror_32( _mm512_xor_si512( V[ 6], V[10] ), 7 );
// G3
// GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 7] ),
_mm512_set1_epi32( CS7 ) );
V[15] = mm512_ror_32( _mm512_xor_si512( V[15], V[ 3] ), 16 );
V[11] = _mm512_add_epi32( V[11], V[15] );
V[ 7] = mm512_ror_32( _mm512_xor_si512( V[ 7], V[11] ), 12 );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 7] ),
_mm512_set1_epi32( CS6 ) );
V[15] = mm512_ror_32( _mm512_xor_si512( V[15], V[ 3] ), 8 );
V[11] = _mm512_add_epi32( V[11], V[15] );
V[ 7] = mm512_ror_32( _mm512_xor_si512( V[ 7], V[11] ), 7 );
// G4
// GS_16WAY( M[ 8], M[ 9], CS8, CS9, V0, V5, VA, VF );
V[ 0] = _mm512_add_epi32( V[ 0],
_mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) );
V[ 0] = _mm512_add_epi32( V[ 0], _mm512_set1_epi32( CS9 ) );
// G5
// GS_16WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
@@ -1258,11 +1831,11 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
// G6
// GS_16WAY( M[12], M[13], CSC, CSD, V2, V7, V8, VD );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 7] ),
_mm512_xor_si512( _mm512_set1_epi32( CSD ), M[12] ) );
_mm512_set1_epi32( CSD ) );
// G7
// GS_16WAY( M[14], M[15], CSE, CSF, V3, V4, V9, VE );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 4] ),
_mm512_xor_si512( _mm512_set1_epi32( CSF ), M[14] ) );
_mm512_set1_epi32( CSF ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 3] ), 16 );
V[ 3] = _mm512_add_epi32( V[ 3],
_mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
@@ -1273,45 +1846,38 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
{
__m512i *H = (__m512i*)final_hash;
const __m512i *h = (const __m512i*)midhash;
const __m512i *v= (const __m512i*)midstate;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i M0, M1, M2, M3, M4, MD, MF;
__m512i MDxorCSC;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
V0 = _mm512_load_si512( (__m512i*)midstate + 0 );
V1 = _mm512_load_si512( (__m512i*)midstate + 1 );
V2 = _mm512_load_si512( (__m512i*)midstate + 2 );
V3 = _mm512_load_si512( (__m512i*)midstate + 3 );
V4 = _mm512_load_si512( (__m512i*)midstate + 4 );
V5 = _mm512_load_si512( (__m512i*)midstate + 5 );
V6 = _mm512_load_si512( (__m512i*)midstate + 6 );
V7 = _mm512_load_si512( (__m512i*)midstate + 7 );
V8 = _mm512_load_si512( (__m512i*)midstate + 8 );
V9 = _mm512_load_si512( (__m512i*)midstate + 9 );
VA = _mm512_load_si512( (__m512i*)midstate + 10 );
VB = _mm512_load_si512( (__m512i*)midstate + 11 );
VC = _mm512_load_si512( (__m512i*)midstate + 12 );
VD = _mm512_load_si512( (__m512i*)midstate + 13 );
VE = _mm512_load_si512( (__m512i*)midstate + 14 );
VF = _mm512_load_si512( (__m512i*)midstate + 15 );
M0 = casti_m512i( data, 0 );
M1 = casti_m512i( data, 1 );
M2 = casti_m512i( data, 2 );
M3 = casti_m512i( data, 3 );
M4 = casti_m512i( data, 4 );
M5 = casti_m512i( data, 5 );
M6 = casti_m512i( data, 6 );
M7 = casti_m512i( data, 7 );
M8 = casti_m512i( data, 8 );
M9 = casti_m512i( data, 9 );
MA = casti_m512i( data, 10 );
MB = casti_m512i( data, 11 );
MC = casti_m512i( data, 12 );
MD = casti_m512i( data, 13 );
ME = casti_m512i( data, 14 );
MF = casti_m512i( data, 15 );
M0 = _mm512_load_si512( (__m512i*)data + 0 );
M1 = _mm512_load_si512( (__m512i*)data + 1 );
M2 = _mm512_load_si512( (__m512i*)data + 2 );
M3 = _mm512_load_si512( (__m512i*)data + 3 );
M4 = _mm512_load_si512( (__m512i*)data + 4 );
// M5 to MC & ME are zero padding and optimised out
MD = _mm512_load_si512( (__m512i*)data + 13 );
MF = _mm512_load_si512( (__m512i*)data + 15 );
// cache for precalculated MD^CSC, used in round0 G6.
MDxorCSC = _mm512_load_si512( (__m512i*)data + 5 );
// Finish round 0 with the nonce (M3) now available
// G0
@@ -1336,21 +1902,30 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 12 );
V0 = _mm512_add_epi32( V0, _mm512_add_epi32( V5,
_mm512_xor_si512( _mm512_set1_epi32( CS8 ), M9 ) ) );
_mm512_set1_epi32( CS8 ) ) );
VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 8 );
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 );
// G5
GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
V1 = _mm512_add_epi32( _mm512_add_epi32( V1, V6 ),
_mm512_set1_epi32( CSB ) );
VC = mm512_ror_32( _mm512_xor_si512( VC, V1 ), 16 );
VB = _mm512_add_epi32( VB, VC );
V6 = mm512_ror_32( _mm512_xor_si512( V6, VB ), 12 );
V1 = _mm512_add_epi32( _mm512_add_epi32( V1, V6 ),
_mm512_set1_epi32( CSA ) );
VC = mm512_ror_32( _mm512_xor_si512( VC, V1 ), 8 );
VB = _mm512_add_epi32( VB, VC );
V6 = mm512_ror_32( _mm512_xor_si512( V6, VB ), 7 );
// G6
// GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 16 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 12 );
V2 = _mm512_add_epi32( _mm512_add_epi32( V2, V7 ),
_mm512_xor_si512( _mm512_set1_epi32( CSC ), MD ) );
V2 = _mm512_add_epi32( V2, _mm512_add_epi32( V7, MDxorCSC ) );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 8 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 7 );
@@ -1364,20 +1939,20 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
V9 = _mm512_add_epi32( V9, VE );
V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
ROUND_S_16WAY( 4 );
ROUND_S_16WAY( 5 );
ROUND_S_16WAY( 6 );
ROUND_S_16WAY( 7 );
ROUND_S_16WAY( 8 );
ROUND_S_16WAY( 9 );
ROUND_S_16WAY( 0 );
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
// Remaining rounds, optimised
ROUND256_16WAY_1;
ROUND256_16WAY_2;
ROUND256_16WAY_3;
ROUND256_16WAY_4;
ROUND256_16WAY_5;
ROUND256_16WAY_6;
ROUND256_16WAY_7;
ROUND256_16WAY_8;
ROUND256_16WAY_9;
ROUND256_16WAY_0;
ROUND256_16WAY_1;
ROUND256_16WAY_2;
ROUND256_16WAY_3;
// Byte swap final hash
const __m512i shuf_bswap32 =

View File

@@ -103,16 +103,16 @@
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_shufl2r_64( V[2], V[3] ); \
V3 = mm128_shufl2r_64( V[3], V[2] ); \
V6 = mm128_shufl2l_64( V[6], V[7] ); \
V7 = mm128_shufl2l_64( V[7], V[6] ); \
V2 = mm128_alignr_64( V[3], V[2] ); \
V3 = mm128_alignr_64( V[2], V[3] ); \
V6 = mm128_alignr_64( V[6], V[7] ); \
V7 = mm128_alignr_64( V[7], V[6] ); \
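/* rotate the V[2]/V[3] and V[6]/V[7] register pairs so the next two G calls mix the diagonals */ \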
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_shufl2l_64( V2, V3 ); \
V[3] = mm128_shufl2l_64( V3, V2 ); \
V[6] = mm128_shufl2r_64( V6, V7 ); \
V[7] = mm128_shufl2r_64( V7, V6 ); \
V[2] = mm128_alignr_64( V2, V3 ); \
V[3] = mm128_alignr_64( V3, V2 ); \
V[6] = mm128_alignr_64( V7, V6 ); \
V[7] = mm128_alignr_64( V6, V7 ); \
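/* rotate the pairs back to column order for the next round */ \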
}
#else

View File

@@ -49,12 +49,11 @@ extern "C"{
#define Sb_8W(x0, x1, x2, x3, c) \
do { \
__m512i cc = _mm512_set1_epi64( c ); \
x3 = mm512_not( x3 ); \
const __m512i cc = _mm512_set1_epi64( c ); \
x0 = mm512_xorandnot( x0, x2, cc ); \
tmp = mm512_xorand( cc, x0, x1 ); \
x0 = mm512_xorand( x0, x2, x3 ); \
x3 = mm512_xorandnot( x3, x1, x2 ); \
x0 = mm512_xorandnot( x0, x3, x2 ); \
x3 = _mm512_ternarylogic_epi64( x3, x1, x2, 0x2d ); /* ~x3 ^ (~x1 & x2) */\
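/* 0x2d is the truth table of ~a ^ (~b & c), indexed by (a<<2)|(b<<1)|c, for (a,b,c) = (x3,x1,x2) */ \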
x1 = mm512_xorand( x1, x0, x2 ); \
x2 = mm512_xorandnot( x2, x3, x0 ); \
x0 = mm512_xoror( x0, x1, x3 ); \
@@ -79,7 +78,7 @@ do { \
#define Sb(x0, x1, x2, x3, c) \
do { \
__m256i cc = _mm256_set1_epi64x( c ); \
const __m256i cc = _mm256_set1_epi64x( c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \

View File

@@ -23,13 +23,26 @@
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#if defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#else
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
@@ -60,13 +73,13 @@
t = _mm_load_si128(&a0);\
a0 = _mm_or_si128(a0,a1);\
a2 = _mm_xor_si128(a2,a3);\
a1 = _mm_andnot_si128(a1,ALLONE);\
a1 = mm128_not( a1 );\
a0 = _mm_xor_si128(a0,a3);\
a3 = _mm_and_si128(a3,t);\
a1 = _mm_xor_si128(a1,a3);\
a3 = _mm_xor_si128(a3,a2);\
a2 = _mm_and_si128(a2,a0);\
a0 = _mm_andnot_si128(a0,ALLONE);\
a0 = mm128_not( a0 );\
a2 = _mm_xor_si128(a2,a1);\
a1 = _mm_or_si128(a1,a3);\
t = _mm_xor_si128(t,a1);\
@@ -242,17 +255,18 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
__m128i CNS128[32];
__m128i ALLONE;
#if !defined(__SSE4_1__)
__m128i MASK;
#endif
HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
{
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
@@ -352,10 +366,10 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
// Optimized for integral multiples of 16 bytes, good for 64 and 80 byte len
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );

View File

@@ -230,25 +230,13 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces, add padding.
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
@@ -425,24 +413,12 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

View File

@@ -120,25 +120,13 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
@@ -240,24 +228,12 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

View File

@@ -711,8 +711,11 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
{
__m256i A, B, C, D, E, F, G, H;
X[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
// W[9:14] are zero, therefore X[9:13] are also zero and not needed.
// Except X[ 9], which is reused to pre-expand part of W[ 0] for the third group.
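// Standard expansion: W[t] = SSG2_1x( W[t-2] ) + W[t-7] + SSG2_0x( W[t-15] ) + W[t-16];
// terms involving W[9:14] == 0 simply drop out of the X[] precalculations below.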
X[ 0] = _mm256_add_epi32( SSG2_0x( W[ 1] ), W[ 0] );
X[ 1] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( W[15] ),
SSG2_0x( W[ 2] ) ), W[ 1] );
X[ 2] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 0] ), W[11] ),
W[ 2] );
X[ 3] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 1] ), W[12] ),
@@ -725,16 +728,12 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
W[ 6] );
X[ 7] = _mm256_add_epi32( _mm256_add_epi32( X[ 0], SSG2_0x( W[ 8] ) ),
W[ 7] );
X[ 8] = _mm256_add_epi32( _mm256_add_epi32( X[ 1], SSG2_0x( W[ 9] ) ),
W[ 8] );
X[ 9] = _mm256_add_epi32( SSG2_0x( W[10] ), W[ 9] );
X[10] = _mm256_add_epi32( SSG2_0x( W[11] ), W[10] );
X[11] = _mm256_add_epi32( SSG2_0x( W[12] ), W[11] );
X[12] = _mm256_add_epi32( SSG2_0x( W[13] ), W[12] );
X[13] = _mm256_add_epi32( SSG2_0x( W[14] ), W[13] );
X[14] = _mm256_add_epi32( SSG2_0x( W[15] ), W[14] );
X[ 8] = _mm256_add_epi32( X[ 1], W[ 8] );
X[14] = SSG2_0x( W[15] );
X[15] = _mm256_add_epi32( SSG2_0x( X[ 0] ), W[15] );
X[ 9] = _mm256_add_epi32( SSG2_0x( X[ 1] ), X[ 0] );
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in + 1 );
C = _mm256_load_si256( state_in + 2 );
@@ -779,10 +778,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
// SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
// SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
// SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( G, H );
#endif
@@ -810,23 +805,36 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
W[ 6] = _mm256_add_epi32( X[ 6], SSG2_1x( W[ 4] ) );
W[ 7] = _mm256_add_epi32( X[ 7], SSG2_1x( W[ 5] ) );
W[ 8] = _mm256_add_epi32( X[ 8], SSG2_1x( W[ 6] ) );
W[ 9] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[ 7] ),
W[ 2] ) );
W[10] = _mm256_add_epi32( X[10], _mm256_add_epi32( SSG2_1x( W[ 8] ),
W[ 3] ) );
W[11] = _mm256_add_epi32( X[11], _mm256_add_epi32( SSG2_1x( W[ 9] ),
W[ 4] ) );
W[12] = _mm256_add_epi32( X[12], _mm256_add_epi32( SSG2_1x( W[10] ),
W[ 5] ) );
W[13] = _mm256_add_epi32( X[13], _mm256_add_epi32( SSG2_1x( W[11] ),
W[ 6] ) );
W[ 9] = _mm256_add_epi32( SSG2_1x( W[ 7] ), W[ 2] );
W[10] = _mm256_add_epi32( SSG2_1x( W[ 8] ), W[ 3] );
W[11] = _mm256_add_epi32( SSG2_1x( W[ 9] ), W[ 4] );
W[12] = _mm256_add_epi32( SSG2_1x( W[10] ), W[ 5] );
W[13] = _mm256_add_epi32( SSG2_1x( W[11] ), W[ 6] );
W[14] = _mm256_add_epi32( X[14], _mm256_add_epi32( SSG2_1x( W[12] ),
W[ 7] ) );
W[15] = _mm256_add_epi32( X[15], _mm256_add_epi32( SSG2_1x( W[13] ),
W[ 8] ) );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x8_MSG_EXPANSION( W );
W[ 0] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[14] ),
W[ 9] ) );
W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] );
W[13] = SHA2x_MEXP( W[11], W[ 6], W[14], W[13] );
W[14] = SHA2x_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA2x_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x8_MSG_EXPANSION( W );
SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -1201,9 +1209,13 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
{
__m512i A, B, C, D, E, F, G, H;
// precalculate constant part msg expansion for second iteration.
X[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
// X is pre-expanded constant part of msg for second group, rounds 16 to 31.
// W[9:14] are zero, therefore X[9:13] are also zero and not needed.
// Except X[ 9] which is used to pre-expand part of W[ 0] from the third
// group, rounds 32 to 47.
X[ 0] = _mm512_add_epi32( SSG2_0x16( W[ 1] ), W[ 0] );
X[ 1] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( W[15] ),
SSG2_0x16( W[ 2] ) ), W[ 1] );
X[ 2] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 0] ), W[11] ),
W[ 2] );
X[ 3] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 1] ), W[12] ),
@@ -1216,16 +1228,12 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
W[ 6] );
X[ 7] = _mm512_add_epi32( _mm512_add_epi32( X[ 0], SSG2_0x16( W[ 8] ) ),
W[ 7] );
X[ 8] = _mm512_add_epi32( _mm512_add_epi32( X[ 1], SSG2_0x16( W[ 9] ) ),
W[ 8] );
X[ 9] = _mm512_add_epi32( SSG2_0x16( W[10] ), W[ 9] );
X[10] = _mm512_add_epi32( SSG2_0x16( W[11] ), W[10] );
X[11] = _mm512_add_epi32( SSG2_0x16( W[12] ), W[11] );
X[12] = _mm512_add_epi32( SSG2_0x16( W[13] ), W[12] );
X[13] = _mm512_add_epi32( SSG2_0x16( W[14] ), W[13] );
X[14] = _mm512_add_epi32( SSG2_0x16( W[15] ), W[14] );
X[ 8] = _mm512_add_epi32( X[ 1], W[ 8] );
X[14] = SSG2_0x16( W[15] );
X[15] = _mm512_add_epi32( SSG2_0x16( X[ 0] ), W[15] );
X[ 9] = _mm512_add_epi32( SSG2_0x16( X[ 1] ), X[ 0] );
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in + 1 );
C = _mm512_load_si512( state_in + 2 );
@@ -1280,7 +1288,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
// update precalculated msg expansion with new nonce: W[3].
// inject nonce, W[3], to complete msg expansion.
W[ 0] = X[ 0];
W[ 1] = X[ 1];
W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
@@ -1290,23 +1298,36 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
W[ 6] = _mm512_add_epi32( X[ 6], SSG2_1x16( W[ 4] ) );
W[ 7] = _mm512_add_epi32( X[ 7], SSG2_1x16( W[ 5] ) );
W[ 8] = _mm512_add_epi32( X[ 8], SSG2_1x16( W[ 6] ) );
W[ 9] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[ 7] ),
W[ 2] ) );
W[10] = _mm512_add_epi32( X[10], _mm512_add_epi32( SSG2_1x16( W[ 8] ),
W[ 3] ) );
W[11] = _mm512_add_epi32( X[11], _mm512_add_epi32( SSG2_1x16( W[ 9] ),
W[ 4] ) );
W[12] = _mm512_add_epi32( X[12], _mm512_add_epi32( SSG2_1x16( W[10] ),
W[ 5] ) );
W[13] = _mm512_add_epi32( X[13], _mm512_add_epi32( SSG2_1x16( W[11] ),
W[ 6] ) );
W[ 9] = _mm512_add_epi32( SSG2_1x16( W[ 7] ), W[ 2] );
W[10] = _mm512_add_epi32( SSG2_1x16( W[ 8] ), W[ 3] );
W[11] = _mm512_add_epi32( SSG2_1x16( W[ 9] ), W[ 4] );
W[12] = _mm512_add_epi32( SSG2_1x16( W[10] ), W[ 5] );
W[13] = _mm512_add_epi32( SSG2_1x16( W[11] ), W[ 6] );
W[14] = _mm512_add_epi32( X[14], _mm512_add_epi32( SSG2_1x16( W[12] ),
W[ 7] ) );
W[15] = _mm512_add_epi32( X[15], _mm512_add_epi32( SSG2_1x16( W[13] ),
W[ 8] ) );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x16_MSG_EXPANSION( W );
W[ 0] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[14] ),
W[ 9] ) );
W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] );
W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] );
W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x16_MSG_EXPANSION( W );
SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -1336,8 +1357,8 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
{
__m512i A, B, C, D, E, F, G, H;
__m512i W[16]; memcpy_512( W, data, 16 );
// Value for H at round 60, before adding K, to produce valid final hash
//where H == 0.
// Value for H at round 60, before adding K, needed to produce valid final
// hash where H == 0.
// H_ = -( H256[7] + K256[60] );
const __m512i H_ = m512_const1_32( 0x136032ED );
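// Worked check of the constant above, assuming the standard SHA-256 tables:
//    H256[7] = 0x5be0cd19,  K256[60] = 0x90befffa
//    H256[7] + K256[60]     = 0xec9fcd13
//    -(0xec9fcd13) mod 2^32 = 0x136032ed
// which matches the value loaded into H_, consistent with the formula
// H_ = -( H256[7] + K256[60] ) in the comment.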

View File

@@ -383,11 +383,17 @@ static const m512_v16 FFT256_Twiddle4w[] =
#define shufxor4w(x,s) _mm512_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
_mm512_srai_epi16( x, 8 ) )
/*
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \
0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) )
*/
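// Scalar model (illustrative, not from this file) of what both REDUCE4w
// forms compute per signed 16-bit lane: (x & 0xff) - (x >> 8), which is
// congruent to x mod 257 because 2^8 == -1 (mod 257), the field assumed by
// the SIMD hash NTT. The maskz_mov_epi8 with mask 0x5555... keeps only the
// even (low) bytes, i.e. the same effect as the AND with 0x00ff above.
#include <stdint.h>
static inline int16_t reduce257_sketch( int16_t x )
{  // arithmetic shift assumed, matching _mm512_srai_epi16
   return (int16_t)( ( x & 0xff ) - ( x >> 8 ) );
}
// e.g. reduce257_sketch( 257 ) == 0 and reduce257_sketch( -1 ) == 256,
// both congruent to the input mod 257.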
#define EXTRA_REDUCE_S4w(x)\
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
m512_const1_64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
@@ -431,10 +437,6 @@ void fft64_4way( void *a )
// Unrolled decimation in frequency (DIF) radix-2 NTT.
// Output data is in revbin_permuted order.
static const int w[] = {0, 2, 4, 6};
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
// targeted
#define BUTTERFLY_0( i,j ) \
do { \
@@ -443,25 +445,25 @@ do { \
X(i) = _mm512_sub_epi16( X(i), v ); \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i v = X(j); \
X(j) = _mm512_add_epi16( X(i), X(j) ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w[n] ); \
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
} while(0)
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE( 2 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 1 );
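// Scalar model (illustrative) of the butterflies above: a DIF radix-2
// butterfly maps ( x, y ) -> ( x + y, (x - y) * w ). The twiddle multiply
// is a left shift because, in the Z/257 field assumed by the SIMD hash,
// the twiddles here are powers of two (2 has order 16 mod 257); the third
// macro argument is now the shift amount itself (2, 4, 6) instead of an
// index into the w[] table, which is why the call sites changed.
static inline void dif_butterfly_sketch( int *x, int *y, int w_shift )
{
   int v = *y;
   *y = *x + v;                       // later reduced mod 257
   *x = ( *x - v ) << w_shift;        // (x - y) * 2^w, also reduced later
}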
@@ -501,12 +503,11 @@ do { \
// Transpose the FFT state with a revbin order permutation
// on the rows and the column.
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
#define INTERLEAVE( i, j ) \
do { \
__m512i t1= X(i); \
__m512i t2= X(j); \
X(i) = _mm512_unpacklo_epi16( t1, t2 ); \
X(j) = _mm512_unpackhi_epi16( t1, t2 ); \
__m512i u = X(j); \
X(j) = _mm512_unpackhi_epi16( X(i), X(j) ); \
X(i) = _mm512_unpacklo_epi16( X(i), u ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -534,10 +535,10 @@ do { \
} while(0)
#define BUTTERFLY_N( i,j,n ) \
#define BUTTERFLY_N( i, j, w ) \
do { \
__m512i u = X(j); \
X(i) = _mm512_slli_epi16( X(i), w[n] ); \
X(i) = _mm512_slli_epi16( X(i), w ); \
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
X(i) = _mm512_add_epi16( u, X(i) ); \
} while(0)
@@ -558,15 +559,15 @@ do { \
BUTTERFLY_0( 0, 2 );
BUTTERFLY_0( 4, 6 );
BUTTERFLY_N( 1, 3, 2 );
BUTTERFLY_N( 5, 7, 2 );
BUTTERFLY_N( 1, 3, 4 );
BUTTERFLY_N( 5, 7, 4 );
DO_REDUCE( 3 );
BUTTERFLY_0( 0, 4 );
BUTTERFLY_N( 1, 5, 1 );
BUTTERFLY_N( 2, 6, 2 );
BUTTERFLY_N( 3, 7, 3 );
BUTTERFLY_N( 1, 5, 2 );
BUTTERFLY_N( 2, 6, 4 );
BUTTERFLY_N( 3, 7, 6 );
DO_REDUCE_FULL_S4w( 0 );
DO_REDUCE_FULL_S4w( 1 );
@@ -599,7 +600,6 @@ void fft128_4way( void *a )
// Temp space to help for interleaving in the end
__m512i B[8];
__m512i *A = (__m512i*) a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
/* Size-2 butterflies */
for ( i = 0; i<8; i++ )
@@ -633,7 +633,6 @@ void fft128_4way_msg( uint16_t *a, const uint8_t *x, int final )
__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT128_Twiddle;
#define UNPACK( i ) \
do { \
@@ -686,7 +685,6 @@ void fft256_4way_msg( uint16_t *a, const uint8_t *x, int final )
__m512i *X = (__m512i*)x;
__m512i *A = (__m512i*)a;
// __m256i *Twiddle = (__m256i*)FFT256_Twiddle;
#define UNPACK( i ) \
do { \
@@ -776,109 +774,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
// We split the round function in two halves
// so as to insert some independent computations in between.
// generic
#if 0
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = shufxor( a##l, 1 ); \
d##h = shufxor( a##h, 1 ); \
} while(0)
#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = shufxor( a##h, 2 ); \
d##h = shufxor( a##l, 2 ); \
} while(0)
#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = shufxor( a##l, 2 ); \
d##h = shufxor( a##h, 2 ); \
} while(0)
#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = shufxor( a##l, 3 ); \
d##h = shufxor( a##h, 3 ); \
} while(0)
#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = shufxor( a##h, 1 ); \
d##h = shufxor( a##l, 1 ); \
} while(0)
#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = shufxor( a##h, 3 ); \
d##h = shufxor( a##l, 3 ); \
} while(0)
#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#endif
// targeted
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \

View File

@@ -18,6 +18,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/yespower/yespower.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -31,6 +32,9 @@
// Config
#define MINOTAUR_ALGO_COUNT 16
static const yespower_params_t minotaurx_yespower_params =
{ YESPOWER_1_0, 2048, 8, "et in arcadia ego", 17 };
typedef struct TortureNode TortureNode;
typedef struct TortureGarden TortureGarden;
@@ -59,20 +63,22 @@ struct TortureGarden
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;
struct TortureNode {
struct TortureNode
{
unsigned int algo;
TortureNode *child[2];
} nodes[22];
} __attribute__ ((aligned (64)));
// Get a 64-byte hash for the given 64-byte input, using the given TortureGarden context and algo index.
static void get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo )
static int get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;
switch (algo) {
switch ( algo )
{
case 0:
sph_blake512_init(&garden->blake);
sph_blake512(&garden->blake, input, 64);
@@ -164,9 +170,13 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
sph_whirlpool(&garden->whirlpool, input, 64);
sph_whirlpool_close(&garden->whirlpool, hash);
break;
case 16: // minotaurx only: yespower is hardcoded for the last node
rc = yespower_tls( input, 64, &minotaurx_yespower_params,
(yespower_binary_t*)hash, thr_id );
}
memcpy(output, hash, 64);
return rc;
}
static __thread TortureGarden garden;
@@ -219,7 +229,6 @@ bool initialize_torture_garden()
garden.nodes[20].child[1] = &garden.nodes[21];
garden.nodes[21].child[0] = NULL;
garden.nodes[21].child[1] = NULL;
return true;
}
@@ -227,34 +236,41 @@ bool initialize_torture_garden()
int minotaur_hash( void *output, const void *input, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;
// Find initial sha512 hash
sph_sha512_init( &garden.sha512 );
sph_sha512( &garden.sha512, input, 80 );
sph_sha512_close( &garden.sha512, hash );
if ( opt_algo != ALGO_MINOTAURX )
{
// Algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce
// if Hamsi is needed, but only the first and last functions are known
// at this point. Abort if either of them is Hamsi.
if ( ( ( hash[ 0] % MINOTAUR_ALGO_COUNT ) == 6 )
|| ( ( hash[21] % MINOTAUR_ALGO_COUNT ) == 6 ) )
return 0;
}
// Assign algos to torture garden nodes based on initial hash
for ( int i = 0; i < 22; i++ )
garden.nodes[i].algo = hash[i] % MINOTAUR_ALGO_COUNT;
// MinotaurX: override the algo for the last node with yespower
if ( opt_algo == ALGO_MINOTAURX )
garden.nodes[21].algo = MINOTAUR_ALGO_COUNT;
// Send the initial hash through the torture garden
TortureNode *node = &garden.nodes[0];
while ( node )
while ( rc && node )
{
get_hash( hash, hash, &garden, node->algo );
rc = get_hash( hash, hash, &garden, node->algo, thr_id );
node = node->child[ hash[63] & 1 ];
}
memcpy( output, hash, 32 );
return 1;
return rc;
}
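// Outline (in comment form) of the per-nonce flow implemented above:
//   1. hash = SHA-512 of the 80-byte header (64 bytes of output).
//   2. nodes[i].algo = hash[i] % MINOTAUR_ALGO_COUNT for i = 0..21;
//      for minotaurx the last node is forced to the extra yespower entry.
//   3. Walk the garden from nodes[0]; after each hash the next node is
//      child[ hash[63] & 1 ], so the path is data dependent.
//   4. The first 32 bytes of the final 64-byte hash are the result.
// Only the first and last algos are known before the walk (as noted above),
// which is why the Hamsi early-abort can test only hash[0] and hash[21].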
int scanhash_minotaur( struct work *work, uint32_t max_nonce,
@@ -291,12 +307,14 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce,
return 0;
}
// The hash function has hooks for minotaurx.
bool register_minotaur_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_minotaur;
gate->hash = (void*)&minotaur_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&initialize_torture_garden;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT;
return true;
};

View File

@@ -1136,10 +1136,14 @@ int yespower(yespower_local_t *local,
ctx.S0 = S;
ctx.S1 = S + Swidth_to_Sbytes1( Swidth );
// Copy the prehash context, then hash the tail.
if ( srclen == 80 ) // assume 64 byte prehash was done
{
memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx );
sha256_update( &sha256_ctx, src+64, srclen-64 );
sha256_final( &sha256_ctx, sha256 );
}
else
sha256_full( sha256, src, srclen );
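// For the 80-byte block-header case the first 64 bytes are constant per
// work item, so a SHA-256 context assumed to have already absorbed that
// first full block (sha256_prehash_ctx) is copied and only the remaining
// 16 bytes, which include the nonce, are hashed here. Other input lengths
// fall back to hashing the whole buffer.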
if ( version == YESPOWER_0_5 )
{

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.3.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.21.0.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.20.3'
PACKAGE_STRING='cpuminer-opt 3.20.3'
PACKAGE_VERSION='3.21.0'
PACKAGE_STRING='cpuminer-opt 3.21.0'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.20.3 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.21.0 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.20.3:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.21.0:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.20.3
cpuminer-opt configure 3.21.0
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.20.3, which was
It was created by cpuminer-opt $as_me 3.21.0, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.20.3'
VERSION='3.21.0'
cat >>confdefs.h <<_ACEOF
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.20.3, which was
This file was extended by cpuminer-opt $as_me 3.21.0, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6784,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.20.3
cpuminer-opt config.status 3.21.0
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.20.3])
AC_INIT([cpuminer-opt], [3.21.0])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -131,10 +131,9 @@ bool opt_verify = false;
static bool opt_stratum_keepalive = false;
static struct timeval stratum_keepalive_timer;
// Stratum servers typically time out after 5 minutes (300 seconds).
#define stratum_keepalive_timeout 180 // 3 minutes
#define stratum_keepalive_timeout 150 // 2.5 minutes
static struct timeval stratum_reset_time;
// pk_buffer_size is used as a version selector by b58 code, therefore
// it must be set correctly to work.
const int pk_buffer_size_max = 26;
@@ -2192,6 +2191,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
} // !quiet
} // new diff/block
/*
if ( new_job && !( opt_quiet || stratum_errors ) )
{
int mismatch = submitted_share_count - ( accepted_share_count
@@ -2202,6 +2202,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
CL_LBL "%d Submitted share pending, maybe stale" CL_N,
submitted_share_count );
}
*/
}
static void *miner_thread( void *userdata )
@@ -2446,8 +2447,8 @@ static void *miner_thread( void *userdata )
{
scale_hash_for_display( &hashrate, hr_units );
sprintf( hr, "%.2f", hashrate );
applog( LOG_INFO, "CPU #%d: %s %sh/s",
thr_id, hr, hr_units );
applog( LOG_INFO, "Thread %d, CPU %d: %s %sh/s",
thr_id, thread_affinity_map[ thr_id ], hr, hr_units );
}
}
@@ -2887,7 +2888,7 @@ static void *stratum_thread(void *userdata )
else
timeval_subtract( &et, &now, &stratum_reset_time );
if ( et.tv_sec > stratum_keepalive_timeout + 60 )
if ( et.tv_sec > stratum_keepalive_timeout + 90 )
{
applog( LOG_NOTICE, "No shares submitted, resetting stratum connection" );
stratum_need_reset = true;

View File

@@ -559,6 +559,7 @@ enum algos {
ALGO_LYRA2Z330,
ALGO_M7M,
ALGO_MINOTAUR,
ALGO_MINOTAURX,
ALGO_MYR_GR,
ALGO_NEOSCRYPT,
ALGO_NIST5,
@@ -652,6 +653,7 @@ static const char* const algo_names[] = {
"lyra2z330",
"m7m",
"minotaur",
"minotaurx",
"myr-gr",
"neoscrypt",
"nist5",
@@ -813,6 +815,7 @@ Options:\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
minotaur\n\
minotaurx\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\

View File

@@ -193,8 +193,17 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Basic operations without equivalent SIMD intrinsic
// Bitwise not (~v)
#if defined(__AVX512VL__)
static inline __m128i mm128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
#else
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
#endif
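// vpternlog evaluates, for each bit position, imm8 bit (a<<2 | b<<1 | c).
// With all three operands equal to v the selector is either 0b000 or 0b111,
// so only imm8 bits 0 and 7 matter; imm8 = 0x01 maps 0 -> 1 and 1 -> 0,
// i.e. bitwise NOT, with no all-ones constant needed. Scalar model
// (illustrative only; selector bit order does not matter when the three
// operands are identical):
#include <stdint.h>
static inline uint64_t ternlog64_model( uint64_t a, uint64_t b, uint64_t c,
                                        uint8_t imm )
{
   uint64_t r = 0;
   for ( int i = 0; i < 64; i++ )
   {
      unsigned idx = (unsigned)( ( ( a >> i ) & 1 ) << 2
                               | ( ( b >> i ) & 1 ) << 1
                               |   ( ( c >> i ) & 1 ) );
      r |= (uint64_t)( ( imm >> idx ) & 1 ) << i;
   }
   return r;   // ternlog64_model( v, v, v, 0x01 ) == ~v
}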
// Unary negation of elements (-v)
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
@@ -439,7 +448,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
// half is always taken from v1, and the high half from v2.
#define mm128_shuffle2_64( v1, v2, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
_mm_castsi128_pd( v2 ), c ) );
@@ -600,9 +609,6 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#endif // SSSE3 else SSE2
//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.
// Swap 128 bit vectors.
// This should be avoided; it's more efficient to switch references.
#define mm128_swap256_128( v1, v2 ) \
@@ -611,61 +617,23 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v1 = _mm_xor_si128( v1, v2 );
// Two input shuffle-rotate.
// Concatenate v1 & v2 and byte rotate as a 256 bit vector.
// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, i.e. the updated v1.
// alignr for 32 & 64 bit elements is only available with AVX512 but
// emulated here. Shift argument is not needed, it's always 1.
// Behaviour is otherwise consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
/*
#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 )
#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 )
#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 )
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 1 )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 1 )
*/
#define mm128_alignr_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
#define mm128_alignr_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#else
#define mm128_shufl2r_64( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) )
#define mm128_shufl2l_64( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 8 ), \
#define mm128_alignr_64( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) )
/*
#define mm128_shufl2r_32( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 4 ), \
_mm_slli_si128( v2, 12 ) )
#define mm128_shufl2l_32( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 12 ) )
#define mm128_alignr_32( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 4 ) )
#define mm128_shufl2r_16( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 2 ), \
_mm_slli_si128( v2, 14 ) )
#define mm128_shufl2l_16( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) )
#define mm128_shufl2r_8( v1, v2 ) \
_mm_or_si128( _mm_srli_si128( v1, 1 ), \
_mm_slli_si128( v2, 15 ) )
#define mm128_shufl2l_8( v1, v2 ) \
_mm_or_si128( _mm_slli_si128( v1, 1 ), \
_mm_srli_si128( v2, 15 ) )
*/
#endif
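// Sanity sketch (illustrative) for the 64-bit alignr above: with
// v1 = {a1,a0} and v2 = {b1,b0} (high,low 64-bit lanes), both the SSSE3
// _mm_alignr_epi8( v1, v2, 8 ) form and the SSE2 or/slli/srli emulation
// return {a0,b1}, the middle 128 bits of the concatenation v1:v2.
#include <emmintrin.h>
static inline __m128i alignr64_sse2_sketch( __m128i v1, __m128i v2 )
{
   return _mm_or_si128( _mm_slli_si128( v1, 8 ),    // moves a0 to the high lane
                        _mm_srli_si128( v2, 8 ) );  // moves b1 to the low lane
}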
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
@@ -689,50 +657,6 @@ do { \
v1 = t; \
} while(0)
/*
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
v1 = t; \
} while(0)
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
v1 = t; \
} while(0)
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
v1 = t; \
} while(0)
*/
#else // SSE2
#define mm128_vror256_64( v1, v2 ) \
@@ -752,61 +676,7 @@ do { \
_mm_srli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
/*
#define mm128_vror256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
_mm_slli_si128( v2, 12 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 4 ), \
_mm_slli_si128( v1, 12 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 12 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 4 ), \
_mm_srli_si128( v1, 12 ) ); \
v1 = t; \
} while(0)
#define mm128_vror256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
_mm_slli_si128( v2, 14 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 2 ), \
_mm_slli_si128( v1, 14 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 2 ), \
_mm_srli_si128( v1, 14 ) ); \
v1 = t; \
} while(0)
#define mm128_vror256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
_mm_slli_si128( v2, 15 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 1 ), \
_mm_slli_si128( v1, 15 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
_mm_srli_si128( v2, 15 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 1 ), \
_mm_srli_si128( v1, 15 ) ); \
v1 = t; \
} while(0)
*/
#endif // SSE4.1 else SSE2
#endif // __SSE2__

View File

@@ -15,14 +15,13 @@
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary.
// Some usage may have the index vector encoded as if full vector
// shuffles are supported. This has no side effects and would have the same
// results using either version.
// If the need arises and AVX512VL is available, 256 bit full vector shuffles
// can be implemented using the AVX512 zero-mask feature with a NULL mask.
// Using intrinsics it's simple: _mm256_maskz_shuffle_epi8( 0, v, c )
// With asm it's a bit more complicated with the addition of the mask register
// and zero tag: vpshufb ymm0{k0}{z}, ymm1, ymm2
// Instructions that can move data across the 128 bit lane boundary incur a
// performance penalty compared to those that can't.
// Some usage of index vectors may be encoded as if full vector shuffles are
// supported. This has no side effects and would have the same results using
// either version.
// If the need arises and AVX512VL is available, 256 bit full vector byte
// shuffles can be implemented using the AVX512 mask feature with a NULL mask.
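// Example of the lane restriction described above (illustrative helper,
// not a definition from this header): the index pattern 0..15 is repeated
// for both 128-bit lanes because _mm256_shuffle_epi8 cannot move a byte
// across the lane boundary.
#include <immintrin.h>
static inline __m256i bswap64_in_lanes_sketch( const __m256i v )
{
   const __m256i idx = _mm256_set_epi8(
         8, 9,10,11,12,13,14,15,  0, 1, 2, 3, 4, 5, 6, 7,
         8, 9,10,11,12,13,14,15,  0, 1, 2, 3, 4, 5, 6, 7 );
   return _mm256_shuffle_epi8( v, idx );   // byte-swaps each 64-bit element
}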
#if defined(__AVX__)
@@ -141,7 +140,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Basic operations without SIMD equivalent
// Bitwise not ( ~v )
#if defined(__AVX512VL__)
static inline __m256i mm256_not( const __m256i v )

View File

@@ -37,13 +37,21 @@
// version of this specific instruction does not.
//
// New alignr instructions for epi64 and epi32 operate across the entire
// vector. "_mm512_alignr_epi8" continues to be restricted to 128 bit lanes.
// vector, but are slower than epi8, which continues to be restricted to
// 128 bit lanes.
//
// "_mm512_permutexvar_epi8" and "_mm512_permutex2var_epi8" require
// AVX512-VBMI. The same instructions with larger elements don't have this
// requirement. "_mm512_permutexvar_epi8" also performs the same operation
// as "_mm512_shuffle_epi8" which only requires AVX512-BW.
//
// Two coding conventions are used to prevent macro argument side effects:
// - if a macro arg is used in an expression it must be protected by
// parentheses to ensure an expression argument is evaluated first.
// - if an argument is to be referenced multiple times a C inline function
// should be used instead of a macro to prevent an expression argument
// from being evaluated multiple times (see the sketch after this comment
// block).
//
// There are 2 areas where overhead is a major concern: constants and
// permutations.
//
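// Illustration of the second convention (hypothetical names, not from this
// header): a macro that names its argument twice evaluates an expression
// argument twice, while a static inline function evaluates it once.
#include <stdint.h>
#define BAD_ROR32( x, c )  ( ( (x) >> (c) ) | ( (x) << ( 32 - (c) ) ) )
static inline uint32_t good_ror32( uint32_t x, int c )
{  return ( x >> c ) | ( x << ( 32 - c ) );  }
// BAD_ROR32( *p++, 7 ) advances p twice; good_ror32( *p++, 7 ) advances it once.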
@@ -184,7 +192,6 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
// Basic operations without SIMD equivalent
// Bitwise NOT: ~x
// #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
static inline __m512i mm512_not( const __m512i x )
{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
@@ -295,7 +302,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_nand( a, b ) \
_mm512_ternarylogic_epi64( a, b, b, 0xef )
/*
// Diagonal blending
// Blend 8 64 bit elements from 8 vectors
#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
@@ -313,6 +320,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
_mm512_mask_blend_epi32( 0x3333, \
_mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
_mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
*/
/*
//
@@ -374,6 +382,19 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
/*
#if defined(__AVX512VBMI2__)
// Use C inline function in case arg is coded as an expression.
static inline __m512i mm512_ror_16( __m512i v, int c )
{ return _mm512_shrdi_epi16( v, v, c ); }
static inline __m512i mm512_rol_16( __m512i v, int c )
{ return _mm512_shldi_epi16( v, v, c ); }
#endif
*/
//
// Reverse byte order of packed elements, vectorized endian conversion.
@@ -518,7 +539,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Rotate elements within 256 bit lanes of 512 bit vector.
// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
@@ -623,22 +643,5 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
/*
// 2 input, 1 output
// Concatenate { v1, v2 } then rotate right or left and return the high
// 512 bits, ie rotated v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
#define mm512_shufl2r_128( v1, v2 ) _mm512_alignr_epi64( v2, v1, 2 )
#define mm512_shufl2l_128( v1, v2 ) _mm512_alignr_epi64( v1, v2, 2 )
#define mm512_shufl2r_64( v1, v2 ) _mm512_alignr_epi64( v2, v1, 1 )
#define mm512_shufl2l_64( v1, v2 ) _mm512_alignr_epi64( v1, v2, 1 )
#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 )
#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 )
*/
#endif // AVX512
#endif // SIMD_512_H__