mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.20.3
This commit is contained in:
@@ -205,7 +205,6 @@ cpuminer_SOURCES = \
|
||||
algo/verthash/tiny_sha3/sha3.c \
|
||||
algo/verthash/tiny_sha3/sha3-4way.c \
|
||||
algo/whirlpool/sph_whirlpool.c \
|
||||
algo/whirlpool/whirlpool-hash-4way.c \
|
||||
algo/whirlpool/whirlpool-gate.c \
|
||||
algo/whirlpool/whirlpool.c \
|
||||
algo/whirlpool/whirlpoolx.c \
|
||||
|
16
README.md
16
README.md
@@ -40,17 +40,25 @@ Requirements
|
||||
Intel Core2 and newer and AMD equivalents. Further optimizations are available
|
||||
on some algorithms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
|
||||
|
||||
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
|
||||
performance.
|
||||
32 bit CPUs are not supported.
|
||||
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
|
||||
are not supported.
|
||||
|
||||
ARM and Aarch64 CPUs are not supported.
|
||||
Mobile CPUs like laptop computers are not recommended because they aren't
|
||||
designed for extreme heat of operating at full load for extended periods of
|
||||
time.
|
||||
|
||||
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
|
||||
|
||||
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
|
||||
including Mint and Centos, are known to work and have all dependencies
|
||||
in their repositories. Others may work but may require more effort. Older
|
||||
versions such as Centos 6 don't work due to missing features.
|
||||
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
|
||||
|
||||
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
|
||||
binaries. WindowsXP 64 bit is YMMV.
|
||||
|
||||
FreeBSD is not actively tested but should work, YMMV.
|
||||
MacOS, OSx and Android are not supported.
|
||||
|
||||
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
|
||||
|
31
README.txt
31
README.txt
@@ -1,12 +1,22 @@
|
||||
This file is included in the Windows binary package. Compile instructions
|
||||
for Linux and Windows can be found in RELEASE_NOTES.
|
||||
|
||||
This package is officially avalable only from:
|
||||
cpuminer-opt is open source and free of any fees. Many forks exist that are
|
||||
closed source and contain usage fees. Support open source free software.
|
||||
|
||||
This package is officially available only from:
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt
|
||||
|
||||
No other sources should be trusted.
|
||||
|
||||
cpuminer is a console program that is executed from a DOS or Powershell
|
||||
prompt. There is no GUI and no mouse support.
|
||||
command prompt. There is no GUI and no mouse support.
|
||||
|
||||
New users are encouraged to consult the cpuminer-opt Wiki for detailed
|
||||
information on usage:
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki
|
||||
|
||||
Miner programs are often flagged as malware by antivirus programs. This is
|
||||
a false positive, they are flagged simply because they are cryptocurrency
|
||||
@@ -43,12 +53,11 @@ cpuminer-avx2.exe Haswell, Skylake, Kabylake, Coffeelake, Cometlake
|
||||
cpuminer-avx2-sha.exe AMD Zen1, Zen2
|
||||
cpuminer-avx2-sha-vaes.exe Intel Alderlake*, AMD Zen3
|
||||
cpuminer-avx512.exe Intel HEDT Skylake-X, Cascadelake
|
||||
cpuminer-avx512-sha-vaes.exe Icelake, Tigerlake, Rocketlake
|
||||
cpuminer-avx512-sha-vaes.exe AMD Zen4, Intel Rocketlake, Icelake
|
||||
|
||||
* Alderlake is a hybrid architecture. With the E-cores disabled it may be
|
||||
possible to enable AVX512 on the the P-cores and use the avx512-sha-vaes
|
||||
build. This is not officially supported by Intel at time of writing.
|
||||
Check for current information.
|
||||
* Alderlake is a hybrid architecture with a mix of E-cores & P-cores. Although
|
||||
the P-cores can support AVX512 the E-cores can't so Intel decided to disable
|
||||
AVX512 on the P-cores.
|
||||
|
||||
Notes about included DLL files:
|
||||
|
||||
@@ -59,9 +68,11 @@ source code obtained from the author's official repository. The exact
|
||||
procedure is documented in the build instructions for Windows:
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
|
||||
|
||||
Some DLL filess may already be installed on the system by Windows or third
|
||||
party packages. They often will work and may be used instead of the included
|
||||
file.
|
||||
Some included DLL files may already be installed on the system by Windows or
|
||||
third party packages. They often will work and may be used instead of the
|
||||
included version of the files.
|
||||
|
||||
|
||||
|
||||
If you like this software feel free to donate:
|
||||
|
||||
|
@@ -65,6 +65,12 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.20.3
|
||||
|
||||
Faster c11 algo: AVX512 6%, AVX2 4%, AVX2+VAES 15%.
|
||||
Faster AVX2+VAES for anime 14%, hmq1725 6%.
|
||||
Small optimizations to Luffa AVX2 & AVX512.
|
||||
|
||||
v3.20.2
|
||||
|
||||
Bit rotation optimizations to Blake256, Blake512, Blake2b, Blake2s & Lyra2-blake2b for SSE2 & AVX2.
|
||||
@@ -75,7 +81,7 @@ v3.20.1
|
||||
sph_blake2b optimized 1-way SSSE3 & AVX2.
|
||||
Removed duplicate Blake2b used by Power2b algo, will now use optimized sph_blake2b.
|
||||
Removed imprecise hash & target display from rejected share log.
|
||||
Share and target difficulty is now displayed only for low diificulty shares.
|
||||
Share and target difficulty is now displayed only for low difficulty shares.
|
||||
Updated configure.ac to check for AVX512 asm support.
|
||||
Small optimization to Lyra2 SSE2.
|
||||
|
||||
|
@@ -67,7 +67,6 @@ void do_nothing () {}
|
||||
bool return_true () { return true; }
|
||||
bool return_false () { return false; }
|
||||
void *return_null () { return NULL; }
|
||||
void call_error () { printf("ERR: Uninitialized function pointer\n"); }
|
||||
|
||||
void algo_not_tested()
|
||||
{
|
||||
@@ -95,7 +94,8 @@ int null_scanhash()
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Default generic scanhash can be used in many cases.
|
||||
// Default generic scanhash can be used in many cases. Not to be used when
|
||||
// prehashing can be done or when byte swapping the data can be avoided.
|
||||
int scanhash_generic( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -152,6 +152,9 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
// overwrite byte swapped nonce with original byte order for proper
|
||||
// incrementing. The nonce only needs to be byte swapped if it is to be
|
||||
// submitted.
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
|
@@ -316,7 +316,7 @@ static const sph_u32 CS[16] = {
|
||||
CSx( r, 5 ) ^ Mx( r, 4 ), \
|
||||
CSx( r, 3 ) ^ Mx( r, 2 ), \
|
||||
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
|
||||
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
|
||||
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
@@ -324,7 +324,7 @@ static const sph_u32 CS[16] = {
|
||||
CSx( r, 4 ) ^ Mx( r, 5 ), \
|
||||
CSx( r, 2 ) ^ Mx( r, 3 ), \
|
||||
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
|
||||
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
|
||||
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||
V3 = mm128_shufll_32( V3 ); \
|
||||
@@ -335,7 +335,7 @@ static const sph_u32 CS[16] = {
|
||||
CSx( r, D ) ^ Mx( r, C ), \
|
||||
CSx( r, B ) ^ Mx( r, A ), \
|
||||
CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); \
|
||||
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
|
||||
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
@@ -343,7 +343,7 @@ static const sph_u32 CS[16] = {
|
||||
CSx( r, C ) ^ Mx( r, D ), \
|
||||
CSx( r, A ) ^ Mx( r, B ), \
|
||||
CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
|
||||
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
|
||||
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||
V3 = mm128_shuflr_32( V3 ); \
|
||||
|
@@ -78,7 +78,8 @@
|
||||
V[1] = mm256_shufll_64( V[1] ); \
|
||||
}
|
||||
|
||||
#elif defined(__SSSE3__)
|
||||
#elif defined(__SSE2__)
|
||||
// always true
|
||||
|
||||
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
|
||||
{ \
|
||||
@@ -115,6 +116,7 @@
|
||||
}
|
||||
|
||||
#else
|
||||
// never used, SSE2 is always available
|
||||
|
||||
#ifndef ROTR64
|
||||
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
|
||||
|
@@ -747,38 +747,40 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
mj[14] = mm256_rol_64( M[14], 15 );
|
||||
mj[15] = mm256_rol_64( M[15], 16 );
|
||||
|
||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||
(const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
|
||||
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||
(const __m256i)_mm256_set1_epi64x( 17 * 0x0555555555555555ULL ) );
|
||||
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||
(const __m256i)_mm256_set1_epi64x( 18 * 0x0555555555555555ULL ) );
|
||||
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10],
|
||||
(const __m256i)_mm256_set1_epi64x( 19 * 0x0555555555555555ULL ) );
|
||||
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11],
|
||||
(const __m256i)_mm256_set1_epi64x( 20 * 0x0555555555555555ULL ) );
|
||||
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12],
|
||||
(const __m256i)_mm256_set1_epi64x( 21 * 0x0555555555555555ULL ) );
|
||||
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||
(const __m256i)_mm256_set1_epi64x( 22 * 0x0555555555555555ULL ) );
|
||||
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14],
|
||||
(const __m256i)_mm256_set1_epi64x( 23 * 0x0555555555555555ULL ) );
|
||||
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15],
|
||||
(const __m256i)_mm256_set1_epi64x( 24 * 0x0555555555555555ULL ) );
|
||||
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||
(const __m256i)_mm256_set1_epi64x( 25 * 0x0555555555555555ULL ) );
|
||||
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1],
|
||||
(const __m256i)_mm256_set1_epi64x( 26 * 0x0555555555555555ULL ) );
|
||||
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2],
|
||||
(const __m256i)_mm256_set1_epi64x( 27 * 0x0555555555555555ULL ) );
|
||||
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3],
|
||||
(const __m256i)_mm256_set1_epi64x( 28 * 0x0555555555555555ULL ) );
|
||||
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||
(const __m256i)_mm256_set1_epi64x( 29 * 0x0555555555555555ULL ) );
|
||||
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||
(const __m256i)_mm256_set1_epi64x( 30 * 0x0555555555555555ULL ) );
|
||||
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||
(const __m256i)_mm256_set1_epi64x( 31 * 0x0555555555555555ULL ) );
|
||||
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
|
||||
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
|
||||
|
||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], K );
|
||||
|
||||
qt[16] = _mm256_add_epi64( qt[16], expand1_b( qt, 16 ) );
|
||||
qt[17] = _mm256_add_epi64( qt[17], expand1_b( qt, 17 ) );
|
||||
@@ -1180,7 +1182,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
||||
|
||||
__m512i mj[16];
|
||||
uint64_t K = 16 * 0x0555555555555555ULL;
|
||||
|
||||
mj[ 0] = mm512_rol_64( M[ 0], 1 );
|
||||
mj[ 1] = mm512_rol_64( M[ 1], 2 );
|
||||
@@ -1199,54 +1200,40 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
mj[14] = mm512_rol_64( M[14], 15 );
|
||||
mj[15] = mm512_rol_64( M[15], 16 );
|
||||
|
||||
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
|
||||
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
|
||||
|
||||
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6], K );
|
||||
|
||||
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
|
||||
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
|
||||
|
@@ -62,186 +62,66 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
|
||||
#define cns4w(i) m512_const1_128( ( (__m128i*)CNS_INIT)[i] )
|
||||
|
||||
#define ADD_CONSTANT4W(a,b,c0,c1)\
|
||||
a = _mm512_xor_si512(a,c0);\
|
||||
b = _mm512_xor_si512(b,c1);
|
||||
#define ADD_CONSTANT4W( a, b, c0, c1 ) \
|
||||
a = _mm512_xor_si512( a, c0 ); \
|
||||
b = _mm512_xor_si512( b, c1 );
|
||||
|
||||
#define MULT24W( a0, a1 ) \
|
||||
do { \
|
||||
{ \
|
||||
__m512i b = _mm512_xor_si512( a0, \
|
||||
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \
|
||||
a0 = _mm512_or_si512( _mm512_bsrli_epi128( b, 4 ), \
|
||||
_mm512_bslli_epi128( a1,12 ) ); \
|
||||
a1 = _mm512_or_si512( _mm512_bsrli_epi128( a1, 4 ), \
|
||||
_mm512_bslli_epi128( b,12 ) ); \
|
||||
} while(0)
|
||||
a0 = _mm512_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm512_alignr_epi8( b, a1, 4 ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#define MULT24W( a0, a1, mask ) \
|
||||
do { \
|
||||
__m512i b = _mm512_xor_si512( a0, \
|
||||
_mm512_shuffle_epi32( _mm512_and_si512(a1,mask), 16 ) ); \
|
||||
a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\
|
||||
a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
// confirm pointer arithmetic
|
||||
// ok but use array indexes
|
||||
#define STEP_PART4W(x,c0,c1,t)\
|
||||
SUBCRUMB4W(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB4W(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
MIXWORD4W(*x,*(x+4),*t,*(t+1));\
|
||||
MIXWORD4W(*(x+1),*(x+5),*t,*(t+1));\
|
||||
MIXWORD4W(*(x+2),*(x+6),*t,*(t+1));\
|
||||
MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
|
||||
ADD_CONSTANT4W(*x, *(x+4), c0, c1);
|
||||
|
||||
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
|
||||
t = a0;\
|
||||
#define SUBCRUMB4W( a0, a1, a2, a3 ) \
|
||||
{ \
|
||||
__m512i t = a0; \
|
||||
a0 = mm512_xoror( a3, a0, a1 ); \
|
||||
a2 = _mm512_xor_si512(a2,a3);\
|
||||
a2 = _mm512_xor_si512( a2, a3 ); \
|
||||
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
||||
a3 = mm512_xorand( a2, a3, t ); \
|
||||
a2 = mm512_xorand( a1, a2, a0);\
|
||||
a1 = _mm512_or_si512(a1,a3);\
|
||||
a3 = _mm512_xor_si512(a3,a2);\
|
||||
t = _mm512_xor_si512(t,a1);\
|
||||
a2 = _mm512_and_si512(a2,a1);\
|
||||
a1 = mm512_xnor(a1,a0);\
|
||||
a0 = t;
|
||||
a2 = mm512_xorand( a1, a2, a0); \
|
||||
a1 = _mm512_or_si512( a1, a3 ); \
|
||||
a3 = _mm512_xor_si512( a3, a2 ); \
|
||||
t = _mm512_xor_si512( t, a1 ); \
|
||||
a2 = _mm512_and_si512( a2, a1 ); \
|
||||
a1 = mm512_xnor( a1, a0 ); \
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
/*
|
||||
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
|
||||
t = _mm512_load_si512(&a0);\
|
||||
a0 = _mm512_or_si512(a0,a1);\
|
||||
a2 = _mm512_xor_si512(a2,a3);\
|
||||
a1 = _mm512_andnot_si512(a1, m512_neg1 );\
|
||||
a0 = _mm512_xor_si512(a0,a3);\
|
||||
a3 = _mm512_and_si512(a3,t);\
|
||||
a1 = _mm512_xor_si512(a1,a3);\
|
||||
a3 = _mm512_xor_si512(a3,a2);\
|
||||
a2 = _mm512_and_si512(a2,a0);\
|
||||
a0 = _mm512_andnot_si512(a0, m512_neg1 );\
|
||||
a2 = _mm512_xor_si512(a2,a1);\
|
||||
a1 = _mm512_or_si512(a1,a3);\
|
||||
t = _mm512_xor_si512(t,a1);\
|
||||
a3 = _mm512_xor_si512(a3,a2);\
|
||||
a2 = _mm512_and_si512(a2,a1);\
|
||||
a1 = _mm512_xor_si512(a1,a0);\
|
||||
a0 = _mm512_load_si512(&t);
|
||||
*/
|
||||
#define MIXWORD4W( a, b ) \
|
||||
b = _mm512_xor_si512( a, b ); \
|
||||
a = _mm512_xor_si512( b, _mm512_rol_epi32( a, 2 ) ); \
|
||||
b = _mm512_xor_si512( a, _mm512_rol_epi32( b, 14 ) ); \
|
||||
a = _mm512_xor_si512( b, _mm512_rol_epi32( a, 10 ) ); \
|
||||
b = _mm512_rol_epi32( b, 1 );
|
||||
|
||||
#define MIXWORD4W(a,b,t1,t2)\
|
||||
b = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(a,2);\
|
||||
t2 = _mm512_srli_epi32(a,30);\
|
||||
a = mm512_xoror( b, t1, t2 ); \
|
||||
t1 = _mm512_slli_epi32(b,14);\
|
||||
t2 = _mm512_srli_epi32(b,18);\
|
||||
b = _mm512_or_si512(t1,t2);\
|
||||
b = mm512_xoror( a, t1, t2 ); \
|
||||
t1 = _mm512_slli_epi32(a,10);\
|
||||
t2 = _mm512_srli_epi32(a,22);\
|
||||
a = mm512_xoror( b, t1, t2 ); \
|
||||
t1 = _mm512_slli_epi32(b,1);\
|
||||
t2 = _mm512_srli_epi32(b,31);\
|
||||
b = _mm512_or_si512(t1,t2);
|
||||
#define STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
|
||||
SUBCRUMB4W( x0, x1, x2, x3 ); \
|
||||
SUBCRUMB4W( x5, x6, x7, x4 ); \
|
||||
MIXWORD4W( x0, x4 ); \
|
||||
MIXWORD4W( x1, x5 ); \
|
||||
MIXWORD4W( x2, x6 ); \
|
||||
MIXWORD4W( x3, x7 ); \
|
||||
ADD_CONSTANT4W( x0, x4, c0, c1 );
|
||||
|
||||
/*
|
||||
#define MIXWORD4W(a,b,t1,t2)\
|
||||
b = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(a,2);\
|
||||
t2 = _mm512_srli_epi32(a,30);\
|
||||
a = _mm512_or_si512(t1,t2);\
|
||||
a = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(b,14);\
|
||||
t2 = _mm512_srli_epi32(b,18);\
|
||||
b = _mm512_or_si512(t1,t2);\
|
||||
b = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(a,10);\
|
||||
t2 = _mm512_srli_epi32(a,22);\
|
||||
a = _mm512_or_si512(t1,t2);\
|
||||
a = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(b,1);\
|
||||
t2 = _mm512_srli_epi32(b,31);\
|
||||
b = _mm512_or_si512(t1,t2);
|
||||
*/
|
||||
|
||||
#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
||||
a1 = _mm512_shuffle_epi32(a1,147);\
|
||||
t0 = _mm512_load_si512(&a1);\
|
||||
a1 = _mm512_unpacklo_epi32(a1,a0);\
|
||||
t0 = _mm512_unpackhi_epi32(t0,a0);\
|
||||
t1 = _mm512_shuffle_epi32(t0,78);\
|
||||
a0 = _mm512_shuffle_epi32(a1,78);\
|
||||
SUBCRUMB4W(t1,t0,a0,a1,tmp0);\
|
||||
t0 = _mm512_unpacklo_epi32(t0,t1);\
|
||||
a1 = _mm512_unpacklo_epi32(a1,a0);\
|
||||
a0 = _mm512_load_si512(&a1);\
|
||||
a0 = _mm512_unpackhi_epi64(a0,t0);\
|
||||
a1 = _mm512_unpacklo_epi64(a1,t0);\
|
||||
a1 = _mm512_shuffle_epi32(a1,57);\
|
||||
MIXWORD4W(a0,a1,tmp0,tmp1);\
|
||||
ADD_CONSTANT4W(a0,a1,c0,c1);
|
||||
|
||||
#define NMLTOM7684W(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
|
||||
s2 = _mm512_load_si512(&r1);\
|
||||
q2 = _mm512_load_si512(&p1);\
|
||||
r2 = _mm512_shuffle_epi32(r2,216);\
|
||||
p2 = _mm512_shuffle_epi32(p2,216);\
|
||||
r1 = _mm512_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm512_unpacklo_epi32(p1,p0);\
|
||||
s2 = _mm512_unpackhi_epi32(s2,r0);\
|
||||
q2 = _mm512_unpackhi_epi32(q2,p0);\
|
||||
s0 = _mm512_load_si512(&r2);\
|
||||
q0 = _mm512_load_si512(&p2);\
|
||||
r2 = _mm512_unpacklo_epi64(r2,r1);\
|
||||
p2 = _mm512_unpacklo_epi64(p2,p1);\
|
||||
s1 = _mm512_load_si512(&s0);\
|
||||
q1 = _mm512_load_si512(&q0);\
|
||||
s0 = _mm512_unpackhi_epi64(s0,r1);\
|
||||
q0 = _mm512_unpackhi_epi64(q0,p1);\
|
||||
r2 = _mm512_shuffle_epi32(r2,225);\
|
||||
p2 = _mm512_shuffle_epi32(p2,225);\
|
||||
r0 = _mm512_load_si512(&s1);\
|
||||
p0 = _mm512_load_si512(&q1);\
|
||||
s0 = _mm512_shuffle_epi32(s0,225);\
|
||||
q0 = _mm512_shuffle_epi32(q0,225);\
|
||||
s1 = _mm512_unpacklo_epi64(s1,s2);\
|
||||
q1 = _mm512_unpacklo_epi64(q1,q2);\
|
||||
r0 = _mm512_unpackhi_epi64(r0,s2);\
|
||||
p0 = _mm512_unpackhi_epi64(p0,q2);\
|
||||
s2 = _mm512_load_si512(&r0);\
|
||||
q2 = _mm512_load_si512(&p0);\
|
||||
s3 = _mm512_load_si512(&r2);\
|
||||
q3 = _mm512_load_si512(&p2);
|
||||
|
||||
#define MIXTON7684W(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
|
||||
s0 = _mm512_load_si512(&r0);\
|
||||
q0 = _mm512_load_si512(&p0);\
|
||||
s1 = _mm512_load_si512(&r2);\
|
||||
q1 = _mm512_load_si512(&p2);\
|
||||
r0 = _mm512_unpackhi_epi32(r0,r1);\
|
||||
p0 = _mm512_unpackhi_epi32(p0,p1);\
|
||||
r2 = _mm512_unpackhi_epi32(r2,r3);\
|
||||
p2 = _mm512_unpackhi_epi32(p2,p3);\
|
||||
s0 = _mm512_unpacklo_epi32(s0,r1);\
|
||||
q0 = _mm512_unpacklo_epi32(q0,p1);\
|
||||
s1 = _mm512_unpacklo_epi32(s1,r3);\
|
||||
q1 = _mm512_unpacklo_epi32(q1,p3);\
|
||||
r1 = _mm512_load_si512(&r0);\
|
||||
p1 = _mm512_load_si512(&p0);\
|
||||
r0 = _mm512_unpackhi_epi64(r0,r2);\
|
||||
p0 = _mm512_unpackhi_epi64(p0,p2);\
|
||||
s0 = _mm512_unpackhi_epi64(s0,s1);\
|
||||
q0 = _mm512_unpackhi_epi64(q0,q1);\
|
||||
r1 = _mm512_unpacklo_epi64(r1,r2);\
|
||||
p1 = _mm512_unpacklo_epi64(p1,p2);\
|
||||
s2 = _mm512_load_si512(&r0);\
|
||||
q2 = _mm512_load_si512(&p0);\
|
||||
s1 = _mm512_load_si512(&r1);\
|
||||
q1 = _mm512_load_si512(&p1);
|
||||
#define STEP_PART24W( a0, a1, t0, t1, c0, c1 ) \
|
||||
a1 = _mm512_shuffle_epi32( a1, 147 ); \
|
||||
t0 = _mm512_load_si512( &a1 ); \
|
||||
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
|
||||
t0 = _mm512_unpackhi_epi32( t0, a0 ); \
|
||||
t1 = _mm512_shuffle_epi32( t0, 78 ); \
|
||||
a0 = _mm512_shuffle_epi32( a1, 78 ); \
|
||||
SUBCRUMB4W( t1, t0, a0, a1 ); \
|
||||
t0 = _mm512_unpacklo_epi32( t0, t1 ); \
|
||||
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
|
||||
a0 = _mm512_load_si512( &a1 ); \
|
||||
a0 = _mm512_unpackhi_epi64( a0, t0 ); \
|
||||
a1 = _mm512_unpacklo_epi64( a1, t0 ); \
|
||||
a1 = _mm512_shuffle_epi32( a1, 57 ); \
|
||||
MIXWORD4W( a0, a1 ); \
|
||||
ADD_CONSTANT4W( a0, a1, c0, c1 );
|
||||
|
||||
#define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
s1 = _mm512_load_si512(&r3);\
|
||||
@@ -279,8 +159,7 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
__m512i t0, t1;
|
||||
__m512i *chainv = state->chainv;
|
||||
__m512i msg0, msg1;
|
||||
__m512i tmp[2];
|
||||
__m512i x[8];
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
|
||||
t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
|
||||
t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] );
|
||||
@@ -372,42 +251,30 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
|
||||
chainv[9] = _mm512_rol_epi32( chainv[9], 4 );
|
||||
|
||||
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x[0], x[1], x[2], x[3],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7],
|
||||
x[4], x[5], x[6], x[7] );
|
||||
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
|
||||
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
|
||||
|
||||
STEP_PART4W( &x[0], cns4w( 0), cns4w( 1), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w( 2), cns4w( 3), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w( 4), cns4w( 5), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w( 6), cns4w( 7), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w( 8), cns4w( 9), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w(10), cns4w(11), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w(12), cns4w(13), &tmp[0] );
|
||||
STEP_PART4W( &x[0], cns4w(14), cns4w(15), &tmp[0] );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 0), cns4w( 1) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 2), cns4w( 3) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 4), cns4w( 5) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 6), cns4w( 7) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w( 8), cns4w( 9) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(10), cns4w(11) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(12), cns4w(13) );
|
||||
STEP_PART4W( x0, x1, x2, x3, x4, x5, x6, x7, cns4w(14), cns4w(15) );
|
||||
|
||||
MIXTON10244W( x[0], x[1], x[2], x[3],
|
||||
chainv[0], chainv[2], chainv[4],chainv[6],
|
||||
x[4], x[5], x[6], x[7],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7]);
|
||||
MIXTON10244W( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7] );
|
||||
|
||||
/* Process last 256-bit block */
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29) );
|
||||
STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31) );
|
||||
}
|
||||
|
||||
void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
||||
@@ -683,10 +550,11 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
|
||||
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
|
||||
|
||||
#define ADD_CONSTANT(a,b,c0,c1)\
|
||||
a = _mm256_xor_si256(a,c0);\
|
||||
b = _mm256_xor_si256(b,c1);
|
||||
#define ADD_CONSTANT( a, b, c0, c1 ) \
|
||||
a = _mm256_xor_si256( a, c0 ); \
|
||||
b = _mm256_xor_si256( b, c1 );
|
||||
|
||||
/*
|
||||
#define MULT2( a0, a1, mask ) \
|
||||
do { \
|
||||
__m256i b = _mm256_xor_si256( a0, \
|
||||
@@ -694,127 +562,83 @@ do { \
|
||||
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
|
||||
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
#define STEP_PART(x,c0,c1,t)\
|
||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
MIXWORD(*x,*(x+4),*t,*(t+1));\
|
||||
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
|
||||
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
|
||||
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
|
||||
ADD_CONSTANT(*x, *(x+4), c0, c1);
|
||||
// Update the vector pair (a0, a1) in place:
//    b  = a0 ^ shuffle_epi32( a1 & mask, 16 )
//    a0 = alignr_epi8( a1, b, 4 )
//    a1 = alignr_epi8( b, a1, 4 )      (a1 on the right is still the input value)
// _mm256_alignr_epi8 concatenates its two operands and shifts right by the
// given byte count independently within each 128-bit lane (per Intel's
// intrinsic description).
// b is local to the macro's block. a0 and a1 are both read and written, so
// pass plain __m256i lvalues without side effects.
// NOTE(review): the body is a bare { } block rather than do { } while(0), so
// "MULT2(...);" followed by "else" will not parse.
#define MULT2( a0, a1, mask ) \
|
||||
{ \
|
||||
__m256i b = _mm256_xor_si256( a0, \
|
||||
_mm256_shuffle_epi32( _mm256_and_si256( a1, mask ), 16 ) ); \
|
||||
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
|
||||
}
|
||||
|
||||
#define SUBCRUMB(a0,a1,a2,a3,t)\
|
||||
t = a0;\
|
||||
a0 = _mm256_or_si256(a0,a1);\
|
||||
a2 = _mm256_xor_si256(a2,a3);\
|
||||
a1 = mm256_not( a1 );\
|
||||
a0 = _mm256_xor_si256(a0,a3);\
|
||||
a3 = _mm256_and_si256(a3,t);\
|
||||
a1 = _mm256_xor_si256(a1,a3);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a0);\
|
||||
a0 = mm256_not( a0 );\
|
||||
a2 = _mm256_xor_si256(a2,a1);\
|
||||
a1 = _mm256_or_si256(a1,a3);\
|
||||
t = _mm256_xor_si256(t,a1);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a1);\
|
||||
a1 = _mm256_xor_si256(a1,a0);\
|
||||
a0 = t;\
|
||||
// Apply a fixed sequence of bitwise OR / AND / XOR / NOT operations to the
// four vectors a0..a3, updating all four in place (presumably Luffa's SubCrumb
// step, going by the name - confirm against the spec).
// The scratch vector t is declared inside the macro's own block: it starts as
// the input a0 and its final value is written back to a0 by the last statement.
// Statement order is significant, each step consumes values produced by the
// steps before it. Every argument is expanded several times and assigned to,
// so pass plain __m256i lvalues without side effects.
#define SUBCRUMB( a0, a1, a2, a3 ) \
|
||||
{ \
|
||||
__m256i t = a0; \
|
||||
a0 = _mm256_or_si256( a0, a1 ); \
|
||||
a2 = _mm256_xor_si256( a2, a3 ); \
|
||||
a1 = mm256_not( a1 ); \
|
||||
a0 = _mm256_xor_si256( a0, a3 ); \
|
||||
a3 = _mm256_and_si256( a3, t ); \
|
||||
a1 = _mm256_xor_si256( a1, a3 ); \
|
||||
a3 = _mm256_xor_si256( a3, a2 ); \
|
||||
a2 = _mm256_and_si256( a2, a0 ); \
|
||||
a0 = mm256_not( a0 ); \
|
||||
a2 = _mm256_xor_si256( a2, a1 ); \
|
||||
a1 = _mm256_or_si256( a1, a3 ); \
|
||||
t = _mm256_xor_si256( t, a1 ); \
|
||||
a3 = _mm256_xor_si256( a3, a2 ); \
|
||||
a2 = _mm256_and_si256( a2, a1 ); \
|
||||
a1 = _mm256_xor_si256( a1, a0 ); \
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
#define MIXWORD(a,b,t1,t2)\
|
||||
b = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(a,2);\
|
||||
t2 = _mm256_srli_epi32(a,30);\
|
||||
a = _mm256_or_si256(t1,t2);\
|
||||
a = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(b,14);\
|
||||
t2 = _mm256_srli_epi32(b,18);\
|
||||
b = _mm256_or_si256(t1,t2);\
|
||||
b = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(a,10);\
|
||||
t2 = _mm256_srli_epi32(a,22);\
|
||||
a = _mm256_or_si256(t1,t2);\
|
||||
a = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(b,1);\
|
||||
t2 = _mm256_srli_epi32(b,31);\
|
||||
b = _mm256_or_si256(t1,t2);
|
||||
// Mix the vector pair (a, b) in place, treating each as packed 32-bit words.
// With rol32(x,n) = (x << n) | (x >> (32-n)) applied per 32-bit element:
//    b ^= a;
//    a = rol32( a,  2 ) ^ b;
//    b = rol32( b, 14 ) ^ a;
//    a = rol32( a, 10 ) ^ b;
//    b = rol32( b,  1 );
// Each rotate is built from a left shift, a right shift and an OR, using the
// block-local temporaries t1 and t2. a and b are read and written repeatedly,
// so pass plain __m256i lvalues without side effects.
#define MIXWORD( a, b ) \
|
||||
{ \
|
||||
__m256i t1, t2; \
|
||||
b = _mm256_xor_si256( a,b ); \
|
||||
t1 = _mm256_slli_epi32( a, 2 ); \
|
||||
t2 = _mm256_srli_epi32( a, 30 ); \
|
||||
a = _mm256_or_si256( t1, t2 ); \
|
||||
a = _mm256_xor_si256( a, b ); \
|
||||
t1 = _mm256_slli_epi32( b, 14 ); \
|
||||
t2 = _mm256_srli_epi32( b, 18 ); \
|
||||
b = _mm256_or_si256( t1, t2 ); \
|
||||
b = _mm256_xor_si256( a, b ); \
|
||||
t1 = _mm256_slli_epi32( a, 10 ); \
|
||||
t2 = _mm256_srli_epi32( a, 22 ); \
|
||||
a = _mm256_or_si256( t1,t2 ); \
|
||||
a = _mm256_xor_si256( a,b ); \
|
||||
t1 = _mm256_slli_epi32( b,1 ); \
|
||||
t2 = _mm256_srli_epi32( b,31 ); \
|
||||
b = _mm256_or_si256( t1, t2 ); \
|
||||
}
|
||||
|
||||
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
||||
a1 = _mm256_shuffle_epi32(a1,147);\
|
||||
t0 = _mm256_load_si256(&a1);\
|
||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
||||
t0 = _mm256_unpackhi_epi32(t0,a0);\
|
||||
t1 = _mm256_shuffle_epi32(t0,78);\
|
||||
a0 = _mm256_shuffle_epi32(a1,78);\
|
||||
SUBCRUMB(t1,t0,a0,a1,tmp0);\
|
||||
t0 = _mm256_unpacklo_epi32(t0,t1);\
|
||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
||||
a0 = _mm256_load_si256(&a1);\
|
||||
a0 = _mm256_unpackhi_epi64(a0,t0);\
|
||||
a1 = _mm256_unpacklo_epi64(a1,t0);\
|
||||
a1 = _mm256_shuffle_epi32(a1,57);\
|
||||
MIXWORD(a0,a1,tmp0,tmp1);\
|
||||
ADD_CONSTANT(a0,a1,c0,c1);
|
||||
// One step over the eight state vectors x0..x7, all updated in place:
//   1. SUBCRUMB on (x0, x1, x2, x3) and on the rotated group (x5, x6, x7, x4);
//   2. MIXWORD on each pair (x0,x4), (x1,x5), (x2,x6), (x3,x7);
//   3. ADD_CONSTANT XORs c0 into x0 and c1 into x4.
// The expansion is a run of statements with no enclosing block, so do not use
// it as the sole body of an unbraced if/else/loop.
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
|
||||
SUBCRUMB( x0, x1, x2, x3 ); \
|
||||
SUBCRUMB( x5, x6, x7, x4 ); \
|
||||
MIXWORD( x0, x4 ); \
|
||||
MIXWORD( x1, x5 ); \
|
||||
MIXWORD( x2, x6 ); \
|
||||
MIXWORD( x3, x7 ); \
|
||||
ADD_CONSTANT( x0, x4, c0, c1 );
|
||||
|
||||
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
|
||||
s2 = _mm256_load_si256(&r1);\
|
||||
q2 = _mm256_load_si256(&p1);\
|
||||
r2 = _mm256_shuffle_epi32(r2,216);\
|
||||
p2 = _mm256_shuffle_epi32(p2,216);\
|
||||
r1 = _mm256_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm256_unpacklo_epi32(p1,p0);\
|
||||
s2 = _mm256_unpackhi_epi32(s2,r0);\
|
||||
q2 = _mm256_unpackhi_epi32(q2,p0);\
|
||||
s0 = _mm256_load_si256(&r2);\
|
||||
q0 = _mm256_load_si256(&p2);\
|
||||
r2 = _mm256_unpacklo_epi64(r2,r1);\
|
||||
p2 = _mm256_unpacklo_epi64(p2,p1);\
|
||||
s1 = _mm256_load_si256(&s0);\
|
||||
q1 = _mm256_load_si256(&q0);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,r1);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,p1);\
|
||||
r2 = _mm256_shuffle_epi32(r2,225);\
|
||||
p2 = _mm256_shuffle_epi32(p2,225);\
|
||||
r0 = _mm256_load_si256(&s1);\
|
||||
p0 = _mm256_load_si256(&q1);\
|
||||
s0 = _mm256_shuffle_epi32(s0,225);\
|
||||
q0 = _mm256_shuffle_epi32(q0,225);\
|
||||
s1 = _mm256_unpacklo_epi64(s1,s2);\
|
||||
q1 = _mm256_unpacklo_epi64(q1,q2);\
|
||||
r0 = _mm256_unpackhi_epi64(r0,s2);\
|
||||
p0 = _mm256_unpackhi_epi64(p0,q2);\
|
||||
s2 = _mm256_load_si256(&r0);\
|
||||
q2 = _mm256_load_si256(&p0);\
|
||||
s3 = _mm256_load_si256(&r2);\
|
||||
q3 = _mm256_load_si256(&p2);
|
||||
|
||||
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
|
||||
s0 = _mm256_load_si256(&r0);\
|
||||
q0 = _mm256_load_si256(&p0);\
|
||||
s1 = _mm256_load_si256(&r2);\
|
||||
q1 = _mm256_load_si256(&p2);\
|
||||
r0 = _mm256_unpackhi_epi32(r0,r1);\
|
||||
p0 = _mm256_unpackhi_epi32(p0,p1);\
|
||||
r2 = _mm256_unpackhi_epi32(r2,r3);\
|
||||
p2 = _mm256_unpackhi_epi32(p2,p3);\
|
||||
s0 = _mm256_unpacklo_epi32(s0,r1);\
|
||||
q0 = _mm256_unpacklo_epi32(q0,p1);\
|
||||
s1 = _mm256_unpacklo_epi32(s1,r3);\
|
||||
q1 = _mm256_unpacklo_epi32(q1,p3);\
|
||||
r1 = _mm256_load_si256(&r0);\
|
||||
p1 = _mm256_load_si256(&p0);\
|
||||
r0 = _mm256_unpackhi_epi64(r0,r2);\
|
||||
p0 = _mm256_unpackhi_epi64(p0,p2);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,s1);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,q1);\
|
||||
r1 = _mm256_unpacklo_epi64(r1,r2);\
|
||||
p1 = _mm256_unpacklo_epi64(p1,p2);\
|
||||
s2 = _mm256_load_si256(&r0);\
|
||||
q2 = _mm256_load_si256(&p0);\
|
||||
s1 = _mm256_load_si256(&r1);\
|
||||
q1 = _mm256_load_si256(&p1);\
|
||||
// One step over a state held in just two vectors (a0, a1), updated in place.
// t0 and t1 are caller-supplied scratch vectors and are overwritten.
//   1. 32-bit shuffles/unpacks spread (a0, a1) across the four vectors
//      (t1, t0, a0, a1), which are passed to SUBCRUMB in that order;
//   2. 32-bit and 64-bit unpacks plus a final shuffle pack the result back
//      into (a0, a1);
//   3. MIXWORD( a0, a1 ), then ADD_CONSTANT XORs c0 into a0 and c1 into a1.
// a1 has its address taken for _mm256_load_si256, so a0/a1 must be addressable
// __m256i objects (that intrinsic is an aligned load).
// The expansion is a run of statements with no enclosing block.
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
|
||||
a1 = _mm256_shuffle_epi32( a1, 147); \
|
||||
t0 = _mm256_load_si256( &a1 ); \
|
||||
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
|
||||
t0 = _mm256_unpackhi_epi32( t0, a0 ); \
|
||||
t1 = _mm256_shuffle_epi32( t0, 78 ); \
|
||||
a0 = _mm256_shuffle_epi32( a1, 78 ); \
|
||||
SUBCRUMB( t1, t0, a0, a1 );\
|
||||
t0 = _mm256_unpacklo_epi32( t0, t1 ); \
|
||||
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
|
||||
a0 = _mm256_load_si256( &a1 ); \
|
||||
a0 = _mm256_unpackhi_epi64( a0, t0 ); \
|
||||
a1 = _mm256_unpacklo_epi64( a1, t0 ); \
|
||||
a1 = _mm256_shuffle_epi32( a1, 57 ); \
|
||||
MIXWORD( a0, a1 ); \
|
||||
ADD_CONSTANT( a0, a1, c0, c1 );
|
||||
|
||||
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
s1 = _mm256_load_si256(&r3);\
|
||||
@@ -857,9 +681,8 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
__m256i t0, t1;
|
||||
__m256i *chainv = state->chainv;
|
||||
__m256i msg0, msg1;
|
||||
__m256i tmp[2];
|
||||
__m256i x[8];
|
||||
const __m256i MASK = m256_const1_i128( 0x00000000ffffffff );
|
||||
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
const __m256i MASK = m256_const1_i128( 0xffffffff );
|
||||
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
@@ -958,42 +781,30 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
chainv[7] = mm256_rol_32( chainv[7], 3 );
|
||||
chainv[9] = mm256_rol_32( chainv[9], 4 );
|
||||
|
||||
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x[0], x[1], x[2], x[3],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7],
|
||||
x[4], x[5], x[6], x[7] );
|
||||
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
|
||||
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
|
||||
|
||||
STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 0), cns( 1) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 2), cns( 3) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 4), cns( 5) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 6), cns( 7) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 8), cns( 9) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(10), cns(11) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(12), cns(13) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(14), cns(15) );
|
||||
|
||||
MIXTON1024( x[0], x[1], x[2], x[3],
|
||||
chainv[0], chainv[2], chainv[4],chainv[6],
|
||||
x[4], x[5], x[6], x[7],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7]);
|
||||
MIXTON1024( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7]);
|
||||
|
||||
/* Process last 256-bit block */
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31) );
|
||||
}
|
||||
|
||||
/***************************************************/
|
||||
|
@@ -30,19 +30,6 @@
|
||||
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
static inline __m256i mult2_avx2( a )
|
||||
{
|
||||
__m128 a0, a0, b;
|
||||
a0 = mm128_extractlo_256( a );
|
||||
a1 = mm128_extracthi_256( a );
|
||||
b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) );
|
||||
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) );
|
||||
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );
|
||||
return mm256_concat_128( a1, a0 );
|
||||
}
|
||||
*/
|
||||
|
||||
#define STEP_PART(x,c,t)\
|
||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
|
@@ -15,7 +15,8 @@
|
||||
|
||||
#if defined (ANIME_8WAY)
|
||||
|
||||
typedef struct {
|
||||
union _anime_8way_context_overlay
|
||||
{
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
#if defined(__VAES__)
|
||||
@@ -26,23 +27,9 @@ typedef struct {
|
||||
jh512_8way_context jh;
|
||||
skein512_8way_context skein;
|
||||
keccak512_8way_context keccak;
|
||||
} anime_8way_ctx_holder;
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
anime_8way_ctx_holder anime_8way_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_anime_8way_ctx()
|
||||
{
|
||||
blake512_8way_init( &anime_8way_ctx.blake );
|
||||
bmw512_8way_init( &anime_8way_ctx.bmw );
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_init( &anime_8way_ctx.groestl, 64 );
|
||||
#else
|
||||
init_groestl( &anime_8way_ctx.groestl, 64 );
|
||||
#endif
|
||||
skein512_8way_init( &anime_8way_ctx.skein );
|
||||
jh512_8way_init( &anime_8way_ctx.jh );
|
||||
keccak512_8way_init( &anime_8way_ctx.keccak );
|
||||
}
|
||||
typedef union _anime_8way_context_overlay anime_8way_context_overlay;
|
||||
|
||||
void anime_8way_hash( void *state, const void *input )
|
||||
{
|
||||
@@ -65,17 +52,14 @@ void anime_8way_hash( void *state, const void *input )
|
||||
__m512i* vhB = (__m512i*)vhashB;
|
||||
__m512i* vhC = (__m512i*)vhashC;
|
||||
const __m512i bit3_mask = m512_const1_64( 8 );
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
__mmask8 vh_mask;
|
||||
anime_8way_ctx_holder ctx;
|
||||
memcpy( &ctx, &anime_8way_ctx, sizeof(anime_8way_ctx) );
|
||||
anime_8way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
|
||||
bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
|
||||
|
||||
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
@@ -152,8 +136,7 @@ void anime_8way_hash( void *state, const void *input )
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
|
||||
@@ -168,8 +151,7 @@ void anime_8way_hash( void *state, const void *input )
|
||||
|
||||
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
{
|
||||
@@ -237,14 +219,20 @@ int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined (ANIME_4WAY)
|
||||
|
||||
typedef struct {
|
||||
union _anime_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
jh512_4way_context jh;
|
||||
skein512_4way_context skein;
|
||||
keccak512_4way_context keccak;
|
||||
} anime_4way_ctx_holder;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl2;
|
||||
#endif
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef union _anime_4way_context_overlay anime_4way_context_overlay;
|
||||
|
||||
void anime_4way_hash( void *state, const void *input )
|
||||
{
|
||||
@@ -262,7 +250,7 @@ void anime_4way_hash( void *state, const void *input )
|
||||
int h_mask;
|
||||
const __m256i bit3_mask = m256_const1_64( 8 );
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
anime_4way_ctx_holder ctx;
|
||||
anime_4way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way_update( &ctx.bmw, input, 80 );
|
||||
@@ -293,7 +281,18 @@ void anime_4way_hash( void *state, const void *input )
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
@@ -302,6 +301,8 @@ void anime_4way_hash( void *state, const void *input )
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
#endif
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
@@ -13,6 +13,7 @@
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/shavite/shavite-hash-2way.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||
@@ -98,8 +99,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
#if defined(__VAES__)
|
||||
@@ -154,8 +154,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
@@ -174,8 +173,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
if ( likely( ( vh_mask & 0xff ) != 0xff ) )
|
||||
{
|
||||
@@ -223,8 +221,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, vhash );
|
||||
// 4x32 for haval
|
||||
@@ -302,8 +299,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
|
||||
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
#if defined(__VAES__)
|
||||
@@ -374,8 +370,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
// A
|
||||
#if defined(__VAES__)
|
||||
@@ -455,8 +450,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7 );
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
|
||||
if ( hash0[0] & mask )
|
||||
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
|
||||
@@ -520,8 +514,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
sha512_8way_update( &ctx.sha512, vhash, 64 );
|
||||
sha512_8way_close( &ctx.sha512, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ),
|
||||
m512_zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, vhash );
|
||||
|
||||
@@ -625,6 +618,7 @@ union _hmq1725_4way_context_overlay
|
||||
cube_2way_context cube2;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd sd;
|
||||
shavite512_2way_context shavite2;
|
||||
simd_2way_context simd;
|
||||
hashState_echo echo;
|
||||
hamsi512_4way_context hamsi;
|
||||
@@ -633,6 +627,10 @@ union _hmq1725_4way_context_overlay
|
||||
sph_whirlpool_context whirlpool;
|
||||
sha512_4way_context sha512;
|
||||
haval256_5_4way_context haval;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl2;
|
||||
echo_2way_context echo2;
|
||||
#endif
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
|
||||
@@ -750,15 +748,10 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
shavite512_full( &ctx.shavite, hash0, hash0, 64 );
|
||||
shavite512_full( &ctx.shavite, hash1, hash1, 64 );
|
||||
shavite512_full( &ctx.shavite, hash2, hash2, 64 );
|
||||
shavite512_full( &ctx.shavite, hash3, hash3, 64 );
|
||||
|
||||
intrlv_2x128_512( vhashA, hash0, hash1 );
|
||||
intrlv_2x128_512( vhashB, hash2, hash3 );
|
||||
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
|
||||
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
|
||||
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
@@ -795,6 +788,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
echo_2way_full( &ctx.echo2, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo2, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
||||
@@ -807,7 +811,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
(const BitSequence *)hash3, 64 );
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
blake512_4way_full( &ctx.blake, vhash, vhash, 64 );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
@@ -939,6 +945,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl2, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl2, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -948,6 +965,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
#endif
|
||||
|
||||
sha512_4way_init( &ctx.sha512 );
|
||||
sha512_4way_update( &ctx.sha512, vhash, 64 );
|
||||
sha512_4way_close( &ctx.sha512, vhash );
|
||||
|
@@ -68,7 +68,6 @@ void quark_8way_hash( void *state, const void *input )
|
||||
quark_8way_ctx_holder ctx;
|
||||
const uint32_t mask = 8;
|
||||
const __m512i bit3_mask = m512_const1_64( mask );
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
|
||||
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
|
||||
|
||||
@@ -76,9 +75,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
|
||||
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
@@ -154,8 +151,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
|
||||
@@ -169,8 +165,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
|
||||
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
vh_mask = _mm512_testn_epi64_mask( vh[0], bit3_mask );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
{
|
||||
|
@@ -1,291 +0,0 @@
|
||||
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
|
||||
/*
|
||||
* This file contains some functions which implement the external data
|
||||
* handling and padding for Merkle-Damgard hash functions which follow
|
||||
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
|
||||
*
|
||||
* API: this file is meant to be included, not compiled as a stand-alone
|
||||
* file. Some macros must be defined:
|
||||
* RFUN name for the round function
|
||||
* HASH "short name" for the hash function
|
||||
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
|
||||
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
|
||||
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
|
||||
* LE64 defined for little-endian, 64-bit based (no example yet)
|
||||
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
|
||||
* BLEN if defined, length of a message block (in bytes)
|
||||
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
|
||||
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
|
||||
* SVAL if defined, reference to the context state information
|
||||
*
|
||||
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
|
||||
* this is used for instance for Tiger, which works on 64-bit words but
|
||||
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
|
||||
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
|
||||
* set, then only one word (64 bits) will be used to encode the input
|
||||
* message length (in bits), otherwise two words will be used (as in
|
||||
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
|
||||
* not PLW1), four 64-bit words will be used to encode the message length
|
||||
* (in bits). Note that regardless of those settings, only 64-bit message
|
||||
* lengths are supported (in bits): messages longer than 2 Exabytes will be
|
||||
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
|
||||
* 2 millions Terabytes, which is huge).
|
||||
*
|
||||
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
|
||||
* function. This is used for Tiger2, which is identical to Tiger except
|
||||
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
|
||||
* of the 0x01 from original Tiger).
|
||||
*
|
||||
* The RFUN function is invoked with two arguments, the first pointing to
|
||||
* aligned data (as a "const void *"), the second being state information
|
||||
* from the context structure. By default, this state information is the
|
||||
* "val" field from the context, and this field is assumed to be an array
|
||||
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
|
||||
* from the context structure. The "val" field can have any type, except
|
||||
* for the output encoding which assumes that it is an array of "sph_u32"
|
||||
* values. By defining NO_OUTPUT, this last step is deactivated; the
|
||||
* includer code is then responsible for writing out the hash result. When
|
||||
* NO_OUTPUT is defined, the third parameter to the "close()" function is
|
||||
* ignored.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
#undef SPH_XCAT
|
||||
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
|
||||
#undef SPH_XCAT_
|
||||
#define SPH_XCAT_(a, b) a ## b
|
||||
|
||||
#undef SPH_BLEN
|
||||
#undef SPH_WLEN
|
||||
#if defined BE64 || defined LE64
|
||||
#define SPH_BLEN 128U
|
||||
#define SPH_WLEN 8U
|
||||
#else
|
||||
#define SPH_BLEN 64U
|
||||
#define SPH_WLEN 4U
|
||||
#endif
|
||||
|
||||
#ifdef BLEN
|
||||
#undef SPH_BLEN
|
||||
#define SPH_BLEN BLEN
|
||||
#endif
|
||||
|
||||
#undef SPH_MAXPAD
|
||||
#if defined PLW1
|
||||
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
|
||||
#elif defined PLW4
|
||||
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2))
|
||||
#else
|
||||
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1))
|
||||
#endif
|
||||
|
||||
#undef SPH_VAL
|
||||
#undef SPH_NO_OUTPUT
|
||||
#ifdef SVAL
|
||||
#define SPH_VAL SVAL
|
||||
#define SPH_NO_OUTPUT 1
|
||||
#else
|
||||
#define SPH_VAL sc->val
|
||||
#endif
|
||||
|
||||
#ifndef CLOSE_ONLY
|
||||
|
||||
#ifdef SPH_UPTR
|
||||
static void
|
||||
SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
|
||||
#else
|
||||
void
|
||||
HASH ( void *cc, const void *data, size_t len )
|
||||
#endif
|
||||
{
|
||||
SPH_XCAT( HASH, _context ) *sc;
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
size_t ptr;
|
||||
|
||||
sc = cc;
|
||||
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = SPH_BLEN - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
|
||||
vdata = vdata + (clen>>3);
|
||||
ptr += clen;
|
||||
len -= clen;
|
||||
if ( ptr == SPH_BLEN )
|
||||
{
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
ptr = 0;
|
||||
}
|
||||
sc->count += clen;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SPH_UPTR
/*
 * Public update entry point when SPH_UPTR is defined.
 *
 * cc    hash context (SPH_XCAT(HASH, _context) *)
 * data  interleaved input, read as __m256i vectors
 * len   input length in bytes per lane
 *
 * Short inputs go straight to the buffered helper. Longer inputs first
 * top up a partially filled block, then hand the remainder to the helper.
 */
void
HASH (void *cc, const void *data, size_t len)
{
   SPH_XCAT(HASH, _context) *sc;
   const __m256i *vdata = (const __m256i*)data;
   unsigned ptr;

   if ( len < (2 * SPH_BLEN) )
   {
      SPH_XCAT(HASH, _short)(cc, data, len);
      return;
   }
   sc = cc;
   ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
   if ( ptr > 0 )
   {
      // Complete the block that is already partially buffered.
      unsigned t;
      t = SPH_BLEN - ptr;
      SPH_XCAT( HASH, _short )( cc, data, t );
      // Byte count -> vector count, same conversion the helper uses.
      vdata = vdata + (t>>3);
      len -= t;
   }
   // Continue from the advanced position. Passing the original data pointer
   // here would hash the first t bytes twice and never read the tail.
   SPH_XCAT( HASH, _short )( cc, vdata, len );
}
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Perform padding and produce result. The context is NOT reinitialized
|
||||
* by this function.
|
||||
*/
|
||||
static void
|
||||
SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
|
||||
void *dst, unsigned rnum )
|
||||
{
|
||||
SPH_XCAT(HASH, _context) *sc;
|
||||
unsigned ptr, u;
|
||||
sc = cc;
|
||||
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
|
||||
|
||||
//uint64_t *b= (uint64_t*)sc->buf;
|
||||
//uint64_t *s= (uint64_t*)sc->state;
|
||||
//printf("Vptr 1= %u\n", ptr);
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
|
||||
|
||||
#ifdef PW01
|
||||
sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x100 >> 8 );
|
||||
// sc->buf[ptr++] = 0x100 >> 8;
|
||||
#else
|
||||
// need to overwrite exactly one byte
|
||||
// sc->buf[ptr>>3] = _mm256_set_epi64x( 0, 0, 0, 0x80 );
|
||||
sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
|
||||
// ptr++;
|
||||
#endif
|
||||
ptr += 8;
|
||||
|
||||
//printf("Vptr 2= %u\n", ptr);
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
|
||||
|
||||
if ( ptr > SPH_MAXPAD )
|
||||
{
|
||||
memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
|
||||
}
|
||||
#if defined BE64
|
||||
#if defined PLW1
|
||||
sc->buf[ SPH_MAXPAD>>3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#elif defined PLW4
|
||||
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
|
||||
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
|
||||
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#else
|
||||
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
|
||||
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
#endif // PLW
|
||||
#else // LE64
|
||||
#if defined PLW1
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
#elif defined PLW4
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
|
||||
_mm256_set1_epi64x( c->count >> 61 );
|
||||
memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
|
||||
2 * SPH_WLEN );
|
||||
#else
|
||||
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
|
||||
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
|
||||
_mm256_set1_epi64x( sc->count >> 61 );
|
||||
#endif // PLW
|
||||
|
||||
#endif // LE64
|
||||
|
||||
//printf("Vptr 3= %u\n", ptr);
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
|
||||
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
|
||||
RFUN( sc->buf, SPH_VAL );
|
||||
|
||||
//printf("Vptr after= %u\n", ptr);
|
||||
//printf("VState %016llx %016llx %016llx %016llx\n", s[0], s[4], s[8], s[12] );
|
||||
//printf("VState %016llx %016llx %016llx %016llx\n", s[16], s[20], s[24], s[28] );
|
||||
|
||||
#ifdef SPH_NO_OUTPUT
|
||||
(void)dst;
|
||||
(void)rnum;
|
||||
(void)u;
|
||||
#else
|
||||
for ( u = 0; u < rnum; u ++ )
|
||||
{
|
||||
#if defined BE64
|
||||
((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
|
||||
#else // LE64
|
||||
((__m256i*)dst)[u] = sc->val[u];
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
|
||||
{
|
||||
SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@@ -1,108 +0,0 @@
|
||||
/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* WHIRLPOOL interface.
|
||||
*
|
||||
* WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
|
||||
* version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
|
||||
* (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
|
||||
* version, 2003, with a new diffusion matrix, also described as "plain
|
||||
* WHIRLPOOL"). All three variants are implemented here.
|
||||
*
|
||||
* The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
|
||||
* M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
|
||||
* NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
|
||||
*
|
||||
* The current WHIRLPOOL specification and a reference implementation
|
||||
* can be found on the WHIRLPOOL web page:
|
||||
* http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_whirlpool.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef WHIRLPOOL_HASH_4WAY_H__
|
||||
#define WHIRLPOOL_HASH_4WAY_H__
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for WHIRLPOOL.
|
||||
*/
|
||||
#define SPH_SIZE_whirlpool 512
|
||||
|
||||
/**
|
||||
* Output size (in bits) for WHIRLPOOL-0.
|
||||
*/
|
||||
#define SPH_SIZE_whirlpool0 512
|
||||
|
||||
/**
|
||||
* Output size (in bits) for WHIRLPOOL-1.
|
||||
*/
|
||||
#define SPH_SIZE_whirlpool1 512
|
||||
|
||||
/* Context structure for the 4-way WHIRLPOOL functions declared below. */
typedef struct {
|
||||
/* Eight 256-bit vectors, 64-byte aligned; presumably the data buffer. */
__m256i buf[8] __attribute__ ((aligned (64)));
|
||||
/* Eight 256-bit vectors; presumably the running hash state. */
__m256i state[8];
|
||||
/* NOTE(review): presumably the number of bytes processed so far -- confirm. */
sph_u64 count;
|
||||
} whirlpool_4way_context;
|
||||
|
||||
void whirlpool_4way_init( void *cc );
|
||||
|
||||
void whirlpool_4way( void *cc, const void *data, size_t len );
|
||||
|
||||
void whirlpool_4way_close( void *cc, void *dst );
|
||||
|
||||
/**
|
||||
* WHIRLPOOL-0 uses the same structure as plain WHIRLPOOL.
|
||||
*/
|
||||
typedef whirlpool_4way_context whirlpool0_4way_context;
|
||||
|
||||
#define whirlpool0_4way_init whirlpool_4way_init
|
||||
|
||||
void whirlpool0_4way( void *cc, const void *data, size_t len );
|
||||
|
||||
void whirlpool0_4way_close( void *cc, void *dst );
|
||||
|
||||
/**
|
||||
* WHIRLPOOL-1 uses the same structure as plain WHIRLPOOL.
|
||||
*/
|
||||
typedef whirlpool_4way_context whirlpool1_4way_context;
|
||||
|
||||
#define whirlpool1_4way_init whirlpool_4way_init
|
||||
|
||||
void whirlpool1_4way(void *cc, const void *data, size_t len);
|
||||
|
||||
void whirlpool1_4way_close(void *cc, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -12,6 +12,7 @@
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/shavite/shavite-hash-2way.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#if defined(__VAES__)
|
||||
@@ -22,15 +23,15 @@
|
||||
|
||||
#if defined (C11_8WAY)
|
||||
|
||||
typedef struct {
|
||||
union _c11_8way_context_overlay
|
||||
{
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
skein512_8way_context skein;
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
luffa_4way_context luffa;
|
||||
cube_4way_context cube;
|
||||
simd_4way_context simd;
|
||||
cube_4way_2buf_context cube;
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_context groestl;
|
||||
shavite512_4way_context shavite;
|
||||
@@ -40,32 +41,14 @@ typedef struct {
|
||||
sph_shavite512_context shavite;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
} c11_8way_ctx_holder;
|
||||
simd_4way_context simd;
|
||||
} __attribute__ ((aligned (64)));
|
||||
typedef union _c11_8way_context_overlay c11_8way_context_overlay;
|
||||
|
||||
c11_8way_ctx_holder c11_8way_ctx;
|
||||
static __thread __m512i c11_8way_midstate[16] __attribute__((aligned(64)));
|
||||
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
|
||||
|
||||
void init_c11_8way_ctx()
|
||||
{
|
||||
blake512_8way_init( &c11_8way_ctx.blake );
|
||||
bmw512_8way_init( &c11_8way_ctx.bmw );
|
||||
skein512_8way_init( &c11_8way_ctx.skein );
|
||||
jh512_8way_init( &c11_8way_ctx.jh );
|
||||
keccak512_8way_init( &c11_8way_ctx.keccak );
|
||||
luffa_4way_init( &c11_8way_ctx.luffa, 512 );
|
||||
cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 );
|
||||
simd_4way_init( &c11_8way_ctx.simd, 512 );
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_init( &c11_8way_ctx.groestl, 64 );
|
||||
shavite512_4way_init( &c11_8way_ctx.shavite );
|
||||
echo_4way_init( &c11_8way_ctx.echo, 512 );
|
||||
#else
|
||||
init_groestl( &c11_8way_ctx.groestl, 64 );
|
||||
sph_shavite512_init( &c11_8way_ctx.shavite );
|
||||
init_echo( &c11_8way_ctx.echo, 512 );
|
||||
#endif
|
||||
}
|
||||
|
||||
void c11_8way_hash( void *state, const void *input )
|
||||
int c11_8way_hash( void *state, const void *input, int thr_id )
|
||||
{
|
||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
|
||||
@@ -78,24 +61,19 @@ void c11_8way_hash( void *state, const void *input )
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
c11_8way_ctx_holder ctx;
|
||||
memcpy( &ctx, &c11_8way_ctx, sizeof(c11_8way_ctx) );
|
||||
c11_8way_context_overlay ctx;
|
||||
|
||||
// 1 Blake 4way
|
||||
blake512_8way_update( &ctx.blake, input, 80 );
|
||||
blake512_8way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_8way_close( &ctx.bmw, vhash );
|
||||
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
|
||||
c11_8way_midstate );
|
||||
|
||||
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
|
||||
groestl512_4way_init( &ctx.groestl, 64 );
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
|
||||
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
@@ -104,21 +82,14 @@ void c11_8way_hash( void *state, const void *input )
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7 );
|
||||
@@ -126,83 +97,56 @@ void c11_8way_hash( void *state, const void *input )
|
||||
#endif
|
||||
|
||||
// 4 JH
|
||||
jh512_8way_init( &ctx.jh );
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
// 5 Keccak
|
||||
keccak512_8way_init( &ctx.keccak );
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
// 6 Skein
|
||||
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx.skein, vhash );
|
||||
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa_4way_init( &ctx.luffa, 512 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
|
||||
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
|
||||
luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_4way_init( &ctx.shavite );
|
||||
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||
|
||||
shavite512_full( &ctx.shavite, hash0, hash0, 64 );
|
||||
shavite512_full( &ctx.shavite, hash1, hash1, 64 );
|
||||
shavite512_full( &ctx.shavite, hash2, hash2, 64 );
|
||||
shavite512_full( &ctx.shavite, hash3, hash3, 64 );
|
||||
shavite512_full( &ctx.shavite, hash4, hash4, 64 );
|
||||
shavite512_full( &ctx.shavite, hash5, hash5, 64 );
|
||||
shavite512_full( &ctx.shavite, hash6, hash6, 64 );
|
||||
shavite512_full( &ctx.shavite, hash7, hash7, 64 );
|
||||
|
||||
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
|
||||
intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
|
||||
|
||||
#endif
|
||||
|
||||
simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
|
||||
simd_4way_init( &ctx.simd, 512 );
|
||||
simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
|
||||
echo_4way_init( &ctx.echo, 512 );
|
||||
echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
|
||||
echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
||||
@@ -212,29 +156,22 @@ void c11_8way_hash( void *state, const void *input )
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
||||
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||
(const BitSequence *) hash4, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||
(const BitSequence *) hash5, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||
(const BitSequence *) hash6, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||
(const BitSequence *) hash7, 512 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
||||
(const BitSequence *)hash0, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
|
||||
(const BitSequence *)hash1, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
|
||||
(const BitSequence *)hash2, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
|
||||
(const BitSequence *)hash3, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash4, 512,
|
||||
(const BitSequence *)hash4, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash5, 512,
|
||||
(const BitSequence *)hash5, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash6, 512,
|
||||
(const BitSequence *)hash6, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash7, 512,
|
||||
(const BitSequence *)hash7, 64 );
|
||||
|
||||
#endif
|
||||
|
||||
@@ -246,225 +183,223 @@ void c11_8way_hash( void *state, const void *input )
|
||||
memcpy( state+160, hash5, 32 );
|
||||
memcpy( state+192, hash6, 32 );
|
||||
memcpy( state+224, hash7, 32 );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
__m128i edata[5] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
__m512i *noncev = (__m512i*)vdata + 9;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const __m512i eight = m512_const1_64( 8 );
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
max_nonce -= 8;
|
||||
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
|
||||
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
|
||||
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
|
||||
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
mm512_intrlv80_8x64( vdata, edata );
|
||||
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
|
||||
0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0 ) );
|
||||
blake512_8way_prehash_le( &blake512_8way_ctx, c11_8way_midstate, vdata );
|
||||
|
||||
do
|
||||
{
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
c11_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( ( ( hash+(i<<3) )[7] <= Htarg )
|
||||
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 8;
|
||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
do
|
||||
{
|
||||
if ( likely( c11_8way_hash( hash, vdata, thr_id ) ) )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( ( ( hash + ( lane << 3 ) )[7] <= targ32_d7 )
|
||||
&& valid_hash( hash +( lane << 3 ), ptarget ) && !bench )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, hash + ( lane << 3 ), mythr );
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (C11_4WAY)
|
||||
|
||||
typedef struct {
|
||||
union _c11_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl;
|
||||
echo512_2way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
simd_2way_context simd;
|
||||
hashState_echo echo;
|
||||
} c11_4way_ctx_holder;
|
||||
#endif
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cube_2way_context cube;
|
||||
shavite512_2way_context shavite;
|
||||
simd_2way_context simd;
|
||||
};
|
||||
typedef union _c11_4way_context_overlay c11_4way_context_overlay;
|
||||
|
||||
c11_4way_ctx_holder c11_4way_ctx;
|
||||
static __thread __m256i c11_4way_midstate[16] __attribute__((aligned(64)));
|
||||
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
|
||||
|
||||
void init_c11_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &c11_4way_ctx.blake );
|
||||
bmw512_4way_init( &c11_4way_ctx.bmw );
|
||||
init_groestl( &c11_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &c11_4way_ctx.skein );
|
||||
jh512_4way_init( &c11_4way_ctx.jh );
|
||||
keccak512_4way_init( &c11_4way_ctx.keccak );
|
||||
luffa_2way_init( &c11_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &c11_4way_ctx.shavite );
|
||||
simd_2way_init( &c11_4way_ctx.simd, 512 );
|
||||
init_echo( &c11_4way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
void c11_4way_hash( void *state, const void *input )
|
||||
int c11_4way_hash( void *state, const void *input, int thr_id )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashA[8*2] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
|
||||
c11_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
|
||||
c11_4way_context_overlay ctx;
|
||||
|
||||
// 1 Blake 4way
|
||||
blake512_4way_update( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
|
||||
c11_4way_midstate );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
// Serial
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
// 4way
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
// 4 JH
|
||||
#else
|
||||
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
// 5 Keccak
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// 6 Skein
|
||||
skein512_4way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
// Serial
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
// 7 Luffa
|
||||
intrlv_2x128( vhash, hash0, hash1, 512 );
|
||||
intrlv_2x128( vhashB, hash2, hash3, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
luffa_2way_init( &ctx.luffa, 512 );
|
||||
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
dintrlv_2x128( hash0, hash1, vhash, 512 );
|
||||
dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
// 8 Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
|
||||
cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
|
||||
// 10 Simd
|
||||
intrlv_2x128( vhash, hash0, hash1, 512 );
|
||||
intrlv_2x128( vhashB, hash2, hash3, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
|
||||
dintrlv_2x128( hash0, hash1, vhash, 512 );
|
||||
dintrlv_2x128( hash2, hash3, vhashB, 512 );
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
// 11 Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
||||
(const BitSequence *)hash0, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
|
||||
(const BitSequence *)hash1, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
|
||||
(const BitSequence *)hash2, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
|
||||
(const BitSequence *)hash3, 64 );
|
||||
|
||||
#endif
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
__m128i edata[5] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const __m256i four = m256_const1_64( 4 );
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
|
||||
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
|
||||
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
|
||||
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
do
|
||||
{
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
mm256_intrlv80_4x64( vdata, edata );
|
||||
|
||||
c11_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32(
|
||||
0, 3, 0, 2, 0, 1, 0, 0 ) );
|
||||
blake512_4way_prehash_le( &blake512_4way_ctx, c11_4way_midstate, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( ( ( hash+(i<<3) )[7] <= Htarg )
|
||||
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
do
|
||||
{
|
||||
if ( likely( c11_4way_hash( hash, vdata, thr_id ) ) )
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( ( hash + ( lane << 3 ) )[7] <= targ32_d7 )
|
||||
&& valid_hash( hash +( lane << 3 ), ptarget ) && !bench )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, hash + ( lane << 3 ), mythr );
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -3,11 +3,9 @@
|
||||
bool register_c11_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (C11_8WAY)
|
||||
init_c11_8way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_c11_8way;
|
||||
gate->hash = (void*)&c11_8way_hash;
|
||||
#elif defined (C11_4WAY)
|
||||
init_c11_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_c11_4way;
|
||||
gate->hash = (void*)&c11_4way_hash;
|
||||
#else
|
||||
|
@@ -14,14 +14,14 @@
|
||||
bool register_c11_algo( algo_gate_t* gate );
|
||||
#if defined(C11_8WAY)
|
||||
|
||||
void c11_8way_hash( void *state, const void *input );
|
||||
int c11_8way_hash( void *state, const void *input, int thr_id );
|
||||
int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_c11_8way_ctx();
|
||||
//void init_c11_8way_ctx();
|
||||
|
||||
#elif defined(C11_4WAY)
|
||||
|
||||
void c11_4way_hash( void *state, const void *input );
|
||||
int c11_4way_hash( void *state, const void *input, int thr_id );
|
||||
int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_c11_4way_ctx();
|
||||
|
@@ -163,7 +163,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
|
||||
{
|
||||
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||
size<<3 );
|
||||
bmw512_8way_update( &ctx.bmw, vhash, size );
|
||||
bmw512_8way_update( &ctx.bmw, vhash, size );
|
||||
}
|
||||
bmw512_8way_close( &ctx.bmw, vhash );
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
|
@@ -31,7 +31,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
|
||||
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
||||
s_ntime = masked_ntime;
|
||||
if ( !thr_id )
|
||||
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
|
||||
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
|
||||
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
|
||||
}
|
||||
|
||||
@@ -85,7 +85,7 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
|
||||
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
|
||||
s_ntime = masked_ntime;
|
||||
if ( !thr_id )
|
||||
applog( LOG_INFO, "Hash order %s, Nime %08x, time hash %08x",
|
||||
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
|
||||
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
|
||||
}
|
||||
|
||||
|
@@ -264,10 +264,8 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
mm512_intrlv80_8x64( vdata, edata );
|
||||
|
||||
*noncev = mm512_intrlv_blend_32( *noncev,
|
||||
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
|
||||
0, n+3, 0, n+2, 0, n+1, 0, n ) );
|
||||
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
|
||||
0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) );
|
||||
blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
|
||||
|
||||
do
|
||||
@@ -279,7 +277,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
@@ -291,8 +289,6 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#elif defined(X17_4WAY)
|
||||
|
||||
union _x17_4way_context_overlay
|
||||
@@ -322,6 +318,9 @@ union _x17_4way_context_overlay
|
||||
};
|
||||
typedef union _x17_4way_context_overlay x17_4way_context_overlay;
|
||||
|
||||
static __thread __m256i x17_4way_midstate[16] __attribute__((aligned(64)));
|
||||
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
|
||||
|
||||
int x17_4way_hash( void *state, const void *input, int thr_id )
|
||||
{
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
@@ -333,7 +332,10 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
|
||||
uint64_t hash3[8] __attribute__ ((aligned (32)));
|
||||
x17_4way_context_overlay ctx;
|
||||
|
||||
blake512_4way_full( &ctx.blake, vhash, input, 80 );
|
||||
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
|
||||
x17_4way_midstate );
|
||||
|
||||
// blake512_4way_full( &ctx.blake, vhash, input, 80 );
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
@@ -449,4 +451,54 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Nonce scanner for the 4-lane x17 path. Each loop iteration calls
// x17_4way_hash on vdata, checks the four lanes of the result and submits
// every lane that passes valid_hash against work->target, then steps the
// nonce by 4.
//
// work:        supplies work->data (pdata; pdata[19] is the starting nonce
//              and is overwritten) and work->target (ptarget).
// max_nonce:   scanning stops once n reaches max_nonce - 4.
// hashes_done: receives n - first_nonce, the number of nonces stepped over.
// mythr:       supplies the thread id used for the restart check and is
//              forwarded to submit_solution.
//
// Always returns 0.
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*4] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m128i edata[5] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
// Starts at word 28 of hash32; hash32_d7[lane] is the word compared with
// ptarget[7] in the loop below.
// NOTE(review): presumably hash word 7 of each lane in a 4x32 interleaved
// layout - confirm against x17_4way_hash.
uint32_t *hash32_d7 = &(hash32[7*4]);
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
// Tenth __m256i (index 9) of vdata.
// NOTE(review): presumably the interleaved element carrying each lane's
// nonce - confirm against mm256_intrlv80_4x64.
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const __m256i four = m256_const1_64( 4 );
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
// Pass the five 128-bit words of pdata through mm128_swap64_32 into edata.
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
|
||||
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
|
||||
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
|
||||
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
// Build the 4-lane input vdata from edata.
mm256_intrlv80_4x64( vdata, edata );
|
||||
// Add 0, 1, 2, 3 to the low 32-bit element of the four 64-bit lanes of
// *noncev so that each lane starts from a distinct offset.
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );
|
||||
// Prehash of vdata into x17_4way_midstate, done once before the nonce loop.
blake512_4way_prehash_le( &blake512_4way_ctx, x17_4way_midstate, vdata );
|
||||
|
||||
do
|
||||
{
|
||||
// A zero return from x17_4way_hash skips the lane checks for this batch.
if ( likely( x17_4way_hash( hash32, vdata, thr_id ) ) )
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
// Cheap filter on one 32-bit word per lane. It is never true when bench
// is set, so nothing is validated or submitted in benchmark mode.
if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) )
|
||||
{
|
||||
// Pull this lane's hash out of hash32 for the full target check.
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
// Advance the vector nonces and the scalar counter n together by 4.
*noncev = _mm256_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||
// Record where scanning stopped and how many nonces were covered.
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -6,7 +6,8 @@ bool register_x17_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x17_8way;
|
||||
gate->hash = (void*)&x17_8way_hash;
|
||||
#elif defined (X17_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_4way_64in_32out;
|
||||
gate->scanhash = (void*)&scanhash_x17_4way;
|
||||
// gate->scanhash = (void*)&scanhash_4way_64in_32out;
|
||||
gate->hash = (void*)&x17_4way_hash;
|
||||
#else
|
||||
gate->hash = (void*)&x17_hash;
|
||||
|
@@ -581,10 +581,8 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
mm512_intrlv80_8x64( vdata, edata );
|
||||
|
||||
*noncev = mm512_intrlv_blend_32( *noncev,
|
||||
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
|
||||
0, n+3, 0, n+2, 0, n+1, 0, n ) );
|
||||
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
|
||||
0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0 ) );
|
||||
blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata );
|
||||
|
||||
do
|
||||
@@ -941,9 +939,8 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
mm256_intrlv80_4x64( vdata, edata );
|
||||
|
||||
*noncev = mm256_intrlv_blend_32( *noncev,
|
||||
_mm256_set_epi32( 0, n+3, 0, n+2, 0, n+1, 0, n ) );
|
||||
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32(
|
||||
0, 3, 0, 2, 0, 1, 0, 0 ) );
|
||||
blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata );
|
||||
|
||||
do
|
||||
|
@@ -4,18 +4,39 @@
|
||||
# during development. However the information contained may provide compilation
|
||||
# tips to users.
|
||||
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 > /dev/null
|
||||
|
||||
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
|
||||
make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -march=icelake-client -Wall -fno-common" ./configure --with-curl
|
||||
# Rocketlake needs gcc-11
|
||||
#CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl
|
||||
make -j 8
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx512-sha-vaes
|
||||
|
||||
# Zen4 AVX512 SHA VAES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
# znver3 needs gcc-11, znver4 ?
|
||||
#CFLAGS="-O3 -march=znver4 -Wall -fno-common " ./configure --with-curl
|
||||
#CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
|
||||
CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
|
||||
make -j 8
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-zen4
|
||||
|
||||
# Zen3 AVX2 SHA VAES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
|
||||
#CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
|
||||
make -j 8
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-zen3
|
||||
|
||||
# AVX512 AES: Intel Core HEDT Skylake-X, Cascadelake
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
|
@@ -2,8 +2,8 @@
|
||||
#
|
||||
# make clean and rm all the targeted executables.
|
||||
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes > /dev/null
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 > /dev/null
|
||||
|
||||
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe > /dev/null
|
||||
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe > /dev/null
|
||||
|
||||
make distclean > /dev/null
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.2.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.3.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.20.2'
|
||||
PACKAGE_STRING='cpuminer-opt 3.20.2'
|
||||
PACKAGE_VERSION='3.20.3'
|
||||
PACKAGE_STRING='cpuminer-opt 3.20.3'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.20.2 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.20.3 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1404,7 +1404,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.20.2:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.20.3:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1509,7 +1509,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.20.2
|
||||
cpuminer-opt configure 3.20.3
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.20.2, which was
|
||||
It was created by cpuminer-opt $as_me 3.20.3, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2993,7 +2993,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.20.2'
|
||||
VERSION='3.20.3'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.20.2, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.20.3, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6784,7 +6784,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.20.2
|
||||
cpuminer-opt config.status 3.20.3
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.20.2])
|
||||
AC_INIT([cpuminer-opt], [3.20.3])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
24
cpu-miner.c
24
cpu-miner.c
@@ -390,11 +390,11 @@ bool std_le_work_decode( struct work *work )
|
||||
{
|
||||
int i;
|
||||
const int adata_sz = algo_gate.get_work_data_size() / 4;
|
||||
const int atarget_sz = ARRAY_SIZE(work->target);
|
||||
// const int atarget_sz = ARRAY_SIZE(work->target);
|
||||
|
||||
for ( i = 0; i < adata_sz; i++ )
|
||||
work->data[i] = le32dec( work->data + i );
|
||||
for ( i = 0; i < atarget_sz; i++ )
|
||||
for ( i = 0; i < 8; i++ )
|
||||
work->target[i] = le32dec( work->target + i );
|
||||
return true;
|
||||
}
|
||||
@@ -403,11 +403,11 @@ bool std_be_work_decode( struct work *work )
|
||||
{
|
||||
int i;
|
||||
const int adata_sz = algo_gate.get_work_data_size() / 4;
|
||||
const int atarget_sz = ARRAY_SIZE(work->target);
|
||||
// const int atarget_sz = ARRAY_SIZE(work->target);
|
||||
|
||||
for ( i = 0; i < adata_sz; i++ )
|
||||
work->data[i] = be32dec( work->data + i );
|
||||
for ( i = 0; i < atarget_sz; i++ )
|
||||
for ( i = 0; i < 8; i++ )
|
||||
work->target[i] = le32dec( work->target + i );
|
||||
return true;
|
||||
}
|
||||
@@ -518,11 +518,10 @@ static bool get_mininginfo( CURL *curl, struct work *work )
|
||||
|
||||
static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
{
|
||||
int i, n;
|
||||
uint32_t prevhash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t target[8] __attribute__ ((aligned (32)));
|
||||
unsigned char final_sapling_hash[32] __attribute__ ((aligned (32)));
|
||||
uint32_t version, curtime, bits;
|
||||
uint32_t prevhash[8];
|
||||
uint32_t target[8];
|
||||
unsigned char final_sapling_hash[32];
|
||||
int cbtx_size;
|
||||
uchar *cbtx = NULL;
|
||||
int tx_count, tx_size;
|
||||
@@ -534,6 +533,7 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
bool version_reduce = false;
|
||||
json_t *tmp, *txa;
|
||||
bool rc = false;
|
||||
int i, n;
|
||||
|
||||
// Segwit BEGIN
|
||||
bool segwit = false;
|
||||
@@ -898,7 +898,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
applog( LOG_ERR, "JSON invalid target" );
|
||||
goto out;
|
||||
}
|
||||
for ( i = 0; i < ARRAY_SIZE( work->target ); i++ )
|
||||
|
||||
for ( i = 0; i < 8; i++ )
|
||||
work->target[7 - i] = be32dec( target + i );
|
||||
net_diff = work->targetdiff = hash_to_diff( work->target );
|
||||
|
||||
@@ -1459,6 +1460,7 @@ char* std_malloc_txs_request( struct work *work )
|
||||
json_t *val;
|
||||
char data_str[2 * sizeof(work->data) + 1];
|
||||
int i;
|
||||
// datasize is an ugly hack, it should go through the gate
|
||||
int datasize = work->sapling ? 112 : 80;
|
||||
|
||||
for ( i = 0; i < ARRAY_SIZE(work->data); i++ )
|
||||
@@ -2163,7 +2165,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
char block_ttf[32];
|
||||
char share_ttf[32];
|
||||
|
||||
sprintf_et( block_ttf, nd / hr );
|
||||
sprintf_et( block_ttf, nd / hr );
|
||||
sprintf_et( share_ttf, ( g_work->targetdiff * exp32 ) / hr );
|
||||
scale_hash_for_display ( &hr, hr_units );
|
||||
applog2( LOG_INFO, "TTF @ %.2f %sh/s: Block %s, Share %s",
|
||||
@@ -3992,7 +3994,7 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize stats times and counters
|
||||
// Initialize stats timers and counters
|
||||
memset( share_stats, 0, s_stats_size * sizeof (struct share_stats_t) );
|
||||
gettimeofday( &last_submit_time, NULL );
|
||||
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
|
||||
|
25
simd-utils.h
25
simd-utils.h
@@ -57,10 +57,15 @@
|
||||
// 32 bytes for 256 bit vectors and 64 bytes for 512 bit vectors. 64 byte
|
||||
// alignment is recommended in all cases for best cache alignment.
|
||||
//
|
||||
// All functions are defined with type agnostic pointers (void*) arguments
|
||||
// and are cast or aliased as the appropriate type. This adds convenience
|
||||
// for the applications but also adds responsibility to ensure adequate data
|
||||
// alignment.
|
||||
//
|
||||
// Windows has problems with function vector arguments larger than
|
||||
// 128 bits. Stack alignment is only guaranteed to 16 bytes. Always use
|
||||
// pointers for larger vectors in function arguments. Macros can be
|
||||
// used for larger value arguments.
|
||||
// pointers for larger vectors in function arguments. Macros can be used
|
||||
// for larger value arguments.
|
||||
//
|
||||
// An attempt was made to make the names as similar as possible to
|
||||
// Intel's intrinsic function format. Most variations are to avoid
|
||||
@@ -74,7 +79,7 @@
|
||||
// to avoid the ambiguity of "mm".
|
||||
// - the element size does not include additional type specifiers
|
||||
// like "epi".
|
||||
// - some macros contain value args that are updated.
|
||||
// - some macros may contain value args that are updated.
|
||||
// - specialized shift and rotate functions that move elements around
|
||||
// use the notation "1x32" to indicate the distance moved as units of
|
||||
// the element size.
|
||||
@@ -86,10 +91,10 @@
|
||||
//
|
||||
// Function names follow this pattern:
|
||||
//
|
||||
// prefix_op[esize]_[vsize]
|
||||
// prefix_op[vsize]_[esize]
|
||||
//
|
||||
// Prefix: usually the size of the largest vectors used. Following
|
||||
// are some examples:
|
||||
// Prefix: usually the size of the returned vector.
|
||||
// Following are some examples:
|
||||
//
|
||||
// u64: unsigned 64 bit integer function
|
||||
// i128: signed 128 bit integer function (rarely used)
|
||||
@@ -102,10 +107,12 @@
|
||||
// esize: optional, element size of operation
|
||||
//
|
||||
// vsize: optional, lane size used when a function operates on elements
|
||||
// of vectors within lanes of a vector.
|
||||
// within lanes of a larger vector.
|
||||
//
|
||||
// Ex: mm256_ror1x64_128 rotates each 128 bit lane of a 256 bit vector
|
||||
// right by 64 bits.
|
||||
// m256_const_64 defines a vector constructed from the supplied 64 bit
|
||||
// integer arguments.
|
||||
// mm256_shuflr128_32 rotates each 128 bit lane of a 256 bit vector
|
||||
// right by 32 bits.
|
||||
//
|
||||
// Vector constants
|
||||
//
|
||||
|
@@ -302,6 +302,44 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
//
|
||||
// Extended bit shift for concatenated packed elements from 2 vectors.
|
||||
// Shift right returns low half, shift left return high half.
|
||||
|
||||
#if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
|
||||
|
||||
#define mm128_shl2_64( v1, v2, c ) _mm_shldi_epi64( v1, v2, c )
|
||||
#define mm128_shr2_64( v1, v2, c ) _mm_shrdi_epi64( v1, v2, c )
|
||||
|
||||
#define mm128_shl2_32( v1, v2, c ) _mm_shldi_epi32( v1, v2, c )
|
||||
#define mm128_shr2_32( v1, v2, c ) _mm_shrdi_epi32( v1, v2, c )
|
||||
|
||||
#define mm128_shl2_16( v1, v2, c ) _mm_shldi_epi16( v1, v2, c )
|
||||
#define mm128_shr2_16( v1, v2, c ) _mm_shrdi_epi16( v1, v2, c )
|
||||
|
||||
#else
|
||||
|
||||
#define mm128_shl2_64( v1, v2, c ) \
|
||||
_mm_or_si128( _mm_slli_epi64( v1, c ), _mm_srli_epi64( v2, 64 - (c) ) )
|
||||
|
||||
#define mm128_shr2_64( v1, v2, c ) \
|
||||
_mm_or_si128( _mm_srli_epi64( v2, c ), _mm_slli_epi64( v1, 64 - (c) ) )
|
||||
|
||||
#define mm128_shl2_32( v1, v2, c ) \
|
||||
_mm_or_si128( _mm_slli_epi32( v1, c ), _mm_srli_epi32( v2, 32 - (c) ) )
|
||||
|
||||
#define mm128_shr2_32( v1, v2, c ) \
|
||||
_mm_or_si128( _mm_srli_epi32( v2, c ), _mm_slli_epi32( v1, 32 - (c) ) )
|
||||
|
||||
#define mm128_shl2_16( v1, v2, c ) \
|
||||
_mm_or_si128( _mm_slli_epi16( v1, c ), _mm_srli_epi16( v2, 16 - (c) ) )
|
||||
|
||||
#define mm128_shr2_16( v1, v2, c ) \
|
||||
_mm_or_si128( _mm_srli_epi16( v2, c ), _mm_slli_epi16( v1, 16 - (c) ) )
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
//
|
||||
// Bit rotations
|
||||
@@ -402,13 +440,13 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
//
|
||||
// Limited 2 input shuffle, combines shuffle with blend. The destination low
|
||||
// half is always taken from src a, and the high half from src b.
|
||||
#define mm128_shuffle2_64( a, b, c ) \
|
||||
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
|
||||
_mm_castsi128_pd( b ), c ) );
|
||||
#define mm128_shuffle2_64( v1, v2, c ) \
|
||||
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
|
||||
_mm_castsi128_pd( v2 ), c ) );
|
||||
|
||||
#define mm128_shuffle2_32( a, b, c ) \
|
||||
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
|
||||
_mm_castsi128_ps( b ), c ) );
|
||||
#define mm128_shuffle2_32( v1, v2, c ) \
|
||||
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
|
||||
_mm_castsi128_ps( v2 ), c ) );
|
||||
|
||||
//
|
||||
// Rotate vector elements across all lanes
|
||||
@@ -574,30 +612,68 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
|
||||
|
||||
// Two input shuffle-rotate.
|
||||
// Concatenate v1 & v2 and bit rotate as one 256 bit vector.
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
// Concatenate v1 & v2 and byte rotate as a 256 bit vector.
|
||||
// Function macros with two inputs and one output, inputs are preserved.
|
||||
// Returns the high 128 bits, ie updated v1.
|
||||
// These functions are preferred but only available with SSSE3. Use procedure
|
||||
// macros below for SSE2 compatibility.
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
|
||||
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
|
||||
|
||||
/*
|
||||
#define mm128_shufl2r_32( v1, v2 ) _mm_alignr_epi8( v2, v1, 4 )
|
||||
#define mm128_shufl2l_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
|
||||
|
||||
#define mm128_shufl2r_16( v1, v2 ) _mm_alignr_epi8( v2, v1, 2 )
|
||||
#define mm128_shufl2l_16( v1, v2 ) _mm_alignr_epi8( v1, v2, 2 )
|
||||
|
||||
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
|
||||
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
|
||||
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 1 )
|
||||
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 1 )
|
||||
*/
|
||||
|
||||
#else
|
||||
|
||||
#define mm128_shufl2r_64( v1, v2 ) \
|
||||
_mm_or_si128( _mm_srli_si128( v1, 8 ), \
|
||||
_mm_slli_si128( v2, 8 ) )
|
||||
|
||||
#define mm128_shufl2l_64( v1, v2 ) \
|
||||
_mm_or_si128( _mm_slli_si128( v1, 8 ), \
|
||||
_mm_srli_si128( v2, 8 ) )
|
||||
/*
|
||||
#define mm128_shufl2r_32( v1, v2 ) \
|
||||
_mm_or_si128( _mm_srli_si128( v1, 4 ), \
|
||||
_mm_slli_si128( v2, 12 ) )
|
||||
|
||||
#define mm128_shufl2l_32( v1, v2 ) \
|
||||
_mm_or_si128( _mm_slli_si128( v1, 4 ), \
|
||||
_mm_srli_si128( v2, 12 ) )
|
||||
|
||||
#define mm128_shufl2r_16( v1, v2 ) \
|
||||
_mm_or_si128( _mm_srli_si128( v1, 2 ), \
|
||||
_mm_slli_si128( v2, 14 ) )
|
||||
|
||||
#define mm128_shufl2l_16( v1, v2 ) \
|
||||
_mm_or_si128( _mm_slli_si128( v1, 2 ), \
|
||||
_mm_srli_si128( v2, 14 ) )
|
||||
|
||||
#define mm128_shufl2r_8( v1, v2 ) \
|
||||
_mm_or_si128( _mm_srli_si128( v1, 1 ), \
|
||||
_mm_slli_si128( v2, 15 ) )
|
||||
|
||||
#define mm128_shufl2l_8( v1, v2 ) \
|
||||
_mm_or_si128( _mm_slli_si128( v1, 1 ), \
|
||||
_mm_srli_si128( v2, 15 ) )
|
||||
*/
|
||||
#endif
|
||||
|
||||
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
|
||||
// Deprecated for SSSE3 and above, SSSE3 versions exist only for
|
||||
// compatibility with existing code.
|
||||
// vrol & vror are deprecated and do not exist for larger vectors.
|
||||
// Their only use is by lyra2 blake2b when AVX2 is not available and is
|
||||
// grandfathered.
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
#define mm128_vror256_64( v1, v2 ) \
|
||||
do { \
|
||||
@@ -613,6 +689,7 @@ do { \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
#define mm128_vror256_32( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
|
||||
@@ -654,6 +731,7 @@ do { \
|
||||
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
#else // SSE2
|
||||
|
||||
@@ -674,7 +752,7 @@ do { \
|
||||
_mm_srli_si128( v1, 8 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
#define mm128_vror256_32( v1, v2 ) \
|
||||
do { \
|
||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
|
||||
@@ -728,7 +806,7 @@ do { \
|
||||
_mm_srli_si128( v1, 15 ) ); \
|
||||
v1 = t; \
|
||||
} while(0)
|
||||
|
||||
*/
|
||||
#endif // SSE4.1 else SSE2
|
||||
|
||||
#endif // __SSE2__
|
||||
|
@@ -1,30 +1,28 @@
|
||||
#if !defined(SIMD_256_H__)
|
||||
#define SIMD_256_H__ 1
|
||||
|
||||
//#if defined(__AVX2__)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// AVX2 256 bit vectors
|
||||
//
|
||||
// Basic support for 256 bit vectors is available with AVX but integer
|
||||
// support requires AVX2.
|
||||
// Some 256 bit vector utilities require AVX512 or have more efficient
|
||||
// AVX512 implementations. They will be selected automatically but their use
|
||||
// is limited because 256 bit vectors are less likely to be used when 512
|
||||
// is available.
|
||||
//
|
||||
// AVX2 version of _mm256_shuffle_epi8 is limited to 128 bit lanes but AVX512
|
||||
// version is not. Some usage has the index vector encoded as if full vector
|
||||
// AVX512VL backports some AVX512 features to 256 bit vectors and can produce
|
||||
// more efficient implementations of some functions. They will be selected
|
||||
// automatically but their use is limited because 256 bit vectors are less
|
||||
// likely to be used when 512 is available.
|
||||
//
|
||||
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
|
||||
// lanes and data can't cross the 128 bit lane boundary.
|
||||
// Some usage may have the index vector encoded as if full vector
|
||||
// shuffles are supported. This has no side effects and would have the same
|
||||
// results using either version.
|
||||
// If needed and AVX512 is available, 256 bit full vector shuffles can be
|
||||
// implemented using the AVX512 zero-mask feature with a NULL mask.
|
||||
// Using intrinsics it's simple:
|
||||
// _mm256_maskz_shuffle_epi8( k0, v, c )
|
||||
// If the need arises and AVX512VL is available, 256 bit full vector shuffles
|
||||
// can be implemented using the AVX512 zero-mask feature with a NULL mask.
|
||||
// Using intrinsics it's simple: _mm256_maskz_shuffle_epi8( 0, v, c )
|
||||
// With asm it's a bit more complicated with the addition of the mask register
|
||||
// and zero tag:
|
||||
// vpshufb ymm0{k0}{z}, ymm1, ymm2
|
||||
// and zero tag: vpshufb ymm0{k0}{z}, ymm1, ymm2
|
||||
|
||||
#if defined(__AVX__)
|
||||
|
||||
@@ -57,8 +55,8 @@ typedef union
|
||||
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
|
||||
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// Move integer to low element of vector, other elements are set to zero.
|
||||
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
|
||||
@@ -72,7 +70,6 @@ typedef union
|
||||
//#define mm256_mov256_64 u64_mov256_64
|
||||
//#define mm256_mov256_32 u32_mov256_32
|
||||
|
||||
|
||||
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
|
||||
#define mm256_concat_128( hi, lo ) \
|
||||
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
|
||||
@@ -145,8 +142,17 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
// Basic operations without SIMD equivalent
|
||||
|
||||
// Bitwise not ( ~v )
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
// Bitwise NOT done with one ternary logic op. All three inputs are the same
// vector, so only truth table entries 0 and 7 are reachable; imm8 0x01 maps
// a 0 bit to 1 and a 1 bit to 0.
static inline __m256i mm256_not( const __m256i v )
{
   return _mm256_ternarylogic_epi64( v, v, v, 0x01 );
}
|
||||
|
||||
#else
|
||||
|
||||
// Used when AVX512VL is not available: XOR with m256_neg1 (defined elsewhere,
// presumably all bits set - confirm) inverts every bit.
// The stray trailing line continuation was removed so the macro can't absorb
// whatever follows it.
#define mm256_not( v )       _mm256_xor_si256( v, m256_neg1 )
|
||||
|
||||
#endif
|
||||
|
||||
// Unary negation of each element ( -v )
|
||||
#define mm256_negate_64( v ) _mm256_sub_epi64( m256_zero, v )
|
||||
#define mm256_negate_32( v ) _mm256_sub_epi32( m256_zero, v )
|
||||
@@ -281,6 +287,50 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
_mm256_blend_epi32( v3, v2, 0x44) \
|
||||
_mm256_blend_epi32( v1, v0, 0x11 ) )
|
||||
|
||||
/*
|
||||
//
|
||||
// Extended bit shift for concatenated packed elements from 2 vectors.
|
||||
// Shift right returns low half, shift left returns high half.
|
||||
|
||||
#if defined(__AVX512VBMI2__) && defined(__AVX512VL__)
|
||||
|
||||
#define mm256_shl2_64( v1, v2, c ) _mm256_shldi_epi64( v1, v2, c )
|
||||
#define mm256_shr2_64( v1, v2, c ) _mm256_shrdi_epi64( v1, v2, c )
|
||||
|
||||
#define mm256_shl2_32( v1, v2, c ) _mm256_shldi_epi32( v1, v2, c )
|
||||
#define mm256_shr2_32( v1, v2, c ) _mm256_shrdi_epi32( v1, v2, c )
|
||||
|
||||
#define mm256_shl2_16( v1, v2, c ) _mm256_shldi_epi16( v1, v2, c )
|
||||
#define mm256_shr2_16( v1, v2, c ) _mm256_shrdi_epi16( v1, v2, c )
|
||||
|
||||
#else
|
||||
|
||||
#define mm256_shl2_64( v1, v2, c ) \
|
||||
_mm256_or_si256( _mm256_slli_epi64( v1, c ), \
|
||||
_mm256_srli_epi64( v2, 64 - (c) ) )
|
||||
|
||||
#define mm256_shr2_64( v1, v2, c ) \
|
||||
_mm256_or_si256( _mm256_srli_epi64( v2, c ), \
|
||||
_mm256_slli_epi64( v1, 64 - (c) ) )
|
||||
|
||||
#define mm256_shl2_32( v1, v2, c ) \
|
||||
_mm256_or_si256( _mm256_slli_epi32( v1, c ), \
|
||||
_mm256_srli_epi32( v2, 32 - (c) ) )
|
||||
|
||||
#define mm256_shr2_32( v1, v2, c ) \
|
||||
_mm256_or_si256( _mm256_srli_epi32( v2, c ), \
|
||||
_mm256_slli_epi32( v1, 32 - (c) ) )
|
||||
|
||||
#define mm256_shl2_16( v1, v2, c ) \
|
||||
_mm256_or_si256( _mm256_slli_epi16( v1, c ), \
|
||||
_mm256_srli_epi16( v2, 16 - (c) ) )
|
||||
|
||||
#define mm256_shr2_16( v1, v2, c ) \
|
||||
_mm256_or_si256( _mm256_srli_epi16( v2, c ), \
|
||||
_mm256_slli_epi16( v1, 16 - (c) ) )
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
//
|
||||
// Bit rotations.
|
||||
@@ -414,13 +464,13 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
// Rotate elements within each 128 bit lane of 256 bit vector.
|
||||
|
||||
// Limited 2 input shuffle
|
||||
#define mm256_shuffle2_64( a, b, c ) \
|
||||
_mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( a ), \
|
||||
_mm256_castsi256_pd( b ), c ) );
|
||||
// 2 input shuffle of 64 bit elements, implemented by casting to packed
// doubles and using _mm256_shuffle_pd with immediate control c.
// The expansion has no trailing semicolon so the macro can be used as an
// expression; callers terminate the statement themselves.
#define mm256_shuffle2_64( v1, v2, c ) \
   _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( v1 ), \
                                           _mm256_castsi256_pd( v2 ), c ) )
|
||||
|
||||
#define mm256_shuffle2_32( a, b, c ) \
|
||||
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
|
||||
_mm256_castsi256_ps( b ), c ) );
|
||||
// 2 input shuffle of 32 bit elements, implemented by casting to packed
// floats and using _mm256_shuffle_ps with immediate control c.
// The expansion has no trailing semicolon so the macro can be used as an
// expression; callers terminate the statement themselves.
#define mm256_shuffle2_32( v1, v2, c ) \
   _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
                                           _mm256_castsi256_ps( v2 ), c ) )
|
||||
|
||||
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
|
||||
#define mm256_shuflr128_64 mm256_swap128_64
|
||||
|
@@ -2,42 +2,49 @@
|
||||
#define SIMD_512_H__ 1
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////
|
||||
//
|
||||
// AVX-512
|
||||
// AVX512 512 bit vectors
|
||||
//
|
||||
// The baseline for these utilities is AVX512F, AVX512DQ, AVX512BW
|
||||
// and AVX512VL, first available in quantity in Skylake-X.
|
||||
// Some utilities may require additional features available in subsequent
|
||||
// architectures and are noted.
|
||||
|
||||
// Some utilities may require additional AVX512 extensions available in
|
||||
// subsequent architectures and are noted where used.
|
||||
// AVX512VL is used to backport AVX512 instructions to 128 and 256 bit
|
||||
// vectors. It is therefore not technically required for any 512 bit vector
|
||||
// utilities defined below.
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// AVX512 intrinsics have a few changes from previous conventions.
|
||||
//
|
||||
// cmp instruction now returns a bitmask instead of a vector mask.
|
||||
// This eliminates the need for the blendv instruction.
|
||||
// "_mm512_cmp" instructions now return a bitmask instead of a vector mask.
|
||||
// This removes the need for an explicit movemask instruction.
|
||||
//
|
||||
// The new rotate instructions require the count to be an 8 bit
|
||||
// immediate value only. Compilation fails if a variable is used.
|
||||
// The documentation is the same as for shift and it works with
|
||||
// variables. The inconsistency is likely due to compiler optimizations
|
||||
// that can eliminate the variable in some instances.
|
||||
// Many previously sizeless (si) instructions now have sized (epi) versions
|
||||
// to accommodate masking packed elements.
|
||||
//
|
||||
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
|
||||
// usually shuffles across all lanes.
|
||||
// Many AVX512 instructions have a different argument order from the AVX2
|
||||
// versions of similar instructions. There is also some inconsistency in how
|
||||
// different AVX512 instructions position the mask register in the argument
|
||||
// list.
|
||||
//
|
||||
// permutexvar has args reversed, index is first arg. Previously all
|
||||
// permutes and shuffles have the index last.
|
||||
// "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other
|
||||
// AVX512 permutes can cross all lanes.
|
||||
//
|
||||
// _mm512_permutexvar_epi8 requires AVX512-VBMI, larger elements don't.
|
||||
// It also performs the same op as _mm512_shuffle_epi8.
|
||||
// "_mm512_shuffle_epi8" shuffles across the entire 512 bits. Shuffle
|
||||
// instructions generally don't cross 128 bit lane boundaries and the AVX2
|
||||
// version of this specific instruction does not.
|
||||
//
|
||||
// shuffle_epi8 shuffles across entire 512 bits. Shuffle usually
|
||||
// doesn't cross 128 bit lane boundaries but is consistent with AVX2
|
||||
// where shuffle_epi8 spans the entire vector.
|
||||
// New alignr instructions for epi64 and epi32 operate across the entire
|
||||
// vector. "_mm512_alignr_epi8" continues to be restricted to 128 bit lanes.
|
||||
//
|
||||
// There are 2 areas where overhead is a concern: constants and
|
||||
// "_mm512_permutexvar_epi8" and "_mm512_permutex2var_epi8" require
|
||||
// AVX512-VBMI. The same instructions with larger elements don't have this
|
||||
// requirement. "_mm512_permutexvar_epi8" also performs the same operation
|
||||
// as "_mm512_shuffle_epi8" which only requires AVX512-BW.
|
||||
//
|
||||
// There are 2 areas where overhead is a major concern: constants and
|
||||
// permutations.
|
||||
//
|
||||
// Constants need to be composed at run time by assembling individual
|
||||
@@ -60,13 +67,10 @@
|
||||
// The same rules apply, if an index is to be reused it should be defined
|
||||
// as a local. This applies specifically to bswap operations.
|
||||
//
|
||||
// Additionally, permutations using smaller vectors can be more efficient
|
||||
// if the permutation doesn't cross lane boundaries, typically 128 bits,
|
||||
// and the smaller vector can use an imm control.
|
||||
//
|
||||
// If the permutation doesn't cross lane boundaries a shuffle instruction
|
||||
// can be used with imm control instead of permute.
|
||||
|
||||
// Permutations that cross 128 bit lanes are typically slower and often need
|
||||
// a vector control index. If the permutation doesn't need to cross 128 bit
|
||||
// lanes a shuffle instruction can often be used with an imm control.
|
||||
//
|
||||
//////////////////////////////////////////////////////////////
|
||||
//
|
||||
// AVX512 512 bit vectors
|
||||
@@ -179,12 +183,12 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||
//
|
||||
// Basic operations without SIMD equivalent
|
||||
|
||||
// ~x
|
||||
// Bitwise NOT: ~x
|
||||
// #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
|
||||
// Bitwise NOT done with one ternary logic op. All three inputs are the same
// vector, so only truth table entries 0 and 7 are reachable; imm8 0x01 maps
// a 0 bit to 1 and a 1 bit to 0.
static inline __m512i mm512_not( const __m512i x )
{
   return _mm512_ternarylogic_epi64( x, x, x, 0x01 );
}
|
||||
|
||||
// -x
|
||||
// Unary negation: -x
|
||||
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
|
||||
#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
|
||||
#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
|
||||
@@ -269,7 +273,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
#define mm512_xoror( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x1e )
|
||||
|
||||
// a ^ ( ~b & c ) xor( a, andnot( b, c ) )
|
||||
// a ^ ( ~b & c ), xor( a, andnot( b, c ) )
|
||||
#define mm512_xorandnot( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )
|
||||
|
||||
@@ -310,8 +314,50 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
_mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
|
||||
_mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
|
||||
|
||||
/*
|
||||
//
|
||||
// Extended bit shift of concatenated packed elements from 2 vectors.
|
||||
// Shift right returns low half, shift left returns high half.
|
||||
|
||||
#if defined(__AVX512VBMI2__)
|
||||
|
||||
#define mm512_shl2_64( v1, v2, c ) _mm512_shldi_epi64( v1, v2, c )
|
||||
#define mm512_shr2_64( v1, v2, c ) _mm512_shrdi_epi64( v1, v2, c )
|
||||
|
||||
#define mm512_shl2_32( v1, v2, c ) _mm512_shldi_epi32( v1, v2, c )
|
||||
#define mm512_shr2_32( v1, v2, c ) _mm512_shrdi_epi32( v1, v2, c )
|
||||
|
||||
#define mm512_shl2_16( v1, v2, c ) _mm512_shldi_epi16( v1, v2, c )
|
||||
#define mm512_shr2_16( v1, v2, c ) _mm512_shrdi_epi16( v1, v2, c )
|
||||
|
||||
#else
|
||||
|
||||
#define mm512_shl2_64( v1, v2, c ) \
|
||||
_mm512_or_si512( _mm512_slli_epi64( v1, c ), \
|
||||
_mm512_srli_epi64( v2, 64 - (c) ) )
|
||||
|
||||
#define mm512_shr2_64( v1, v2, c ) \
|
||||
_mm512_or_si512( _mm512_srli_epi64( v2, c ), \
|
||||
_mm512_slli_epi64( v1, 64 - (c) ) )
|
||||
|
||||
#define mm512_shl2_32( v1, v2, c ) \
|
||||
_mm512_or_si512( _mm512_slli_epi32( v1, c ), \
|
||||
_mm512_srli_epi32( v2, 32 - (c) ) )
|
||||
|
||||
#define mm512_shr2_32( v1, v2, c ) \
|
||||
_mm512_or_si512( _mm512_srli_epi32( v2, c ), \
|
||||
_mm512_slli_epi32( v1, 32 - (c) ) )
|
||||
|
||||
#define mm512_shl2_16( v1, v2, c ) \
|
||||
_mm512_or_si512( _mm512_slli_epi16( v1, c ), \
|
||||
_mm512_srli_epi16( v2, 16 - (c) ) )
|
||||
|
||||
#define mm512_shr2_16( v1, v2, c ) \
|
||||
_mm512_or_si512( _mm512_srli_epi16( v2, c ), \
|
||||
_mm512_slli_epi16( v1, 16 - (c) ) )
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
// Bit rotations.
|
||||
|
||||
@@ -328,14 +374,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
#define mm512_ror_32 _mm512_ror_epi32
|
||||
#define mm512_rol_32 _mm512_rol_epi32
|
||||
|
||||
// Rotations using a vector control index are very slow due to overhead
|
||||
// to generate the index vector. Repeated rotations using the same index
|
||||
// are better handled by the calling function where the index only needs
|
||||
// to be generated once then reused very efficiently.
|
||||
// Permutes and shuffles using an immediate index are significantly faster.
|
||||
|
||||
//
|
||||
// Swap bytes in vector elements, vectorized endian conversion.
|
||||
// Reverse byte order of packed elements, vectorized endian conversion.
|
||||
|
||||
#define mm512_bswap_64( v ) \
|
||||
_mm512_shuffle_epi8( v, \
|
||||
@@ -394,7 +434,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
} while(0)
|
||||
|
||||
|
||||
// Cross-lane shuffles implementing rotate & shift of elements within a vector.
|
||||
// Cross-lane shuffles implementing rotate & shift of packed elements.
|
||||
//
|
||||
|
||||
#define mm512_shiftr_256( v ) \
|
||||
@@ -537,14 +577,14 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
|
||||
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
|
||||
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
|
||||
// destination elements must come from a specific source.
|
||||
#define mm512_shuffle2_64( a, b, c ) \
|
||||
_mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
|
||||
_mm512_castsi512_pd( b ), c ) );
|
||||
// destination elements must come from a specific source arg.
|
||||
// 2 input shuffle of 64 bit elements, implemented by casting to packed
// doubles and using _mm512_shuffle_pd with immediate control c.
// The expansion has no trailing semicolon so the macro can be used as an
// expression; callers terminate the statement themselves.
#define mm512_shuffle2_64( v1, v2, c ) \
   _mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( v1 ), \
                                           _mm512_castsi512_pd( v2 ), c ) )
|
||||
|
||||
#define mm512_shuffle2_32( a, b, c ) \
|
||||
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( a ), \
|
||||
_mm512_castsi512_ps( b ), c ) );
|
||||
// 2 input shuffle of 32 bit elements, implemented by casting to packed
// floats and using _mm512_shuffle_ps with immediate control c.
// The expansion has no trailing semicolon so the macro can be used as an
// expression; callers terminate the statement themselves.
#define mm512_shuffle2_32( v1, v2, c ) \
   _mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
                                           _mm512_castsi512_ps( v2 ), c ) )
|
||||
|
||||
// Swap 64 bits in each 128 bit lane
|
||||
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
|
||||
@@ -583,9 +623,9 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
|
||||
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
|
||||
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
|
||||
|
||||
|
||||
/*
|
||||
// 2 input, 1 output
|
||||
// Concatenate { v1, v2 ) then rotate right or left and return the high
|
||||
// Concatenate { v1, v2 } then rotate right or left and return the high
|
||||
// 512 bits, ie rotated v1.
|
||||
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
|
||||
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
|
||||
@@ -598,6 +638,7 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
|
||||
|
||||
#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 )
|
||||
#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 )
|
||||
*/
|
||||
|
||||
#endif // AVX512
|
||||
#endif // SIMD_512_H__
|
||||
|
@@ -333,7 +333,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
|
||||
// CPU_INFO ECX
|
||||
#define SSE3_Flag 1
|
||||
#define SSSE3_Flag (1<< 9)
|
||||
#define XOP_Flag (1<<11)
|
||||
#define XOP_Flag (1<<11) // obsolete, only available on pre-Ryzen AMD
|
||||
#define FMA3_Flag (1<<12)
|
||||
#define AES_Flag (1<<25)
|
||||
#define SSE41_Flag (1<<19)
|
||||
|
6
util.c
6
util.c
@@ -1371,7 +1371,7 @@ static bool send_line( struct stratum_ctx *sctx, char *s )
|
||||
{
|
||||
if ( rc != CURLE_AGAIN )
|
||||
#else
|
||||
n = send(sock, s + sent, len, 0);
|
||||
n = send( sctx->sock, s + sent, len, 0);
|
||||
if ( n < 0 )
|
||||
{
|
||||
if ( !socket_blocks() )
|
||||
@@ -1379,8 +1379,8 @@ static bool send_line( struct stratum_ctx *sctx, char *s )
|
||||
return false;
|
||||
n = 0;
|
||||
}
|
||||
sent += n;
|
||||
len -= n;
|
||||
sent += n;
|
||||
len -= n;
|
||||
}
|
||||
|
||||
return true;
|
||||
|
Reference in New Issue
Block a user