Compare commits


9 Commits

Author     SHA1        Message  Date
Jay D Dee  1321ac474c  v3.20.1  2022-07-26 18:36:40 -04:00
Jay D Dee  40d07c0097  v3.20.0  2022-07-17 13:30:50 -04:00
Jay D Dee  f552f2b1e8  v3.19.9  2022-07-10 11:04:00 -04:00
Jay D Dee  26b8927632  v3.19.8  2022-05-27 18:12:30 -04:00
Jay D Dee  db76d3865f  v3.19.7  2022-04-02 12:44:57 -04:00
Jay D Dee  5b678d2481  v3.19.6  2022-02-21 23:14:24 -05:00
Jay D Dee  90137b391e  v3.19.5  2022-01-30 20:59:54 -05:00
Jay D Dee  8727d79182  v3.19.4  2022-01-12 21:08:25 -05:00
Jay D Dee  17ccbc328f  v3.19.3  2022-01-07 12:07:38 -05:00
80 changed files with 5017 additions and 4114 deletions

View File

@@ -21,6 +21,7 @@ cpuminer_SOURCES = \
api.c \
sysinfos.c \
algo-gate-api.c\
malloc-huge.c \
algo/argon2/argon2a/argon2a.c \
algo/argon2/argon2a/ar2/argon2.c \
algo/argon2/argon2a/ar2/opt.c \
@@ -288,7 +289,7 @@ cpuminer_SOURCES = \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/blake2b-yp.c \
algo/yespower/crypto/hmac-blake2b.c \
algo/yespower/yescrypt-r8g.c \
algo/yespower/yespower-opt.c
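
malloc-huge.c is new in this release; its contents are not part of this diff, but the change log below credits huge pages for the verthash and scryptn2 speedups in v3.19.3. As a rough sketch of what Linux huge-page allocation generally looks like (an assumption for illustration, not the project's actual implementation):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stddef.h>

// Illustrative only: try to back size bytes with 2 MiB huge pages,
// falling back to ordinary pages if the huge-page pool is exhausted.
static void *alloc_huge_sketch( size_t size )
{
   void *p = mmap( NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0 );
   if ( p == MAP_FAILED )
      p = mmap( NULL, size, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0 );
   return p == MAP_FAILED ? NULL : p;
}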

View File

@@ -22,7 +22,7 @@ required.
Compile Instructions
--------------------
See INSTALL_LINUX or INSTALL_WINDOWS for compile instruuctions
See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
@@ -65,6 +65,75 @@ If not what makes it happen or not happen?
Change Log
----------
v3.20.1
Optimized sph_blake2b 1-way for SSSE3 & AVX2.
Removed the duplicate Blake2b implementation used by the Power2b algo; it now uses the optimized sph_blake2b.
Removed the imprecise hash & target display from the rejected share log.
Share and target difficulty are now displayed only for low difficulty shares.
Updated configure.ac to check for AVX512 asm support.
Small optimization to Lyra2 SSE2.
v3.20.0
#375 Fixed segfault in algos using Groestl VAES due to use of uninitialized data.
v3.19.9
More Blake256, Blake512, Luffa & Cubehash prehash optimizations.
Relaxed some excessively strict data alignment that was negatively affecting performance.
v3.19.8
#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid
url protocol specifier for requesting a secure stratum connection.
The full url, including the protocol, is now displayed in the stratum connect
log and the periodic summary log.
Small optimizations to Cubehash, AVX2 & AVX512.
Byte order and prehash optimizations for Blake256 & Blake512, AVX2 & AVX512.
v3.19.7
#369 Fixed time limited mining, --time-limit.
Fixed a potential compile error when using optimization below -O3.
v3.19.6
#363 Fixed a stratum bug where the first job could be ignored, delaying the start of hashing.
Fixed handling of nonce exhaustion when hashing a fast algo with extranonce disabled.
Small optimization to Shavite.
v3.19.5
Enhanced stratum-keepalive preemptively resets the stratum connection
before the server does, to avoid lost shares.
Added build-msys2.sh shell script for easier compiling on Windows, see Wiki for details.
X16RT: eliminated unnecessary recalculation of the hash order.
Fixed a few compiler warnings.
Fixed log colour error when a block is solved.
v3.19.4
#359: Fixed verthash memory allocation for non-hugepages, broken in v3.19.3.
New option stratum-keepalive prevents stratum timeouts when no shares are
submitted for several minutes due to high difficulty.
Fixed a bug displaying optimizations for some algos.
v3.19.3
Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available.
Small speed up for Hamsi AVX2 & AVX512, Keccak AVX512.
v3.19.2
Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1.

View File

@@ -344,7 +344,7 @@ static size_t
detect_cpu(void) {
//union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
//cpu_vendors_x86 vendor = cpu_nobody;
x86_regs regs;
x86_regs regs; regs.eax = regs.ebx = regs.ecx = 0;
uint32_t max_level, max_ext_level;
size_t cpu_flags = 0;
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
@@ -460,4 +460,4 @@ get_top_cpuflag_desc(size_t flag) {
#endif
#endif
#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
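
A hedged note on the regs initialisation above: CPUID leaf 7 selects its sub-leaf through ECX, so reading the extended feature bits with a garbage ECX can return the wrong sub-leaf; zeroing the fields also avoids reads of uninitialized data. A stand-alone illustration using the GCC/Clang helper rather than the project's x86_regs wrapper:

#include <cpuid.h>

// __get_cpuid_count() sets ECX to the requested sub-leaf before executing
// CPUID, the same guarantee the explicit regs.ecx = 0 provides above.
static int cpu_has_avx2_example( void )
{
   unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
   if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) )
      return 0;
   return ( ebx >> 5 ) & 1;   // CPUID.(EAX=7,ECX=0):EBX bit 5 = AVX2
}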

View File

@@ -4,11 +4,12 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc
#endif
/* romix pre/post nop function */
/*
static void asm_calling_convention
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
(void)blocks; (void)nblocks;
}
*/
/* romix pre/post endian conversion function */
static void asm_calling_convention
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {

View File

@@ -37,6 +37,13 @@
#if defined(__AVX512F__)
static inline __m512i blamka( __m512i x, __m512i y )
{
__m512i xy = _mm512_mul_epu32( x, y );
return _mm512_add_epi64( _mm512_add_epi64( x, y ),
_mm512_add_epi64( xy, xy ) );
}
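
blamka() is Argon2's multiply-hardened addition; _mm512_mul_epu32 multiplies only the low 32 bits of each 64-bit lane. A scalar reference for the same formula, x + y + 2*(lo32(x)*lo32(y)), shown for comparison (illustrative, not part of the source):

#include <stdint.h>

// Scalar BlaMka: the AVX-512 code above computes this in every 64-bit lane.
static inline uint64_t blamka_scalar( uint64_t x, uint64_t y )
{
   uint64_t xy = (uint64_t)(uint32_t)x * (uint32_t)y;   // low 32 x low 32
   return x + y + 2 * xy;
}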
static void fill_block( __m512i *state, const block *ref_block,
block *next_block, int with_xor )
{

View File

@@ -328,9 +328,7 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
#include <immintrin.h>
#define ROR64(x, n) _mm512_ror_epi64((x), (n))
static __m512i muladd(__m512i x, __m512i y)
static inline __m512i muladd(__m512i x, __m512i y)
{
__m512i z = _mm512_mul_epu32(x, y);
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
@@ -344,8 +342,8 @@ static __m512i muladd(__m512i x, __m512i y)
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
\
D0 = ROR64(D0, 32); \
D1 = ROR64(D1, 32); \
D0 = _mm512_ror_epi64(D0, 32); \
D1 = _mm512_ror_epi64(D1, 32); \
\
C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \
@@ -353,8 +351,8 @@ static __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
\
B0 = ROR64(B0, 24); \
B1 = ROR64(B1, 24); \
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
} while ((void)0, 0)
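
This hunk only inlines the ROR64 macro as direct _mm512_ror_epi64 calls; the operation itself is a plain per-lane rotate right. Scalar equivalent for a single lane (illustrative):

#include <stdint.h>

// What _mm512_ror_epi64( x, n ) does to each 64-bit lane, for n in 1..63.
static inline uint64_t rotr64_scalar( uint64_t x, unsigned n )
{
   return ( x >> n ) | ( x << ( 64 - n ) );
}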
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -365,8 +363,8 @@ static __m512i muladd(__m512i x, __m512i y)
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
\
D0 = ROR64(D0, 16); \
D1 = ROR64(D1, 16); \
D0 = _mm512_ror_epi64(D0, 16); \
D1 = _mm512_ror_epi64(D1, 16); \
\
C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \
@@ -374,8 +372,8 @@ static __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
\
B0 = ROR64(B0, 63); \
B1 = ROR64(B1, 63); \
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
} while ((void)0, 0)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -417,11 +415,10 @@ static __m512i muladd(__m512i x, __m512i y)
#define SWAP_HALVES(A0, A1) \
do { \
__m512i t0, t1; \
t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
A0 = t0; \
A1 = t1; \
__m512i t; \
t = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
A1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
A0 = t; \
} while((void)0, 0)
#define SWAP_QUARTERS(A0, A1) \

View File

@@ -49,6 +49,20 @@ extern "C"{
#define SPH_SIZE_blake512 512
/////////////////////////
//
// Blake-256 1 way SSE2
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1 );
/////////////////////////
//
// Blake-512 1 way SSE2
void blake512_transform_le( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 );
//////////////////////////
//
// Blake-256 4 way SSE2
@@ -98,6 +112,12 @@ typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
@@ -128,6 +148,12 @@ void blake512_4way_update( void *cc, const void *data, size_t len );
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data );
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -148,6 +174,14 @@ typedef blake_16way_small_context blake256_16way_context;
void blake256_16way_init(void *cc);
void blake256_16way_update(void *cc, const void *data, size_t len);
void blake256_16way_close(void *cc, void *dst);
// Expects data in little endian order, no byte swap needed
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
// 14 rounds, blake, decred
typedef blake_16way_small_context blake256r14_16way_context;
@@ -180,7 +214,12 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_hash_le80( void *hash, const void *data );
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data );
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate );
#endif // AVX512
#endif // AVX2
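
The new *_prehash_le / *_final_le pairs split hashing of an 80-byte little-endian block into a nonce-independent part, run once per job, and a per-nonce part, as described in the implementation later in this diff. A minimal calling sketch for the 8-way Blake-512 variant; apart from the two declared functions and the context type, every name (data layout, nonce packing, loop bounds) is an assumption for illustration:

// Sketch only; assumes the declarations above are in scope.
static void scan_blake512_8way_sketch( const __m512i vdata[10],   // 80 bytes/lane, LE
                                       uint64_t first_nonce, uint64_t last_nonce )
{
   blake_8way_big_context ctx;
   __m512i midstate[16];    // working variables after the nonce-independent steps
   __m512i hash[8];         // 8 lanes of 64-byte output

   blake512_8way_prehash_le( &ctx, midstate, vdata );             // once per job

   for ( uint64_t n = first_nonce; n < last_nonce; n += 8 )
   {
      // final_le stores this argument into message word 9, the 64-bit header
      // word that carries the nonce; a plain per-lane counter is shown here.
      __m512i m9 = _mm512_add_epi64( _mm512_set1_epi64( n ),
                                     _mm512_set_epi64( 7, 6, 5, 4, 3, 2, 1, 0 ) );
      blake512_8way_final_le( &ctx, hash, m9, midstate );         // per 8 nonces
      // ... test the 8 lane hashes in hash[] against the target ...
   }
}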

View File

@@ -5,6 +5,7 @@
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
* 2016-2022 JayDDee246@gmail.com
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@@ -304,6 +305,98 @@ static const sph_u32 CS[16] = {
#endif
/////////////////////////////////////////
//
// Blake-256 1 way SIMD
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
CSx( r, 5 ) ^ Mx( r, 4 ), \
CSx( r, 3 ) ^ Mx( r, 2 ), \
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
CSx( r, 4 ) ^ Mx( r, 5 ), \
CSx( r, 2 ) ^ Mx( r, 3 ), \
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V3 = mm128_shufll_32( V3 ); \
V2 = mm128_swap_64( V2 ); \
V1 = mm128_shuflr_32( V1 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, F ) ^ Mx( r, E ), \
CSx( r, D ) ^ Mx( r, C ), \
CSx( r, B ) ^ Mx( r, A ), \
CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 16 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, E ) ^ Mx( r, F ), \
CSx( r, C ) ^ Mx( r, D ), \
CSx( r, A ) ^ Mx( r, B ), \
CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
V3 = mm128_ror_32( _mm_xor_si128( V3, V0 ), 8 ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V3 = mm128_shuflr_32( V3 ); \
V2 = mm128_swap_64( V2 ); \
V1 = mm128_shufll_32( V1 ); \
}
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1 )
{
__m128i V0, V1, V2, V3;
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V0 = casti_m128i( H, 0 );
V1 = casti_m128i( H, 1 );
V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );
M0 = buf[ 0];
M1 = buf[ 1];
M2 = buf[ 2];
M3 = buf[ 3];
M4 = buf[ 4];
M5 = buf[ 5];
M6 = buf[ 6];
M7 = buf[ 7];
M8 = buf[ 8];
M9 = buf[ 9];
MA = buf[10];
MB = buf[11];
MC = buf[12];
MD = buf[13];
ME = buf[14];
MF = buf[15];
BLAKE256_ROUND( 0 );
BLAKE256_ROUND( 1 );
BLAKE256_ROUND( 2 );
BLAKE256_ROUND( 3 );
BLAKE256_ROUND( 4 );
BLAKE256_ROUND( 5 );
BLAKE256_ROUND( 6 );
BLAKE256_ROUND( 7 );
BLAKE256_ROUND( 8 );
BLAKE256_ROUND( 9 );
BLAKE256_ROUND( 0 );
BLAKE256_ROUND( 1 );
BLAKE256_ROUND( 2 );
BLAKE256_ROUND( 3 );
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
}
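
For reference, the BLAKE256_ROUND macro above performs four column steps and four diagonal steps with V0/V1 holding the a/b rows of the 4x4 state and V2/V3 the c/d rows. The scalar BLAKE-256 G function it vectorizes, shown for comparison (mx/my are the two message words and cx/cy the matching constants selected by the sigma permutation for that slot):

#include <stdint.h>

#define ROTR32( x, n )  ( ((x) >> (n)) | ((x) << (32 - (n))) )

// One scalar BLAKE-256 G step, applied to each column and then each diagonal.
static inline void blake256_g_scalar( uint32_t *a, uint32_t *b, uint32_t *c,
                                      uint32_t *d, uint32_t mx, uint32_t my,
                                      uint32_t cx, uint32_t cy )
{
   *a = *a + *b + ( mx ^ cy );
   *d = ROTR32( *d ^ *a, 16 );
   *c = *c + *d;
   *b = ROTR32( *b ^ *c, 12 );
   *a = *a + *b + ( my ^ cx );
   *d = ROTR32( *d ^ *a, 8 );
   *c = *c + *d;
   *b = ROTR32( *b ^ *c, 7 );
}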
////////////////////////////////////////////
//
// Blake-256 4 way
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
@@ -508,14 +601,10 @@ do { \
V9 = m128_const1_64( 0x85A308D385A308D3 ); \
VA = m128_const1_64( 0x13198A2E13198A2E ); \
VB = m128_const1_64( 0x0370734403707344 ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
m128_const1_64( 0xA4093822A4093822 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
m128_const1_64( 0x299F31D0299F31D0 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \
m128_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \
m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
VC = _mm_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm_set1_epi32( T1 ^ 0xEC4E6C89 ); \
BLAKE256_4WAY_BLOCK_BSWAP32; \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
@@ -548,6 +637,8 @@ do { \
#if defined (__AVX2__)
/////////////////////////////////
//
// Blake-256 8 way
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
@@ -626,14 +717,10 @@ do { \
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
VA = m256_const1_64( 0x13198A2E13198A2E ); \
VB = m256_const1_64( 0x0370734403707344 ); \
VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
m256_const1_64( 0xA4093822A4093822 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
m256_const1_64( 0x299F31D0299F31D0 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
m256_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
@@ -679,13 +766,247 @@ do { \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
#define COMPRESS32_8WAY_LE( rounds ) \
do { \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
VA = m256_const1_64( 0x13198A2E13198A2E ); \
VB = m256_const1_64( 0x0370734403707344 ); \
VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
M0 = buf[ 0]; \
M1 = buf[ 1]; \
M2 = buf[ 2]; \
M3 = buf[ 3]; \
M4 = buf[ 4]; \
M5 = buf[ 5]; \
M6 = buf[ 6]; \
M7 = buf[ 7]; \
M8 = buf[ 8]; \
M9 = buf[ 9]; \
MA = buf[10]; \
MB = buf[11]; \
MC = buf[12]; \
MD = buf[13]; \
ME = buf[14]; \
MF = buf[15]; \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
ROUND_S_8WAY(3); \
ROUND_S_8WAY(4); \
ROUND_S_8WAY(5); \
ROUND_S_8WAY(6); \
ROUND_S_8WAY(7); \
if (rounds == 14) \
{ \
ROUND_S_8WAY(8); \
ROUND_S_8WAY(9); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
ROUND_S_8WAY(3); \
} \
H0 = mm256_xor3( V8, V0, H0 ); \
H1 = mm256_xor3( V9, V1, H1 ); \
H2 = mm256_xor3( VA, V2, H2 ); \
H3 = mm256_xor3( VB, V3, H3 ); \
H4 = mm256_xor3( VC, V4, H4 ); \
H5 = mm256_xor3( VD, V5, H5 ); \
H6 = mm256_xor3( VE, V6, H6 ); \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
{
const __m256i *M = (const __m256i*)data;
__m256i *V = (__m256i*)midstate;
const __m256i *H = (const __m256i*)midhash;
V[ 0] = H[0];
V[ 1] = H[1];
V[ 2] = H[2];
V[ 3] = H[3];
V[ 4] = H[4];
V[ 5] = H[5];
V[ 6] = H[6];
V[ 7] = H[7];
V[ 8] = m256_const1_32( CS0 );
V[ 9] = m256_const1_32( CS1 );
V[10] = m256_const1_32( CS2 );
V[11] = m256_const1_32( CS3 );
V[12] = m256_const1_32( CS4 ^ 0x280 );
V[13] = m256_const1_32( CS5 ^ 0x280 );
V[14] = m256_const1_32( CS6 );
V[15] = m256_const1_32( CS7 );
// G0
GS_8WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
// G1
V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ),
_mm256_xor_si256( _mm256_set1_epi32( CS3 ), M[ 2] ) );
V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 );
V[ 9] = _mm256_add_epi32( V[ 9], V[13] );
V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G4
V[ 0] = _mm256_add_epi32( V[ 0],
_mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) );
// G6
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 7] ),
_mm256_xor_si256( _mm256_set1_epi32( CSD ), M[12] ) );
// G7
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
_mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 );
V[ 3] = _mm256_add_epi32( V[ 3],
_mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
}
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data )
{
__m256i *H = (__m256i*)final_hash;
const __m256i *h = (const __m256i*)midhash;
const __m256i *v= (const __m256i*)midstate;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
M0 = casti_m256i( data, 0 );
M1 = casti_m256i( data, 1 );
M2 = casti_m256i( data, 2 );
M3 = casti_m256i( data, 3 );
M4 = casti_m256i( data, 4 );
M5 = casti_m256i( data, 5 );
M6 = casti_m256i( data, 6 );
M7 = casti_m256i( data, 7 );
M8 = casti_m256i( data, 8 );
M9 = casti_m256i( data, 9 );
MA = casti_m256i( data, 10 );
MB = casti_m256i( data, 11 );
MC = casti_m256i( data, 12 );
MD = casti_m256i( data, 13 );
ME = casti_m256i( data, 14 );
MF = casti_m256i( data, 15 );
// Finish round 0
// G1
V1 = _mm256_add_epi32( V1,
_mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 );
V9 = _mm256_add_epi32( V9, VD );
V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 );
// G4
V0 = _mm256_add_epi32( V0, V5 );
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 );
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
_mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 );
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
// G5
GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// G6
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
_mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
// G7
V9 = _mm256_add_epi32( V9, VE );
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 );
V3 = _mm256_add_epi32( V3, V4 );
VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 );
V9 = _mm256_add_epi32( V9, VE );
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
ROUND_S_8WAY( 4 );
ROUND_S_8WAY( 5 );
ROUND_S_8WAY( 6 );
ROUND_S_8WAY( 7 );
ROUND_S_8WAY( 8 );
ROUND_S_8WAY( 9 );
ROUND_S_8WAY( 0 );
ROUND_S_8WAY( 1 );
ROUND_S_8WAY( 2 );
ROUND_S_8WAY( 3 );
const __m256i shuf_bswap32 =
m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Blake-256 16 way AVX512
///////////////////////////////////////
//
// Blake-256 16 way AVX512
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
@@ -763,14 +1084,10 @@ do { \
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
VA = m512_const1_64( 0x13198A2E13198A2E ); \
VB = m512_const1_64( 0x0370734403707344 ); \
VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
m512_const1_64( 0xA4093822A4093822 ) ); \
VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
m512_const1_64( 0x299F31D0299F31D0 ) ); \
VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
m512_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
@@ -818,6 +1135,264 @@ do { \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
#define COMPRESS32_16WAY_LE( rounds ) \
do { \
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
VA = m512_const1_64( 0x13198A2E13198A2E ); \
VB = m512_const1_64( 0x0370734403707344 ); \
VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
M0 = buf[ 0]; \
M1 = buf[ 1]; \
M2 = buf[ 2]; \
M3 = buf[ 3]; \
M4 = buf[ 4]; \
M5 = buf[ 5]; \
M6 = buf[ 6]; \
M7 = buf[ 7]; \
M8 = buf[ 8]; \
M9 = buf[ 9]; \
MA = buf[10]; \
MB = buf[11]; \
MC = buf[12]; \
MD = buf[13]; \
ME = buf[14]; \
MF = buf[15]; \
ROUND_S_16WAY(0); \
ROUND_S_16WAY(1); \
ROUND_S_16WAY(2); \
ROUND_S_16WAY(3); \
ROUND_S_16WAY(4); \
ROUND_S_16WAY(5); \
ROUND_S_16WAY(6); \
ROUND_S_16WAY(7); \
if (rounds == 14) \
{ \
ROUND_S_16WAY(8); \
ROUND_S_16WAY(9); \
ROUND_S_16WAY(0); \
ROUND_S_16WAY(1); \
ROUND_S_16WAY(2); \
ROUND_S_16WAY(3); \
} \
H0 = mm512_xor3( V8, V0, H0 ); \
H1 = mm512_xor3( V9, V1, H1 ); \
H2 = mm512_xor3( VA, V2, H2 ); \
H3 = mm512_xor3( VB, V3, H3 ); \
H4 = mm512_xor3( VC, V4, H4 ); \
H5 = mm512_xor3( VD, V5, H5 ); \
H6 = mm512_xor3( VE, V6, H6 ); \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
// Blake-256 prehash of the second block is split onto 2 parts. The first part
// is constant for every nonce and only needs to be run once per job. The
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
{
const __m512i *M = (const __m512i*)data;
__m512i *V = (__m512i*)midstate;
const __m512i *H = (const __m512i*)midhash;
V[ 0] = H[0];
V[ 1] = H[1];
V[ 2] = H[2];
V[ 3] = H[3];
V[ 4] = H[4];
V[ 5] = H[5];
V[ 6] = H[6];
V[ 7] = H[7];
V[ 8] = m512_const1_32( CS0 );
V[ 9] = m512_const1_32( CS1 );
V[10] = m512_const1_32( CS2 );
V[11] = m512_const1_32( CS3 );
V[12] = m512_const1_32( CS4 ^ 0x280 );
V[13] = m512_const1_32( CS5 ^ 0x280 );
V[14] = m512_const1_32( CS6 );
V[15] = m512_const1_32( CS7 );
// G0
GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
// G1, nonce is in M[3]
// GS_16WAY( M[ 2], M[ 3], CS2, CS3, V1, V5, V9, VD );
V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ),
_mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) );
V[13] = mm512_ror_32( _mm512_xor_si512( V[13], V[ 1] ), 16 );
V[ 9] = _mm512_add_epi32( V[ 9], V[13] );
V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] );
// G2,G3
GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
// G4
// GS_16WAY( M[ 8], M[ 9], CS8, CS9, V0, V5, VA, VF );
V[ 0] = _mm512_add_epi32( V[ 0],
_mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) );
// G5
// GS_16WAY( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
// G6
// GS_16WAY( M[12], M[13], CSC, CSD, V2, V7, V8, VD );
V[ 2] = _mm512_add_epi32( _mm512_add_epi32( V[ 2], V[ 7] ),
_mm512_xor_si512( _mm512_set1_epi32( CSD ), M[12] ) );
// G7
// GS_16WAY( M[14], M[15], CSE, CSF, V3, V4, V9, VE );
V[ 3] = _mm512_add_epi32( _mm512_add_epi32( V[ 3], V[ 4] ),
_mm512_xor_si512( _mm512_set1_epi32( CSF ), M[14] ) );
V[14] = mm512_ror_32( _mm512_xor_si512( V[14], V[ 3] ), 16 );
V[ 3] = _mm512_add_epi32( V[ 3],
_mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
}
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data )
{
__m512i *H = (__m512i*)final_hash;
const __m512i *h = (const __m512i*)midhash;
const __m512i *v= (const __m512i*)midstate;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
V0 = v[ 0];
V1 = v[ 1];
V2 = v[ 2];
V3 = v[ 3];
V4 = v[ 4];
V5 = v[ 5];
V6 = v[ 6];
V7 = v[ 7];
V8 = v[ 8];
V9 = v[ 9];
VA = v[10];
VB = v[11];
VC = v[12];
VD = v[13];
VE = v[14];
VF = v[15];
M0 = casti_m512i( data, 0 );
M1 = casti_m512i( data, 1 );
M2 = casti_m512i( data, 2 );
M3 = casti_m512i( data, 3 );
M4 = casti_m512i( data, 4 );
M5 = casti_m512i( data, 5 );
M6 = casti_m512i( data, 6 );
M7 = casti_m512i( data, 7 );
M8 = casti_m512i( data, 8 );
M9 = casti_m512i( data, 9 );
MA = casti_m512i( data, 10 );
MB = casti_m512i( data, 11 );
MC = casti_m512i( data, 12 );
MD = casti_m512i( data, 13 );
ME = casti_m512i( data, 14 );
MF = casti_m512i( data, 15 );
// Finish round 0 with the nonce (M3) now available
// G0
// GS_16WAY( M0, M1, CS0, CS1, V0, V4, V8, VC );
// G1
// GS_16WAY( M2, M3, CS2, CS3, V1, V5, V9, VD );
V1 = _mm512_add_epi32( V1,
_mm512_xor_si512( _mm512_set1_epi32( CS2 ), M3 ) );
VD = mm512_ror_32( _mm512_xor_si512( VD, V1 ), 8 );
V9 = _mm512_add_epi32( V9, VD );
V5 = mm512_ror_32( _mm512_xor_si512( V5, V9 ), 7 );
// G2,G3
// GS_16WAY( M4, M5, CS4, CS5, V2, V6, VA, VE );
// GS_16WAY( M6, M7, CS6, CS7, V3, V7, VB, VF );
// G4
// GS_16WAY( M8, M9, CS8, CS9, V0, V5, VA, VF );
V0 = _mm512_add_epi32( V0, V5 );
VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 16 );
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 12 );
V0 = _mm512_add_epi32( V0, _mm512_add_epi32( V5,
_mm512_xor_si512( _mm512_set1_epi32( CS8 ), M9 ) ) );
VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 8 );
VA = _mm512_add_epi32( VA, VF );
V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 );
// G5
GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// G6
// GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 16 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 12 );
V2 = _mm512_add_epi32( _mm512_add_epi32( V2, V7 ),
_mm512_xor_si512( _mm512_set1_epi32( CSC ), MD ) );
VD = mm512_ror_32( _mm512_xor_si512( VD, V2 ), 8 );
V8 = _mm512_add_epi32( V8, VD );
V7 = mm512_ror_32( _mm512_xor_si512( V7, V8 ), 7 );
// G7
// GS_16WAY( ME, MF, CSE, CSF, V3, V4, V9, VE );
V9 = _mm512_add_epi32( V9, VE );
V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 12 );
V3 = _mm512_add_epi32( V3, V4 );
VE = mm512_ror_32( _mm512_xor_si512( VE, V3 ), 8 );
V9 = _mm512_add_epi32( V9, VE );
V4 = mm512_ror_32( _mm512_xor_si512( V4, V9 ), 7 );
// Remaining rounds
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
ROUND_S_16WAY( 4 );
ROUND_S_16WAY( 5 );
ROUND_S_16WAY( 6 );
ROUND_S_16WAY( 7 );
ROUND_S_16WAY( 8 );
ROUND_S_16WAY( 9 );
ROUND_S_16WAY( 0 );
ROUND_S_16WAY( 1 );
ROUND_S_16WAY( 2 );
ROUND_S_16WAY( 3 );
// Byte swap final hash
const __m512i shuf_bswap32 =
m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233,
0x2c2d2e2f28292a2b, 0x2425262720212223,
0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
}
#endif
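
As the comment before blake256_16way_round0_prehash_le explains, the second-block prehash is split into a per-job part and a per-nonce part. A minimal sketch of the calling pattern; apart from the two declared functions, the names, data layout and nonce placement (message word 3, per the "nonce is in M[3]" note above) are assumptions for illustration:

// Sketch only; assumes the 16-way declarations are in scope.
static void scan_blake256_16way_sketch( const __m512i midhash[8], __m512i block2[16],
                                        uint32_t first_nonce, uint32_t last_nonce )
{
   __m512i midstate[16];
   __m512i hash[8];

   // once per job: the nonce-independent portion of round 0
   blake256_16way_round0_prehash_le( midstate, midhash, block2 );

   for ( uint32_t n = first_nonce; n < last_nonce; n += 16 )
   {
      // the nonce occupies message word 3 of the second block
      block2[3] = _mm512_add_epi32( _mm512_set1_epi32( n ),
                        _mm512_set_epi32( 15, 14, 13, 12, 11, 10, 9, 8,
                                           7,  6,  5,  4,  3,  2, 1, 0 ) );
      blake256_16way_final_rounds_le( hash, midstate, midhash, block2 );
      // ... test the 16 lane hashes against the target ...
   }
}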
// Blake-256 4 way
@@ -913,8 +1488,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
memset_zero_128( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
blake32_4way( ctx, buf + vptr, 64 - ptr );
}
else
@@ -926,8 +1501,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
memset_zero_128( buf, 56>>2 );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
blake32_4way( ctx, buf, 64 );
}
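
The change in this hunk moves the byte swap of the counter words into the scalar domain: swap the 32-bit value once, then broadcast it, instead of broadcasting and running a vector byte shuffle. A scalar reference for bswap_32, shown for illustration (in practice it presumably maps to a single-instruction compiler builtin such as __builtin_bswap32):

#include <stdint.h>

// Reference 32-bit byte swap; one scalar swap replaces a vector pshufb here.
static inline uint32_t bswap_32_ref( uint32_t x )
{
   return ( x << 24 ) | ( ( x & 0x0000ff00u ) << 8 )
        | ( ( x & 0x00ff0000u ) >> 8 ) | ( x >> 24 );
}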
@@ -1033,22 +1608,117 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_or_si256( buf[52>>2],
m256_const1_64( 0x0100000001000000ULL ) );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( buf, 56>>2 );
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, buf, 64 );
*(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
*(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
blake32_8way( sc, buf, 64 );
}
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
}
static void
blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_8WAY
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
{
memcpy_256( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32_8WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_8WAY_LE( sc->rounds );
ptr = 0;
}
}
WRITE_STATE32_8WAY(sc);
sc->ptr = ptr;
}
static void
blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m256_const1_32( 0x80000000 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
}
else
sc->T0 -= 512 - bit_len;
if ( ptr <= 52 )
{
memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_or_si256( buf[52>>2], m256_one_32 );
*(buf+(56>>2)) = _mm256_set1_epi32( th );
*(buf+(60>>2)) = _mm256_set1_epi32( tl );
blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m256_one_32;
*(buf+(56>>2)) = _mm256_set1_epi32( th );
*(buf+(60>>2)) = _mm256_set1_epi32( tl );
blake32_8way_le( sc, buf, 64 );
}
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
}
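
The magic constants in the *_close paths above implement a counter rewind: 0xFFFFFE00 is -512 modulo 2^32, so the 512 bits added by the final compression land back on the true message length that was captured into tl/th and written into words 14 and 15. A scalar sketch of that bookkeeping (names are illustrative):

#include <stdint.h>

// Scalar view of the T0/T1 fix-up performed before the final BLAKE-256 block.
static void blake256_close_counter_sketch( uint32_t *T0, uint32_t *T1,
                                           unsigned ptr,   // bytes in last block
                                           uint32_t *tl, uint32_t *th )
{
   unsigned bit_len = ptr << 3;
   *tl = *T0 + bit_len;                          // total bit length, low word
   *th = *T1;                                    // total bit length, high word
   if ( ptr == 0 )                               // final block is pure padding
   {  *T0 = 0xFFFFFE00u;  *T1 = 0xFFFFFFFFu; }   // i.e. -512, counts 0 new bits
   else if ( *T0 == 0 )                          // low word wrapped: borrow
   {  *T0 = 0xFFFFFE00u + bit_len;  *T1 = *T1 - 1; }
   else
      *T0 -= 512 - bit_len;
}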
@@ -1117,7 +1787,6 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
WRITE_STATE32_16WAY(sc);
sc->ptr = ptr;
}
static void
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
@@ -1152,22 +1821,116 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
if ( out_size_w32 == 8 )
buf[52>>2] = _mm512_or_si512( buf[52>>2],
m512_const1_64( 0x0100000001000000ULL ) );
buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
memset_zero_512( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
blake32_16way( sc, buf, 64 );
}
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
}
static void
blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_16WAY
buf = sc->buf;
ptr = sc->ptr;
// only if calling update with 80
if ( len < buf_size - ptr )
{
memcpy_512( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32_16WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_512( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = T0 + 512 ) < 512 )
T1 = T1 + 1;
COMPRESS32_16WAY_LE( sc->rounds );
ptr = 0;
}
}
WRITE_STATE32_16WAY(sc);
sc->ptr = ptr;
}
static void
blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = m512_const1_32( 0x80000000 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr == 0 )
{
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
memset_zero_512( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
blake32_16way( sc, buf, 64 );
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFE00UL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 512 - bit_len;
if ( ptr <= 52 )
{
memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
buf[52>>2] = _mm512_or_si512( buf[52>>2], m512_one_32 );
buf[56>>2] = _mm512_set1_epi32( th );
buf[60>>2] = _mm512_set1_epi32( tl );
blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
memset_zero_512( buf, 56>>2 );
buf[52>>2] = m512_one_32;
buf[56>>2] = _mm512_set1_epi32( th );
buf[60>>2] = _mm512_set1_epi32( tl );
blake32_16way_le( sc, buf, 64 );
}
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
}
@@ -1190,6 +1953,18 @@ blake256_16way_close(void *cc, void *dst)
blake32_16way_close(cc, 0, 0, dst, 8);
}
void
blake256_16way_update_le(void *cc, const void *data, size_t len)
{
blake32_16way_le(cc, data, len);
}
void
blake256_16way_close_le(void *cc, void *dst)
{
blake32_16way_close_le(cc, 0, 0, dst, 8);
}
void blake256r14_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
@@ -1271,6 +2046,18 @@ blake256_8way_close(void *cc, void *dst)
blake32_8way_close(cc, 0, 0, dst, 8);
}
void
blake256_8way_update_le(void *cc, const void *data, size_t len)
{
blake32_8way_le(cc, data, len);
}
void
blake256_8way_close_le(void *cc, void *dst)
{
blake32_8way_close_le(cc, 0, 0, dst, 8);
}
#endif
// 14 rounds Blake, Decred

View File

@@ -52,6 +52,180 @@ static const uint8_t sigma[12][16] =
};
#define Z00 0
#define Z01 1
#define Z02 2
#define Z03 3
#define Z04 4
#define Z05 5
#define Z06 6
#define Z07 7
#define Z08 8
#define Z09 9
#define Z0A A
#define Z0B B
#define Z0C C
#define Z0D D
#define Z0E E
#define Z0F F
#define Z10 E
#define Z11 A
#define Z12 4
#define Z13 8
#define Z14 9
#define Z15 F
#define Z16 D
#define Z17 6
#define Z18 1
#define Z19 C
#define Z1A 0
#define Z1B 2
#define Z1C B
#define Z1D 7
#define Z1E 5
#define Z1F 3
#define Z20 B
#define Z21 8
#define Z22 C
#define Z23 0
#define Z24 5
#define Z25 2
#define Z26 F
#define Z27 D
#define Z28 A
#define Z29 E
#define Z2A 3
#define Z2B 6
#define Z2C 7
#define Z2D 1
#define Z2E 9
#define Z2F 4
#define Z30 7
#define Z31 9
#define Z32 3
#define Z33 1
#define Z34 D
#define Z35 C
#define Z36 B
#define Z37 E
#define Z38 2
#define Z39 6
#define Z3A 5
#define Z3B A
#define Z3C 4
#define Z3D 0
#define Z3E F
#define Z3F 8
#define Z40 9
#define Z41 0
#define Z42 5
#define Z43 7
#define Z44 2
#define Z45 4
#define Z46 A
#define Z47 F
#define Z48 E
#define Z49 1
#define Z4A B
#define Z4B C
#define Z4C 6
#define Z4D 8
#define Z4E 3
#define Z4F D
#define Z50 2
#define Z51 C
#define Z52 6
#define Z53 A
#define Z54 0
#define Z55 B
#define Z56 8
#define Z57 3
#define Z58 4
#define Z59 D
#define Z5A 7
#define Z5B 5
#define Z5C F
#define Z5D E
#define Z5E 1
#define Z5F 9
#define Z60 C
#define Z61 5
#define Z62 1
#define Z63 F
#define Z64 E
#define Z65 D
#define Z66 4
#define Z67 A
#define Z68 0
#define Z69 7
#define Z6A 6
#define Z6B 3
#define Z6C 9
#define Z6D 2
#define Z6E 8
#define Z6F B
#define Z70 D
#define Z71 B
#define Z72 7
#define Z73 E
#define Z74 C
#define Z75 1
#define Z76 3
#define Z77 9
#define Z78 5
#define Z79 0
#define Z7A F
#define Z7B 4
#define Z7C 8
#define Z7D 6
#define Z7E 2
#define Z7F A
#define Z80 6
#define Z81 F
#define Z82 E
#define Z83 9
#define Z84 B
#define Z85 3
#define Z86 0
#define Z87 8
#define Z88 C
#define Z89 2
#define Z8A D
#define Z8B 7
#define Z8C 1
#define Z8D 4
#define Z8E A
#define Z8F 5
#define Z90 A
#define Z91 2
#define Z92 8
#define Z93 4
#define Z94 7
#define Z95 6
#define Z96 1
#define Z97 5
#define Z98 F
#define Z99 B
#define Z9A 9
#define Z9B E
#define Z9C 3
#define Z9D C
#define Z9E D
#define Z9F 0
#define Mx(r, i) Mx_(Z ## r ## i)
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
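
The Zrn defines above spell out the BLAKE2b sigma permutation one hex digit per slot, and Mx() turns a (round, index) pair into the matching message-word identifier by token pasting; the Mx_/Mx__ indirection forces the Zrn macro to expand before the final paste. A tiny stand-alone trace of the expansion (illustrative):

#include <stdio.h>

#define Z10 E                       /* sigma[1][0] == 14, written as hex digit E */
#define Mx(r, i) Mx_(Z ## r ## i)
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n

int main( void )
{
   int ME = 14;                     /* the identifier Mx(1,0) ultimately names */
   printf( "Mx(1,0) -> M%d\n", Mx( 1, 0 ) );    /* expands to ME, prints M14 */
   return 0;
}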
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define B2B8W_G(a, b, c, d, x, y) \

View File

@@ -361,14 +361,10 @@ static const sph_u64 CB[16] = {
V9 = m512_const1_64( CB1 ); \
VA = m512_const1_64( CB2 ); \
VB = m512_const1_64( CB3 ); \
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
m512_const1_64( CB4 ) ); \
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
m512_const1_64( CB5 ) ); \
VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
m512_const1_64( CB6 ) ); \
VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
m512_const1_64( CB7 ) ); \
VC = _mm512_set1_epi64( T0 ^ CB4 ); \
VD = _mm512_set1_epi64( T0 ^ CB5 ); \
VE = _mm512_set1_epi64( T1 ^ CB6 ); \
VF = _mm512_set1_epi64( T1 ^ CB7 ); \
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
@@ -435,14 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB4 ) );
VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB5 ) );
VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB6 ) );
VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB7 ) );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
@@ -493,6 +485,307 @@ void blake512_8way_compress( blake_8way_big_context *sc )
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
}
// won't be used after prehash implemented
void blake512_8way_compress_le( blake_8way_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = sc->buf[ 9];
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
ROUND_B_8WAY(6);
ROUND_B_8WAY(7);
ROUND_B_8WAY(8);
ROUND_B_8WAY(9);
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
}
// with final_le forms a full hash in 2 parts from little endian data.
// all variables hard coded for 80 bytes/lane.
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data )
{
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
sc->buf[10] = m512_const1_64( 0x8000000000000000ULL );
sc->buf[11] =
sc->buf[12] = m512_zero;
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_zero;
sc->buf[15] = m512_const1_64( 80*8 );
// build working variables
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m512_const1_64( CB0 );
V9 = m512_const1_64( CB1 );
VA = m512_const1_64( CB2 );
VB = m512_const1_64( CB3 );
VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
VE = _mm512_set1_epi64( CB6 );
VF = _mm512_set1_epi64( CB7 );
// round 0
GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
GB_8WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
GB_8WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
GB_8WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
// Do half of G4, skip the nonce
// GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF );
V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
V0 = _mm512_add_epi64( V0, V5 );
GB_8WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
GB_8WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
GB_8WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
// round 1
// G1
// GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD);
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( CB8 ),
sc->buf[ 4] ) );
// G2
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
V2 = _mm512_add_epi64( V2, V6 );
// G3
// GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF);
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
midstate[ 1] = V1;
midstate[ 2] = V2;
midstate[ 3] = V3;
midstate[ 4] = V4;
midstate[ 5] = V5;
midstate[ 6] = V6;
midstate[ 7] = V7;
midstate[ 8] = V8;
midstate[ 9] = V9;
midstate[10] = VA;
midstate[11] = VB;
midstate[12] = VC;
midstate[13] = VD;
midstate[14] = VE;
midstate[15] = VF;
}
// pick up where we left off, need the nonce now.
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i h[8] __attribute__ ((aligned (64)));
// Load data with new nonce
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = nonce;
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
V0 = midstate[ 0];
V1 = midstate[ 1];
V2 = midstate[ 2];
V3 = midstate[ 3];
V4 = midstate[ 4];
V5 = midstate[ 5];
V6 = midstate[ 6];
V7 = midstate[ 7];
V8 = midstate[ 8];
V9 = midstate[ 9];
VA = midstate[10];
VB = midstate[11];
VC = midstate[12];
VD = midstate[13];
VE = midstate[14];
VF = midstate[15];
// finish round 0 with the nonce now available
V0 = _mm512_add_epi64( V0, _mm512_xor_si512(
_mm512_set1_epi64( CB8 ), M9 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 );
// Round 1
// G0
GB_8WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
// G1
// GB_8WAY(Mx(1, 2), Mx(1, 3), CBx(1, 2), CBx(1, 3), V1, V5, V9, VD);
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
V1 = _mm512_add_epi64( V1, V5 );
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 16 );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 11 );
// G2
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
// V2 = _mm512_add_epi64( V2, V6 );
V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
_mm512_set1_epi64( CBF ), M9 ) );
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), MF ), V6 ) );
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 16 );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 11 );
// G3
// GB_8WAY(Mx(1, 6), Mx(1, 7), CBx(1, 6), CBx(1, 7), V3, V7, VB, VF);
// V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
// _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 16 );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 11 );
// G4, G5, G6, G7
GB_8WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
GB_8WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
GB_8WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
GB_8WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
// remaining rounds
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
ROUND_B_8WAY(6);
ROUND_B_8WAY(7);
ROUND_B_8WAY(8);
ROUND_B_8WAY(9);
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
h[0] = mm512_xor3( V8, V0, sc->H[0] );
h[1] = mm512_xor3( V9, V1, sc->H[1] );
h[2] = mm512_xor3( VA, V2, sc->H[2] );
h[3] = mm512_xor3( VB, V3, sc->H[3] );
h[4] = mm512_xor3( VC, V4, sc->H[4] );
h[5] = mm512_xor3( VD, V5, sc->H[5] );
h[6] = mm512_xor3( VE, V6, sc->H[6] );
h[7] = mm512_xor3( VF, V7, sc->H[7] );
// bswap final hash
mm512_block_bswap_64( (__m512i*)hash, h );
}
void blake512_8way_init( blake_8way_big_context *sc )
{
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
@@ -678,6 +971,73 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len )
{
// init
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
// update
memcpy_512( sc->buf, (__m512i*)data, len>>3 );
sc->ptr = len;
if ( len == 128 )
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
sc->ptr = 0;
}
// close
size_t ptr64 = sc->ptr >> 3;
unsigned bit_len;
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = m512_const1_64( 0x8000000000000000ULL );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr64 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 1024 - bit_len;
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_one_64;
sc->buf[14] = m512_const1_64( th );
sc->buf[15] = m512_const1_64( tl );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
void
blake512_8way_update(void *cc, const void *data, size_t len)
{
@@ -741,14 +1101,10 @@ blake512_8way_close(void *cc, void *dst)
V9 = m256_const1_64( CB1 ); \
VA = m256_const1_64( CB2 ); \
VB = m256_const1_64( CB3 ); \
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
m256_const1_64( CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
m256_const1_64( CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
m256_const1_64( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
m256_const1_64( CB7 ) ); \
VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
@@ -869,6 +1225,221 @@ void blake512_4way_compress( blake_4way_big_context *sc )
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
}
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data )
{
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
sc->buf[10] = m256_const1_64( 0x8000000000000000ULL );
sc->buf[11] = m256_zero;
sc->buf[12] = m256_zero;
sc->buf[13] = m256_one_64;
sc->buf[14] = m256_zero;
sc->buf[15] = m256_const1_64( 80*8 );
// build working variables
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = m256_const1_64( CB0 );
V9 = m256_const1_64( CB1 );
VA = m256_const1_64( CB2 );
VB = m256_const1_64( CB3 );
VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
VE = _mm256_set1_epi64x( CB6 );
VF = _mm256_set1_epi64x( CB7 );
// round 0
GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
GB_4WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD );
GB_4WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE );
GB_4WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF );
// G4 skip nonce
V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
V0 = _mm256_add_epi64( V0, V5 );
GB_4WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
GB_4WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD );
GB_4WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE );
// round 1
// G1
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ),
sc->buf[ 4] ) );
// G2
V2 = _mm256_add_epi64( V2, V6 );
// G3
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
midstate[ 1] = V1;
midstate[ 2] = V2;
midstate[ 3] = V3;
midstate[ 4] = V4;
midstate[ 5] = V5;
midstate[ 6] = V6;
midstate[ 7] = V7;
midstate[ 8] = V8;
midstate[ 9] = V9;
midstate[10] = VA;
midstate[11] = VB;
midstate[12] = VC;
midstate[13] = VD;
midstate[14] = VE;
midstate[15] = VF;
}
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i h[8] __attribute__ ((aligned (64)));
// Load data with new nonce
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
M2 = sc->buf[ 2];
M3 = sc->buf[ 3];
M4 = sc->buf[ 4];
M5 = sc->buf[ 5];
M6 = sc->buf[ 6];
M7 = sc->buf[ 7];
M8 = sc->buf[ 8];
M9 = nonce;
MA = sc->buf[10];
MB = sc->buf[11];
MC = sc->buf[12];
MD = sc->buf[13];
ME = sc->buf[14];
MF = sc->buf[15];
V0 = midstate[ 0];
V1 = midstate[ 1];
V2 = midstate[ 2];
V3 = midstate[ 3];
V4 = midstate[ 4];
V5 = midstate[ 5];
V6 = midstate[ 6];
V7 = midstate[ 7];
V8 = midstate[ 8];
V9 = midstate[ 9];
VA = midstate[10];
VB = midstate[11];
VC = midstate[12];
VD = midstate[13];
VE = midstate[14];
VF = midstate[15];
// finish round 0, with the nonce now available
V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
_mm256_set1_epi64x( CB8 ), M9 ) );
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
// Round 1
// G0
GB_4WAY(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);
// G1
V1 = _mm256_add_epi64( V1, V5 );
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
// G2
V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
_mm256_set1_epi64x( CBF ), M9 ) );
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), MF ), V6 ) );
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
// G3
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );
// G4, G5, G6, G7
GB_4WAY(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
GB_4WAY(Mx(1, A), Mx(1, B), CBx(1, A), CBx(1, B), V1, V6, VB, VC);
GB_4WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
GB_4WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
ROUND_B_4WAY(6);
ROUND_B_4WAY(7);
ROUND_B_4WAY(8);
ROUND_B_4WAY(9);
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
h[0] = mm256_xor3( V8, V0, sc->H[0] );
h[1] = mm256_xor3( V9, V1, sc->H[1] );
h[2] = mm256_xor3( VA, V2, sc->H[2] );
h[3] = mm256_xor3( VB, V3, sc->H[3] );
h[4] = mm256_xor3( VC, V4, sc->H[4] );
h[5] = mm256_xor3( VD, V5, sc->H[5] );
h[6] = mm256_xor3( VE, V6, sc->H[6] );
h[7] = mm256_xor3( VF, V7, sc->H[7] );
// bswap final hash
mm256_block_bswap_64( (__m256i*)hash, h );
}
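A minimal usage sketch of the prehash/final split above (an illustration, not part of the diff; vdata, hash, first_nonce, last_nonce and the nonce construction are assumptions):
// Compute the nonce-independent work once per data block, then reuse the
// midstate for every group of 4 nonces.
__m256i midstate[16] __attribute__ ((aligned (64)));
__m256i hash[8] __attribute__ ((aligned (64)));
blake_4way_big_context ctx;
blake512_4way_prehash_le( &ctx, midstate, vdata );  // vdata: 4-lane 80-byte data
for ( uint32_t n = first_nonce; n < last_nonce; n += 4 )
{
   // per-lane values of the header word that carries the nonce (assumed form)
   const __m256i nonce = _mm256_set_epi64x( n+3, n+2, n+1, n );
   blake512_4way_final_le( &ctx, hash, nonce, midstate );
   // ... test the 4 lanes of hash against the target ...
}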
void blake512_4way_init( blake_4way_big_context *sc )
{
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );

View File

@@ -70,7 +70,10 @@ void decred_be_build_stratum_request( char *req, struct work *work,
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
#if !defined(min)
#define min(a,b) ((a) > (b) ? (b) : (a))
#endif
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{

View File

@@ -630,6 +630,69 @@ static const sph_u64 CB[16] = {
H7 ^= S3 ^ V7 ^ VF; \
} while (0)
#define COMPRESS32_LE do { \
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = S0 ^ CS0; \
V9 = S1 ^ CS1; \
VA = S2 ^ CS2; \
VB = S3 ^ CS3; \
VC = T0 ^ CS4; \
VD = T0 ^ CS5; \
VE = T1 ^ CS6; \
VF = T1 ^ CS7; \
M0 = *((uint32_t*)(buf + 0)); \
M1 = *((uint32_t*)(buf + 4)); \
M2 = *((uint32_t*)(buf + 8)); \
M3 = *((uint32_t*)(buf + 12)); \
M4 = *((uint32_t*)(buf + 16)); \
M5 = *((uint32_t*)(buf + 20)); \
M6 = *((uint32_t*)(buf + 24)); \
M7 = *((uint32_t*)(buf + 28)); \
M8 = *((uint32_t*)(buf + 32)); \
M9 = *((uint32_t*)(buf + 36)); \
MA = *((uint32_t*)(buf + 40)); \
MB = *((uint32_t*)(buf + 44)); \
MC = *((uint32_t*)(buf + 48)); \
MD = *((uint32_t*)(buf + 52)); \
ME = *((uint32_t*)(buf + 56)); \
MF = *((uint32_t*)(buf + 60)); \
ROUND_S(0); \
ROUND_S(1); \
ROUND_S(2); \
ROUND_S(3); \
ROUND_S(4); \
ROUND_S(5); \
ROUND_S(6); \
ROUND_S(7); \
if (BLAKE32_ROUNDS == 14) { \
ROUND_S(8); \
ROUND_S(9); \
ROUND_S(0); \
ROUND_S(1); \
ROUND_S(2); \
ROUND_S(3); \
} \
H0 ^= S0 ^ V0 ^ V8; \
H1 ^= S1 ^ V1 ^ V9; \
H2 ^= S2 ^ V2 ^ VA; \
H3 ^= S3 ^ V3 ^ VB; \
H4 ^= S0 ^ V4 ^ VC; \
H5 ^= S1 ^ V5 ^ VD; \
H6 ^= S2 ^ V6 ^ VE; \
H7 ^= S3 ^ V7 ^ VF; \
} while (0)
#endif
#if SPH_64
@@ -843,6 +906,45 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)
sc->ptr = ptr;
}
static void
blake32_le(sph_blake_small_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
DECL_STATE32
buf = sc->buf;
ptr = sc->ptr;
if (len < (sizeof sc->buf) - ptr) {
memcpy(buf + ptr, data, len);
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32(sc);
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
ptr += clen;
data = (const unsigned char *)data + clen;
len -= clen;
if (ptr == sizeof sc->buf) {
if ((T0 = SPH_T32(T0 + 512)) < 512)
T1 = SPH_T32(T1 + 1);
COMPRESS32_LE;
ptr = 0;
}
}
WRITE_STATE32(sc);
sc->ptr = ptr;
}
static void
blake32_close(sph_blake_small_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
@@ -1050,6 +1152,12 @@ sph_blake256(void *cc, const void *data, size_t len)
blake32(cc, data, len);
}
void
sph_blake256_update_le(void *cc, const void *data, size_t len)
{
blake32_le(cc, data, len);
}
/* see sph_blake.h */
void
sph_blake256_close(void *cc, void *dst)

View File

@@ -198,6 +198,7 @@ void sph_blake256_init(void *cc);
* @param len the input data length (in bytes)
*/
void sph_blake256(void *cc, const void *data, size_t len);
void sph_blake256_update_le(void *cc, const void *data, size_t len);
/**
* Terminate the current BLAKE-256 computation and output the result into

View File

@@ -30,16 +30,10 @@
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "algo/sha/sph_types.h"
#include "sph_blake2b.h"
// Cyclic right rotation.
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
// Little-endian byte access.
#define B2B_GET64(p) \
@@ -54,45 +48,131 @@
// G Mixing function.
#define B2B_G(a, b, c, d, x, y) { \
v[a] = v[a] + v[b] + x; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 24); \
v[a] = v[a] + v[b] + y; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 63); }
#if defined(__AVX2__)
#define BLAKE2B_G( R, Sa, Sb, Sc, Sd, Na, Nb ) \
{ \
V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
_mm256_set_epi64x( m[ sigma[R][Sd] ], m[ sigma[R][Sc] ], \
m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), Na ); \
V[2] = _mm256_add_epi64( V[2], V[3] ); \
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), Nb ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m256i *V = (__m256i*)v; \
BLAKE2B_G( R, 0, 2, 4, 6, 32, 24 ); \
BLAKE2B_G( R, 1, 3, 5, 7, 16, 63 ); \
V[3] = mm256_shufll_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shuflr_64( V[1] ); \
BLAKE2B_G( R, 8, 10, 12, 14, 32, 24 ); \
BLAKE2B_G( R, 9, 11, 13, 15, 16, 63 ); \
V[3] = mm256_shuflr_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shufll_64( V[1] ); \
}
#elif defined(__SSSE3__)
#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb, Na, Nb ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
Vd = mm128_ror_64( _mm_xor_si128( Vd, Va ), Na ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), Nb ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m128i *V = (__m128i*)v; \
__m128i V2, V3, V6, V7; \
BLAKE2B_G( R, V[0], V[2], V[4], V[6], 0, 2, 32, 24 ); \
BLAKE2B_G( R, V[0], V[2], V[4], V[6], 1, 3, 16, 63 ); \
BLAKE2B_G( R, V[1], V[3], V[5], V[7], 4, 6, 32, 24 ); \
BLAKE2B_G( R, V[1], V[3], V[5], V[7], 5, 7, 16, 63 ); \
V2 = mm128_shufl2r_64( V[2], V[3] ); \
V3 = mm128_shufl2r_64( V[3], V[2] ); \
V6 = mm128_shufl2l_64( V[6], V[7] ); \
V7 = mm128_shufl2l_64( V[7], V[6] ); \
BLAKE2B_G( R, V[0], V2, V[5], V6, 8, 10, 32, 24 ); \
BLAKE2B_G( R, V[0], V2, V[5], V6, 9, 11, 16, 63 ); \
BLAKE2B_G( R, V[1], V3, V[4], V7, 12, 14, 32, 24 ); \
BLAKE2B_G( R, V[1], V3, V[4], V7, 13, 15, 16, 63 ); \
V[2] = mm128_shufl2l_64( V2, V3 ); \
V[3] = mm128_shufl2l_64( V3, V2 ); \
V[6] = mm128_shufl2r_64( V6, V7 ); \
V[7] = mm128_shufl2r_64( V7, V6 ); \
}
#else
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb ) \
{ \
Va = Va + Vb + m[ sigma[R][Sa] ]; \
Vd = ROTR64( Vd ^ Va, 32 ); \
Vc = Vc + Vd; \
Vb = ROTR64( Vb ^ Vc, 24 ); \
Va = Va + Vb + m[ sigma[R][Sb] ]; \
Vd = ROTR64( Vd ^ Va, 16 ); \
Vc = Vc + Vd; \
Vb = ROTR64( Vb ^ Vc, 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
BLAKE2B_G( R, v[ 0], v[ 4], v[ 8], v[12], 0, 1 ); \
BLAKE2B_G( R, v[ 1], v[ 5], v[ 9], v[13], 2, 3 ); \
BLAKE2B_G( R, v[ 2], v[ 6], v[10], v[14], 4, 5 ); \
BLAKE2B_G( R, v[ 3], v[ 7], v[11], v[15], 6, 7 ); \
BLAKE2B_G( R, v[ 0], v[ 5], v[10], v[15], 8, 9 ); \
BLAKE2B_G( R, v[ 1], v[ 6], v[11], v[12], 10, 11 ); \
BLAKE2B_G( R, v[ 2], v[ 7], v[ 8], v[13], 12, 13 ); \
BLAKE2B_G( R, v[ 3], v[ 4], v[ 9], v[14], 14, 15 ); \
}
#endif
// Initialization Vector.
static const uint64_t blake2b_iv[8] = {
static const uint64_t blake2b_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
static const uint8_t sigma[12][16] __attribute__ ((aligned (32))) =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
// Compression function. "last" flag indicates last block.
static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
{
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
uint64_t v[16], m[16];
uint64_t v[16] __attribute__ ((aligned (32)));
uint64_t m[16] __attribute__ ((aligned (32)));
int i;
for (i = 0; i < 8; i++) { // init work variables
v[i] = ctx->h[i];
@@ -106,16 +186,8 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
for (i = 0; i < 16; i++) // get little-endian words
m[i] = B2B_GET64(&ctx->b[8 * i]);
for (i = 0; i < 12; i++) { // twelve rounds
B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
}
for (i = 0; i < 12; i++)
BLAKE2B_ROUND( i );
for( i = 0; i < 8; ++i )
ctx->h[i] ^= v[i] ^ v[i + 8];

View File

@@ -594,9 +594,6 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
#define rb6(x) mm256_rol_64( x, 43 )
#define rb7(x) mm256_rol_64( x, 53 )
#define rol_off_64( M, j ) \
mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
#define add_elt_b( mj0, mj3, mj10, h, K ) \
_mm256_xor_si256( h, _mm256_add_epi64( K, \
_mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
@@ -732,8 +729,23 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
__m256i mj[16];
for ( i = 0; i < 16; i++ )
mj[i] = rol_off_64( M, i );
mj[ 0] = mm256_rol_64( M[ 0], 1 );
mj[ 1] = mm256_rol_64( M[ 1], 2 );
mj[ 2] = mm256_rol_64( M[ 2], 3 );
mj[ 3] = mm256_rol_64( M[ 3], 4 );
mj[ 4] = mm256_rol_64( M[ 4], 5 );
mj[ 5] = mm256_rol_64( M[ 5], 6 );
mj[ 6] = mm256_rol_64( M[ 6], 7 );
mj[ 7] = mm256_rol_64( M[ 7], 8 );
mj[ 8] = mm256_rol_64( M[ 8], 9 );
mj[ 9] = mm256_rol_64( M[ 9], 10 );
mj[10] = mm256_rol_64( M[10], 11 );
mj[11] = mm256_rol_64( M[11], 12 );
mj[12] = mm256_rol_64( M[12], 13 );
mj[13] = mm256_rol_64( M[13], 14 );
mj[14] = mm256_rol_64( M[14], 15 );
mj[15] = mm256_rol_64( M[15], 16 );
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
(const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
@@ -1034,9 +1046,6 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#define r8b6(x) mm512_rol_64( x, 43 )
#define r8b7(x) mm512_rol_64( x, 53 )
#define rol8w_off_64( M, j ) \
mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
#define add_elt_b8( mj0, mj3, mj10, h, K ) \
_mm512_xor_si512( h, _mm512_add_epi64( K, \
_mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
@@ -1171,41 +1180,73 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
__m512i mj[16];
for ( i = 0; i < 16; i++ )
mj[i] = rol8w_off_64( M, i );
uint64_t K = 16 * 0x0555555555555555ULL;
mj[ 0] = mm512_rol_64( M[ 0], 1 );
mj[ 1] = mm512_rol_64( M[ 1], 2 );
mj[ 2] = mm512_rol_64( M[ 2], 3 );
mj[ 3] = mm512_rol_64( M[ 3], 4 );
mj[ 4] = mm512_rol_64( M[ 4], 5 );
mj[ 5] = mm512_rol_64( M[ 5], 6 );
mj[ 6] = mm512_rol_64( M[ 6], 7 );
mj[ 7] = mm512_rol_64( M[ 7], 8 );
mj[ 8] = mm512_rol_64( M[ 8], 9 );
mj[ 9] = mm512_rol_64( M[ 9], 10 );
mj[10] = mm512_rol_64( M[10], 11 );
mj[11] = mm512_rol_64( M[11], 12 );
mj[12] = mm512_rol_64( M[12], 13 );
mj[13] = mm512_rol_64( M[13], 14 );
mj[14] = mm512_rol_64( M[14], 15 );
mj[15] = mm512_rol_64( M[15], 16 );
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
(const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
(const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
(const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
(const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
(const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
(const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
(const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
(const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
(const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
(const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
(const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
(const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
(const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
(const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
(const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
K += 0x0555555555555555ULL;
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
(const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) );
(const __m512i)_mm512_set1_epi64( K ) );
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );

View File

@@ -54,14 +54,12 @@ static void transform_4way( cube_4way_context *sp )
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
x7 = _mm512_add_epi32( x3, x7 );
y0 = x0;
y1 = x1;
x0 = mm512_rol_32( x2, 7 );
x1 = mm512_rol_32( x3, 7 );
x2 = mm512_rol_32( y0, 7 );
x3 = mm512_rol_32( y1, 7 );
x0 = _mm512_xor_si512( x0, x4 );
x1 = _mm512_xor_si512( x1, x5 );
y0 = mm512_rol_32( x2, 7 );
y1 = mm512_rol_32( x3, 7 );
x2 = mm512_rol_32( x0, 7 );
x3 = mm512_rol_32( x1, 7 );
x0 = _mm512_xor_si512( y0, x4 );
x1 = _mm512_xor_si512( y1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap128_64( x4 );
@@ -72,15 +70,13 @@ static void transform_4way( cube_4way_context *sp )
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
x7 = _mm512_add_epi32( x3, x7 );
y0 = x0;
y1 = x2;
x0 = mm512_rol_32( x1, 11 );
x1 = mm512_rol_32( y0, 11 );
x2 = mm512_rol_32( x3, 11 );
x3 = mm512_rol_32( y1, 11 );
x0 = _mm512_xor_si512( x0, x4 );
y0 = mm512_rol_32( x1, 11 );
x1 = mm512_rol_32( x0, 11 );
y1 = mm512_rol_32( x3, 11 );
x3 = mm512_rol_32( x2, 11 );
x0 = _mm512_xor_si512( y0, x4 );
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x2 = _mm512_xor_si512( y1, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap64_32( x4 );
x5 = mm512_swap64_32( x5 );
@@ -131,83 +127,67 @@ static void transform_4way_2buf( cube_4way_2buf_context *sp )
{
x4 = _mm512_add_epi32( x0, x4 );
y4 = _mm512_add_epi32( y0, y4 );
tx0 = x0;
ty0 = y0;
x5 = _mm512_add_epi32( x1, x5 );
y5 = _mm512_add_epi32( y1, y5 );
tx1 = x1;
ty1 = y1;
x0 = mm512_rol_32( x2, 7 );
y0 = mm512_rol_32( y2, 7 );
tx0 = mm512_rol_32( x2, 7 );
ty0 = mm512_rol_32( y2, 7 );
tx1 = mm512_rol_32( x3, 7 );
ty1 = mm512_rol_32( y3, 7 );
x6 = _mm512_add_epi32( x2, x6 );
y6 = _mm512_add_epi32( y2, y6 );
x1 = mm512_rol_32( x3, 7 );
y1 = mm512_rol_32( y3, 7 );
y6 = _mm512_add_epi32( y2, y6 );
x7 = _mm512_add_epi32( x3, x7 );
y7 = _mm512_add_epi32( y3, y7 );
x2 = mm512_rol_32( tx0, 7 );
y2 = mm512_rol_32( ty0, 7 );
x0 = _mm512_xor_si512( x0, x4 );
y0 = _mm512_xor_si512( y0, y4 );
x2 = mm512_rol_32( x0, 7 );
y2 = mm512_rol_32( y0, 7 );
x3 = mm512_rol_32( x1, 7 );
y3 = mm512_rol_32( y1, 7 );
x0 = _mm512_xor_si512( tx0, x4 );
y0 = _mm512_xor_si512( ty0, y4 );
x1 = _mm512_xor_si512( tx1, x5 );
y1 = _mm512_xor_si512( ty1, y5 );
x4 = mm512_swap128_64( x4 );
x3 = mm512_rol_32( tx1, 7 );
y3 = mm512_rol_32( ty1, 7 );
y4 = mm512_swap128_64( y4 );
x1 = _mm512_xor_si512( x1, x5 );
y1 = _mm512_xor_si512( y1, y5 );
x5 = mm512_swap128_64( x5 );
y5 = mm512_swap128_64( y5 );
x2 = _mm512_xor_si512( x2, x6 );
y2 = _mm512_xor_si512( y2, y6 );
y5 = mm512_swap128_64( y5 );
x3 = _mm512_xor_si512( x3, x7 );
y3 = _mm512_xor_si512( y3, y7 );
x6 = mm512_swap128_64( x6 );
y6 = mm512_swap128_64( y6 );
x7 = mm512_swap128_64( x7 );
y7 = mm512_swap128_64( y7 );
x4 = _mm512_add_epi32( x0, x4 );
y4 = _mm512_add_epi32( y0, y4 );
y6 = mm512_swap128_64( y6 );
x5 = _mm512_add_epi32( x1, x5 );
y5 = _mm512_add_epi32( y1, y5 );
x7 = mm512_swap128_64( x7 );
tx0 = mm512_rol_32( x1, 11 );
ty0 = mm512_rol_32( y1, 11 );
tx1 = mm512_rol_32( x3, 11 );
ty1 = mm512_rol_32( y3, 11 );
x6 = _mm512_add_epi32( x2, x6 );
y6 = _mm512_add_epi32( y2, y6 );
tx0 = x0;
ty0 = y0;
y7 = mm512_swap128_64( y7 );
tx1 = x2;
ty1 = y2;
x0 = mm512_rol_32( x1, 11 );
y0 = mm512_rol_32( y1, 11 );
x7 = _mm512_add_epi32( x3, x7 );
y7 = _mm512_add_epi32( y3, y7 );
x1 = mm512_rol_32( tx0, 11 );
y1 = mm512_rol_32( ty0, 11 );
x0 = _mm512_xor_si512( x0, x4 );
x4 = mm512_swap64_32( x4 );
y0 = _mm512_xor_si512( y0, y4 );
x2 = mm512_rol_32( x3, 11 );
y4 = mm512_swap64_32( y4 );
y2 = mm512_rol_32( y3, 11 );
x1 = mm512_rol_32( x0, 11 );
y1 = mm512_rol_32( y0, 11 );
x3 = mm512_rol_32( x2, 11 );
y3 = mm512_rol_32( y2, 11 );
x0 = _mm512_xor_si512( tx0, x4 );
y0 = _mm512_xor_si512( ty0, y4 );
x1 = _mm512_xor_si512( x1, x5 );
x5 = mm512_swap64_32( x5 );
y1 = _mm512_xor_si512( y1, y5 );
x3 = mm512_rol_32( tx1, 11 );
x4 = mm512_swap64_32( x4 );
y4 = mm512_swap64_32( y4 );
x5 = mm512_swap64_32( x5 );
y5 = mm512_swap64_32( y5 );
y3 = mm512_rol_32( ty1, 11 );
x2 = _mm512_xor_si512( x2, x6 );
x6 = mm512_swap64_32( x6 );
y2 = _mm512_xor_si512( y2, y6 );
y6 = mm512_swap64_32( y6 );
x2 = _mm512_xor_si512( tx1, x6 );
y2 = _mm512_xor_si512( ty1, y6 );
x3 = _mm512_xor_si512( x3, x7 );
x7 = mm512_swap64_32( x7 );
y3 = _mm512_xor_si512( y3, y7 );
x6 = mm512_swap64_32( x6 );
y6 = mm512_swap64_32( y6 );
x7 = mm512_swap64_32( x7 );
y7 = mm512_swap64_32( y7 );
}
@@ -241,14 +221,6 @@ int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
sp->rounds = rounds;
sp->pos = 0;
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
h[ 3] = m512_const1_128( iv[3] );
h[ 4] = m512_const1_128( iv[4] );
h[ 5] = m512_const1_128( iv[5] );
h[ 6] = m512_const1_128( iv[6] );
h[ 7] = m512_const1_128( iv[7] );
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
@@ -489,33 +461,29 @@ static void transform_2way( cube_2way_context *sp )
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x0;
y1 = x1;
ROL2( x0, x1, x2, x3, 7 );
ROL2( x2, x3, y0, y1, 7 );
x0 = _mm256_xor_si256( x0, x4 );
ROL2( y0, y1, x2, x3, 7 );
ROL2( x2, x3, x0, x1, 7 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( y1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap128_64( x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x5 = mm256_swap128_64( x5 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = _mm256_add_epi32( x0, x4 );
x6 = mm256_swap128_64( x6 );
y0 = x0;
x5 = _mm256_add_epi32( x1, x5 );
x7 = mm256_swap128_64( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
y1 = x2;
ROL2( x0, x1, x1, y0, 11 );
x7 = _mm256_add_epi32( x3, x7 );
ROL2( x2, x3, x3, y1, 11 );
x0 = _mm256_xor_si256( x0, x4 );
x4 = mm256_swap64_32( x4 );
ROL2( y0, x1, x1, x0, 11 );
ROL2( y1, x3, x3, x2, 11 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x5 = mm256_swap64_32( x5 );
x2 = _mm256_xor_si256( x2, x6 );
x6 = mm256_swap64_32( x6 );
x2 = _mm256_xor_si256( y1, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_32( x4 );
x5 = mm256_swap64_32( x5 );
x6 = mm256_swap64_32( x6 );
x7 = mm256_swap64_32( x7 );
}
@@ -540,14 +508,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
sp->rounds = rounds;
sp->pos = 0;
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
@@ -560,7 +520,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
return 0;
}
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
{
const int len = size >> 4;

View File

@@ -15,11 +15,11 @@
struct _cubehashParam
{
__m128i _ALIGN(64) x[8]; // aligned for __m512i
int hashlen; // __m128i
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
__m128i _ALIGN(64) x[8]; // aligned for __m256i
};
typedef struct _cubehashParam cubehashParam;

View File

@@ -156,14 +156,12 @@ int groestl512_full( hashState_groestl* ctx, void* output,
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
@@ -175,8 +173,8 @@ int groestl512_full( hashState_groestl* ctx, void* output,
// copy any remaining data to the buffer; it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---

View File

@@ -227,12 +227,10 @@ int groestl256_full( hashState_groestl256* ctx,
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256( ctx->chaining );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m128i* in = (__m128i*)input;
@@ -245,7 +243,7 @@ int groestl256_full( hashState_groestl256* ctx,
// cryptonight has a 200 byte input, an odd number of __m128i;
// the remainder is only 8 bytes, i.e. a u64.
if ( databitlen % 128 !=0 )
if ( databitlen % 128 != 0 )
{
// must be cryptonight, copy 64 bits of data
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
@@ -255,8 +253,8 @@ int groestl256_full( hashState_groestl256* ctx,
{
// Copy any remaining data to buffer for final transform
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
}
//--- final ---

View File

@@ -50,7 +50,6 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
const int len = (int)datalen >> 4;
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE256;
__m512i* in = (__m512i*)input;
int i;
@@ -67,7 +66,6 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -76,11 +74,10 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
// copy any remaining data to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---
@@ -206,7 +203,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
const int len = (int)datalen >> 4;
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE256;
__m256i* in = (__m256i*)input;
int i;
@@ -223,7 +219,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -232,11 +227,10 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
TF512_2way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
// copy any remaining data to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// use i as rem_ptr in final
//--- final ---

View File

@@ -99,7 +99,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
memset_zero_512( ctx->buffer, SIZE512 );
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -108,8 +107,7 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
i += ctx->rem_ptr;
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// --- close ---
@@ -222,7 +220,6 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
memset_zero_256( ctx->buffer, SIZE512 );
ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
@@ -231,8 +228,7 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
i += ctx->rem_ptr;
ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
// --- close ---

View File

@@ -545,21 +545,23 @@ static const sph_u32 T512[64][16] = {
#define sE c7
#define sF m7
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Hamsi 8 way AVX512
// Tested on i9-9940x movepi64_mask is slow, cmple_epi64_mask with zero
// produces the same result but is faster.
// Intel says _mm512_movepi64_mask has (1L/1T) timing while
// _mm512_cmplt_epi64_mask has (3L/1T) timing; however, when tested hashing X13
// on i9-9940x, cmplt with zero was 3% faster than movepi.
#define INPUT_BIG8 \
do { \
__m512i db = _mm512_ror_epi64( *buf, 1 ); \
const __m512i zero = m512_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__mmask8 dm = _mm512_cmplt_epi64_mask( db, m512_zero ); \
const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
@@ -573,29 +575,6 @@ do { \
} \
} while (0)
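As a minimal sketch of the two equivalent sign-bit extractions weighed in the comment above (assuming an AVX512DQ target; the function names are illustrative):
// Both forms reduce each 64-bit lane of v to its sign bit in an 8-bit mask.
static inline __mmask8 signs_movepi( const __m512i v )
{  return _mm512_movepi64_mask( v );  }
static inline __mmask8 signs_cmplt( const __m512i v )
{  return _mm512_cmplt_epi64_mask( v, _mm512_setzero_si512() );  }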
/*
#define INPUT_BIG8 \
do { \
__m512i db = *buf; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m512i dm = mm512_negate_64( _mm512_and_si512( db, m512_one_64 ) ); \
m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \
m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \
m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \
m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \
m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \
tp += 8; \
db = _mm512_srli_epi64( db, 1 ); \
} \
} while (0)
*/
#define SBOX8( a, b, c, d ) \
do { \
__m512i t; \
@@ -632,199 +611,192 @@ do { \
#define READ_STATE_BIG8(sc) \
do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
c0 = sc->h[0]; \
c1 = sc->h[1]; \
c2 = sc->h[2]; \
c3 = sc->h[3]; \
c4 = sc->h[4]; \
c5 = sc->h[5]; \
c6 = sc->h[6]; \
c7 = sc->h[7]; \
} while (0)
#define WRITE_STATE_BIG8(sc) \
do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
sc->h[0] = c0; \
sc->h[1] = c1; \
sc->h[2] = c2; \
sc->h[3] = c3; \
sc->h[4] = c4; \
sc->h[5] = c5; \
sc->h[6] = c6; \
sc->h[7] = c7; \
} while (0)
#define ROUND_BIG8( alpha ) \
do { \
__m512i t0, t1, t2, t3; \
s0 = _mm512_xor_si512( s0, alpha[ 0] ); \
s1 = _mm512_xor_si512( s1, alpha[ 1] ); \
s2 = _mm512_xor_si512( s2, alpha[ 2] ); \
s3 = _mm512_xor_si512( s3, alpha[ 3] ); \
s4 = _mm512_xor_si512( s4, alpha[ 4] ); \
s5 = _mm512_xor_si512( s5, alpha[ 5] ); \
s6 = _mm512_xor_si512( s6, alpha[ 6] ); \
s7 = _mm512_xor_si512( s7, alpha[ 7] ); \
s8 = _mm512_xor_si512( s8, alpha[ 8] ); \
s9 = _mm512_xor_si512( s9, alpha[ 9] ); \
sA = _mm512_xor_si512( sA, alpha[10] ); \
sB = _mm512_xor_si512( sB, alpha[11] ); \
sC = _mm512_xor_si512( sC, alpha[12] ); \
sD = _mm512_xor_si512( sD, alpha[13] ); \
sE = _mm512_xor_si512( sE, alpha[14] ); \
sF = _mm512_xor_si512( sF, alpha[15] ); \
s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \
s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \
s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \
s3 = _mm512_xor_si512( s3, alpha[ 3] ); /* c1 */ \
s4 = _mm512_xor_si512( s4, alpha[ 4] ); /* c2 */ \
s5 = _mm512_xor_si512( s5, alpha[ 5] ); /* m2 */ \
s6 = _mm512_xor_si512( s6, alpha[ 6] ); /* c3 */ \
s7 = _mm512_xor_si512( s7, alpha[ 7] ); /* m3 */ \
s8 = _mm512_xor_si512( s8, alpha[ 8] ); /* m4 */ \
s9 = _mm512_xor_si512( s9, alpha[ 9] ); /* c4 */ \
sA = _mm512_xor_si512( sA, alpha[10] ); /* m5 */ \
sB = _mm512_xor_si512( sB, alpha[11] ); /* c5 */ \
sC = _mm512_xor_si512( sC, alpha[12] ); /* c6 */ \
sD = _mm512_xor_si512( sD, alpha[13] ); /* m6 */ \
sE = _mm512_xor_si512( sE, alpha[14] ); /* c7 */ \
sF = _mm512_xor_si512( sF, alpha[15] ); /* m7 */ \
\
SBOX8( s0, s4, s8, sC ); \
SBOX8( s1, s5, s9, sD ); \
SBOX8( s2, s6, sA, sE ); \
SBOX8( s3, s7, sB, sF ); \
SBOX8( s0, s4, s8, sC ); /* ( m0, c2, m4, c6 ) */ \
SBOX8( s1, s5, s9, sD ); /* ( c0, m2, c4, m6 ) */ \
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
_mm512_bslli_epi128( s5, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
_mm512_bslli_epi128( sE, 4 ) ); \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t1, s9, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( s6, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
_mm512_bslli_epi128( sF, 4 ) ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
L8( s1, t1, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
_mm512_bslli_epi128( s7, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
_mm512_bslli_epi128( sC, 4 ) ); \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t1, sB, t3 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
_mm512_bslli_epi128( s4, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
L8( s3, t1, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
_mm512_bslli_epi128( sB, 4 ) ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
t3 = mm512_swap64_32( t3 ); \
L8( t0, t1, t2, t3 ); \
t3 = mm512_swap64_32( t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, sE ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
L8( t0, t1, t2, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, t0 ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t1 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t2 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
} while (0)
#define P_BIG8 \
do { \
__m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
} while (0)
#define PF_BIG8 \
do { \
__m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)6 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)7 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)8 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)9 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)10 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( (uint64_t)11 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
} while (0)
#define T_BIG8 \
do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
c7 = sc->h[ 7 ] = _mm512_xor_si512( sc->h[ 7 ], sB ); /* c5 */ \
c6 = sc->h[ 6 ] = _mm512_xor_si512( sc->h[ 6 ], sA ); /* m5 */ \
c5 = sc->h[ 5 ] = _mm512_xor_si512( sc->h[ 5 ], s9 ); /* c4 */ \
c4 = sc->h[ 4 ] = _mm512_xor_si512( sc->h[ 4 ], s8 ); /* m4 */ \
c3 = sc->h[ 3 ] = _mm512_xor_si512( sc->h[ 3 ], s3 ); /* c1 */ \
c2 = sc->h[ 2 ] = _mm512_xor_si512( sc->h[ 2 ], s2 ); /* m1 */ \
c1 = sc->h[ 1 ] = _mm512_xor_si512( sc->h[ 1 ], s1 ); /* c0 */ \
c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 ); /* m0 */ \
} while (0)
void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
@@ -861,7 +833,6 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
WRITE_STATE_BIG8( sc );
}
void hamsi512_8way_init( hamsi_8way_big_context *sc )
{
sc->partial_len = 0;
@@ -911,11 +882,12 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
#define INPUT_BIG \
do { \
__m256i db = *buf; \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
for ( int u = 0; u < 64; u++ ) \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 63; u >= 0; u-- ) \
{ \
__m256i dm = mm256_negate_64( _mm256_and_si256( db, m256_one_64 ) ); \
__m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
m256_const1_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
@@ -933,7 +905,6 @@ do { \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
m256_const1_64( tp[7] ) ) ); \
tp += 8; \
db = _mm256_srli_epi64( db, 1 ); \
} \
} while (0)
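A minimal sketch of the bit test used in the loop above: shifting the wanted bit into the sign position and comparing against zero broadcasts it to a full-width lane mask (the function name and the fixed bit index are illustrative):
// Broadcast bit 5 of every 64-bit lane of v to an all-ones / all-zeros mask;
// the sign of ( v << 58 ) is bit 5 of v, the same pattern as INPUT_BIG with u = 58.
static inline __m256i lane_bit5_mask( const __m256i v )
{
   return _mm256_cmpgt_epi64( _mm256_setzero_si256(),
                              _mm256_slli_epi64( v, 58 ) );
}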
@@ -982,47 +953,28 @@ do { \
#define READ_STATE_BIG(sc) \
do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
c0 = sc->h[0]; \
c1 = sc->h[1]; \
c2 = sc->h[2]; \
c3 = sc->h[3]; \
c4 = sc->h[4]; \
c5 = sc->h[5]; \
c6 = sc->h[6]; \
c7 = sc->h[7]; \
} while (0)
#define WRITE_STATE_BIG(sc) \
do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
sc->h[0] = c0; \
sc->h[1] = c1; \
sc->h[2] = c2; \
sc->h[3] = c3; \
sc->h[4] = c4; \
sc->h[5] = c5; \
sc->h[6] = c6; \
sc->h[7] = c7; \
} while (0)
/*
#define s0 m0
#define s1 c0
#define s2 m1
#define s3 c1
#define s4 c2
#define s5 m2
#define s6 c3
#define s7 m3
#define s8 m4
#define s9 c4
#define sA m5
#define sB c5
#define sC c6
#define sD m6
#define sE c7
#define sF m7
*/
#define ROUND_BIG( alpha ) \
do { \
__m256i t0, t1, t2, t3; \
@@ -1048,151 +1000,145 @@ do { \
SBOX( s2, s6, sA, sE ); \
SBOX( s3, s7, sB, sF ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \
_mm256_bslli_epi128( s5, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \
_mm256_bslli_epi128( sE, 4 ), 0xAA ); \
s4 = mm256_swap64_32( s4 ); \
s5 = mm256_swap64_32( s5 ); \
sD = mm256_swap64_32( sD ); \
sE = mm256_swap64_32( sE ); \
t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \
t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \
L( s0, t1, s9, t3 ); \
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \
s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \
sD = _mm256_blend_epi32( sD, t3, 0x55 ); \
sE = _mm256_blend_epi32( sE, t3, 0xaa ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
_mm256_bslli_epi128( s6, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \
_mm256_bslli_epi128( sF, 4 ), 0xAA ); \
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \
t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
L( s1, t1, sA, t3 ); \
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \
sE = _mm256_blend_epi32( sE, t3, 0x55 ); \
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \
_mm256_bslli_epi128( s7, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \
_mm256_bslli_epi128( sC, 4 ), 0xAA ); \
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \
t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \
L( s2, t1, sB, t3 ); \
s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \
s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \
sF = _mm256_blend_epi32( sF, t3, 0x55 ); \
sC = _mm256_blend_epi32( sC, t3, 0xaa ); \
s6 = mm256_swap64_32( s6 ); \
sF = mm256_swap64_32( sF ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \
_mm256_bslli_epi128( s4, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \
t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
L( s3, t1, s8, t3 ); \
s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \
s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \
sC = _mm256_blend_epi32( sC, t3, 0x55 ); \
sD = _mm256_blend_epi32( sD, t3, 0xaa ); \
s7 = mm256_swap64_32( s7 ); \
sC = mm256_swap64_32( sC ); \
\
t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \
t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \
t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \
_mm256_bslli_epi128( sB, 4 ), 0xAA ); \
t0 = _mm256_blend_epi32( s0, mm256_swap64_32( s8 ), 0xaa ); \
t1 = _mm256_blend_epi32( s1, s9, 0xaa ); \
t2 = _mm256_blend_epi32( mm256_swap64_32( s2 ), sA, 0xaa ); \
t3 = _mm256_blend_epi32( s3, sB, 0x55 ); \
t3 = mm256_swap64_32( t3 ); \
L( t0, t1, t2, t3 ); \
t3 = mm256_swap64_32( t3 ); \
s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \
s8 = _mm256_blend_epi32( s8, mm256_swap64_32( t0 ), 0x55 ); \
s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \
s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \
sA = _mm256_blend_epi32( sA, t2, 0xAA ); \
s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \
sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \
s9 = _mm256_blend_epi32( s9, t1, 0xaa ); \
s2 = _mm256_blend_epi32( s2, mm256_swap64_32( t2 ), 0xaa ); \
sA = _mm256_blend_epi32( sA, t2, 0xaa ); \
s3 = _mm256_blend_epi32( s3, t3, 0xaa ); \
sB = _mm256_blend_epi32( sB, t3, 0x55 ); \
\
t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \
t0 = _mm256_blend_epi32( s4, sC, 0xaa ); \
t1 = _mm256_blend_epi32( s5, sD, 0xaa ); \
t2 = _mm256_blend_epi32( s6, sE, 0xaa ); \
t3 = _mm256_blend_epi32( s7, sF, 0xaa ); \
L( t0, t1, t2, t3 ); \
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \
sC = _mm256_blend_epi32( sC, t0, 0xAA ); \
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \
s4 = _mm256_blend_epi32( s4, t0, 0x55 ); \
sC = _mm256_blend_epi32( sC, t0, 0xaa ); \
s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
sD = _mm256_blend_epi32( sD, t1, 0xaa ); \
s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \
sE = _mm256_blend_epi32( sE, t2, 0xaa ); \
s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
sF = _mm256_blend_epi32( sF, t3, 0xAA ); \
sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
s4 = mm256_swap64_32( s4 ); \
s5 = mm256_swap64_32( s5 ); \
sD = mm256_swap64_32( sD ); \
sE = mm256_swap64_32( sE ); \
} while (0)
#define P_BIG \
do { \
__m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_n )[0] ); \
alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
} while (0)
#define PF_BIG \
do { \
__m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)1 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)2 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)3 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)4 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)5 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)6 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)7 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)8 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)9 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)10 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( (uint64_t)11 << 32 ) \
^ ( (uint64_t*)alpha_f )[0] ); \
alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
} while (0)
#define T_BIG \
do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \
c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \
c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \
c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \
c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \
c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \
c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \
c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \
c7 = sc->h[ 7 ] = _mm256_xor_si256( sc->h[ 7 ], sB ); \
c6 = sc->h[ 6 ] = _mm256_xor_si256( sc->h[ 6 ], sA ); \
c5 = sc->h[ 5 ] = _mm256_xor_si256( sc->h[ 5 ], s9 ); \
c4 = sc->h[ 4 ] = _mm256_xor_si256( sc->h[ 4 ], s8 ); \
c3 = sc->h[ 3 ] = _mm256_xor_si256( sc->h[ 3 ], s3 ); \
c2 = sc->h[ 2 ] = _mm256_xor_si256( sc->h[ 2 ], s2 ); \
c1 = sc->h[ 1 ] = _mm256_xor_si256( sc->h[ 1 ], s1 ); \
c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \
} while (0)
void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
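The P_BIG and PF_BIG rewrites above cache the first alpha constant in A0 and fold the round number into its upper 32 bits, instead of re-reading alpha_n[0] / alpha_f[0] and recomputing the shift every round. A minimal scalar sketch of that per-round constant update (the helper name is hypothetical; the real code broadcasts the result with m256_const1_64):

#include <stdint.h>

/* Only alpha[0] changes between rounds: xor the round index, shifted into
   the high 32 bits, with the cached base constant A0. */
static inline uint64_t hamsi_alpha0_for_round( uint64_t a0_base, uint64_t round )
{
   return ( round << 32 ) ^ a0_base;
}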


@@ -45,6 +45,6 @@ void sha512Compute32b_parallel(
uint64_t *data[SHA512_PARALLEL_N],
uint64_t *digest[SHA512_PARALLEL_N]);
void sha512ProcessBlock(Sha512Context *context);
void sha512ProcessBlock(Sha512Context contexti[2] );
#endif


@@ -53,7 +53,8 @@ static const uint64_t RC[] = {
#define WRITE_STATE(sc)
#define MOV64(d, s) (d = s)
#define XOR64_IOTA XOR64
#define XOR64_IOTA XOR
#define LPAR (
#define RPAR )
@@ -71,14 +72,15 @@ static const uint64_t RC[] = {
// Targeted macros, keccak-macros.h is included for each target.
#define DECL64(x) __m512i x
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR(d, a, b) (d = _mm512_xor_si512(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
#define XOR3( d, a, b, c ) (d = mm512_xor3( a, b, c ))
#include "keccak-macros.c"
@@ -236,6 +238,7 @@ keccak512_8way_close(void *cc, void *dst)
#undef INPUT_BUF
#undef DECL64
#undef XOR64
#undef XOR
#undef AND64
#undef OR64
#undef NOT64
@@ -243,7 +246,7 @@ keccak512_8way_close(void *cc, void *dst)
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND
#undef XOR3
#endif // AVX512
// AVX2
@@ -255,13 +258,15 @@ keccak512_8way_close(void *cc, void *dst)
} while (0)
#define DECL64(x) __m256i x
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR(d, a, b) (d = _mm256_xor_si256(a,b))
#define XOR64 XOR
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))
#include "keccak-macros.c"
@@ -421,6 +426,7 @@ keccak512_4way_close(void *cc, void *dst)
#undef INPUT_BUF
#undef DECL64
#undef XOR64
#undef XOR
#undef AND64
#undef OR64
#undef NOT64
@@ -428,5 +434,6 @@ keccak512_4way_close(void *cc, void *dst)
#undef KECCAK_F_1600
#undef XOROR
#undef XORAND
#undef XOR3
#endif // AVX2


@@ -1,6 +1,19 @@
#ifdef TH_ELT
#undef TH_ELT
#endif
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
XOR3( tt0, d0, d1, d4 ); \
XOR( tt1, d2, d3 ); \
XOR( tt0, tt0, tt1 ); \
ROL64( tt0, tt0, 1 ); \
XOR3( tt1, c0, c1, c4 ); \
XOR3( tt0, tt0, c2, c3 ); \
XOR( t, tt0, tt1 ); \
} while (0)
/*
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
@@ -17,7 +30,7 @@
XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \
} while (0)
*/
#ifdef THETA
#undef THETA
#endif
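The new TH_ELT above folds pairs of dependent XOR64 calls into XOR3, which the AVX-512 build can map to a single ternary-logic instruction. A minimal sketch of a three-input XOR done that way (assuming the project's mm512_xor3 is implemented roughly like this; the actual simd utility header may differ):

#include <immintrin.h>

/* 0x96 is the truth table for a ^ b ^ c, so the whole three-input XOR
   compiles to a single vpternlogq. */
static inline __m512i xor3_512( const __m512i a, const __m512i b, const __m512i c )
{
   return _mm512_ternarylogic_epi64( a, b, c, 0x96 );
}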


@@ -13,8 +13,7 @@
#if defined (ALLIUM_16WAY)
typedef struct {
blake256_16way_context blake;
typedef union {
keccak256_8way_context keccak;
cube_4way_2buf_context cube;
skein256_8way_context skein;
@@ -25,41 +24,31 @@ typedef struct {
#endif
} allium_16way_ctx_holder;
static __thread allium_16way_ctx_holder allium_16way_ctx;
bool init_allium_16way_ctx()
{
keccak256_8way_init( &allium_16way_ctx.keccak );
skein256_8way_init( &allium_16way_ctx.skein );
return true;
}
void allium_16way_hash( void *state, const void *input )
static void allium_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t hash8[8] __attribute__ ((aligned (32)));
uint32_t hash9[8] __attribute__ ((aligned (32)));
uint32_t hash10[8] __attribute__ ((aligned (32)));
uint32_t hash11[8] __attribute__ ((aligned (32)));
uint32_t hash12[8] __attribute__ ((aligned (32)));
uint32_t hash13[8] __attribute__ ((aligned (32)));
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -69,7 +58,7 @@ void allium_16way_hash( void *state, const void *input )
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
// rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhashA);
keccak256_8way_init( &ctx.keccak );
@@ -152,6 +141,7 @@ void allium_16way_hash( void *state, const void *input )
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashA, 32 );
skein256_8way_close( &ctx.skein, vhashA );
skein256_8way_init( &ctx.skein );
@@ -199,6 +189,7 @@ void allium_16way_hash( void *state, const void *input )
groestl256_full( &ctx.groestl, state+416, hash13, 256 );
groestl256_full( &ctx.groestl, state+448, hash14, 256 );
groestl256_full( &ctx.groestl, state+480, hash15, 256 );
#endif
}
@@ -206,35 +197,72 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Prehash first block.
blake256_transform_le( phash, pdata, 512, 0 );
blake256_16way_init( &allium_16way_ctx.blake );
blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
// Interleave hash for second block prehash.
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces, add padding.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_16way_hash( hash, vdata );
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, hash+(lane<<3), mythr );
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
@@ -244,8 +272,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
#elif defined (ALLIUM_8WAY)
typedef struct {
blake256_8way_context blake;
typedef union {
keccak256_4way_context keccak;
cube_2way_context cube;
skein256_4way_context skein;
@@ -256,19 +283,11 @@ typedef struct {
#endif
} allium_8way_ctx_holder;
static __thread allium_8way_ctx_holder allium_8way_ctx;
bool init_allium_8way_ctx()
{
keccak256_4way_init( &allium_8way_ctx.keccak );
skein256_4way_init( &allium_8way_ctx.skein );
return true;
}
void allium_8way_hash( void *hash, const void *input )
static void allium_8way_hash( void *hash, const void *midstate_vars,
const void *midhash, const void *block )
{
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (32)));
uint64_t *hash0 = (uint64_t*)hash;
uint64_t *hash1 = (uint64_t*)hash+ 4;
uint64_t *hash2 = (uint64_t*)hash+ 8;
@@ -279,15 +298,14 @@ void allium_8way_hash( void *hash, const void *input )
uint64_t *hash7 = (uint64_t*)hash+28;
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhashA );
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
vhashA, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
keccak256_4way_close( &ctx.keccak, vhashA );
keccak256_4way_init( &ctx.keccak );
@@ -306,7 +324,6 @@ void allium_8way_hash( void *hash, const void *input )
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
intrlv_2x128( vhashA, hash0, hash1, 256 );
intrlv_2x128( vhashB, hash2, hash3, 256 );
cube_2way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
@@ -333,6 +350,7 @@ void allium_8way_hash( void *hash, const void *input )
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashA, 32 );
skein256_4way_close( &ctx.skein, vhashA );
skein256_4way_init( &ctx.skein );
@@ -341,8 +359,8 @@ void allium_8way_hash( void *hash, const void *input )
#if defined(__VAES__)
uint64_t vhashC[4*2] __attribute__ ((aligned (64)));
uint64_t vhashD[4*2] __attribute__ ((aligned (64)));
uint64_t vhashC[4*2] __attribute__ ((aligned (32)));
uint64_t vhashD[4*2] __attribute__ ((aligned (32)));
rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 );
groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
@@ -377,36 +395,72 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_8way_hash( hash, vdata );
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 8; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
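Both scanhash routines above now split the 80-byte block header for Blake-256: the first 64 bytes are hashed once with blake256_transform_le, and only the 16-byte tail plus padding is rebuilt per nonce in block_buf. A minimal scalar sketch of that second-block layout (the helper name is hypothetical; the vector code above broadcasts each word across lanes and varies only word 3):

#include <stdint.h>
#include <string.h>

static void build_second_block( uint32_t block[16], const uint32_t pdata[20],
                                uint32_t nonce )
{
   memset( block, 0, 16 * sizeof(uint32_t) );
   block[ 0] = pdata[16];
   block[ 1] = pdata[17];
   block[ 2] = pdata[18];
   block[ 3] = nonce;          /* the only word that differs per lane */
   block[ 4] = 0x80000000;     /* start of padding */
   block[13] = 1;              /* final-block marker, as block_buf[13] above */
   block[15] = 80 * 8;         /* message length in bits */
}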


@@ -132,11 +132,11 @@ bool register_lyra2z_algo( algo_gate_t* gate )
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
gate->hash = (void*)&lyra2z_16way_hash;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
@@ -175,13 +175,9 @@ bool register_lyra2h_algo( algo_gate_t* gate )
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_16WAY)
gate->miner_thread_init = (void*)&init_allium_16way_ctx;
gate->scanhash = (void*)&scanhash_allium_16way;
gate->hash = (void*)&allium_16way_hash;
#elif defined (ALLIUM_8WAY)
gate->miner_thread_init = (void*)&init_allium_8way_ctx;
gate->scanhash = (void*)&scanhash_allium_8way;
gate->hash = (void*)&allium_8way_hash;
#else
gate->miner_thread_init = (void*)&init_allium_ctx;
gate->scanhash = (void*)&scanhash_allium;


@@ -99,14 +99,14 @@ bool init_lyra2rev2_ctx();
#if defined(LYRA2Z_16WAY)
void lyra2z_16way_hash( void *state, const void *input );
//void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_16way_thread_init();
#elif defined(LYRA2Z_8WAY)
void lyra2z_8way_hash( void *state, const void *input );
//void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_8way_thread_init();
@@ -163,17 +163,13 @@ bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_16WAY)
void allium_16way_hash( void *state, const void *input );
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_16way_ctx();
#elif defined(ALLIUM_8WAY)
void allium_8way_hash( void *state, const void *input );
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_8way_ctx();
#else


@@ -14,42 +14,32 @@ bool lyra2z_16way_thread_init()
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_16way_context l2z_16way_blake_mid;
void lyra2z_16way_midstate( const void* input )
{
blake256_16way_init( &l2z_16way_blake_mid );
blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
}
void lyra2z_16way_hash( void *state, const void *input )
static void lyra2z_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t hash8[8] __attribute__ ((aligned (32)));
uint32_t hash9[8] __attribute__ ((aligned (32)));
uint32_t hash10[8] __attribute__ ((aligned (32)));
uint32_t hash11[8] __attribute__ ((aligned (32)));
uint32_t hash12[8] __attribute__ ((aligned (32)));
uint32_t hash13[8] __attribute__ ((aligned (32)));
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
blake256_16way_update( &ctx_blake, input + (64*16), 16 );
blake256_16way_close( &ctx_blake, vhash );
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
vhash, 256 );
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
@@ -97,40 +87,74 @@ void lyra2z_16way_hash( void *state, const void *input )
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (64))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
if ( bench ) ptarget[7] = 0x0000ff;
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
lyra2z_16way_midstate( vdata );
block_buf[ 4] = m512_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m512_zero;
block_buf[13] = m512_one_32;
block_buf[14] = m512_zero;
block_buf[15] = m512_const1_32( 80*8 );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_16way_hash( hash, vdata );
for ( int lane = 0; lane < 16; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr );
}
block_buf[ 3] = _mm512_add_epi32( block_buf[ 3], sixteen );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
@@ -145,30 +169,20 @@ bool lyra2z_8way_thread_init()
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
static void lyra2z_8way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way_update( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -182,7 +196,6 @@ void lyra2z_8way_hash( void *state, const void *input )
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
@@ -197,43 +210,78 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
if ( bench ) ptarget[7] = 0x0000ff;
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
lyra2z_8way_midstate( vdata );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces and add padding.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
block_buf[ 4] = m256_const1_32( 0x80000000 );
block_buf[ 5] =
block_buf[ 6] =
block_buf[ 7] =
block_buf[ 8] =
block_buf[ 9] =
block_buf[10] =
block_buf[11] =
block_buf[12] = m256_zero;
block_buf[13] = m256_one_32;
block_buf[14] = m256_zero;
block_buf[15] = m256_const1_32( 80*8 );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_8way_hash( hash, vdata );
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 8; lane++ )
{
for ( int lane = 0; lane < 8; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
}
n += 8;
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(LYRA2Z_4WAY)
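A note on the lane indexing used in these scanhash loops: each lane's 256-bit result occupies 32 bytes, so it is addressed as hash+(lane<<3) when hash is declared uint32_t and as hash+(lane<<2) when it is declared uint64_t. A tiny sketch of the equivalence (helper names are illustrative only):

#include <stdint.h>

static inline const uint32_t *lane_hash_32( const uint32_t *hash, int lane )
{
   return hash + ( lane << 3 );   /* 8 x 32-bit words = 32 bytes per lane */
}

static inline const uint64_t *lane_hash_64( const uint64_t *hash, int lane )
{
   return hash + ( lane << 2 );   /* 4 x 64-bit words = 32 bytes per lane */
}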


@@ -261,7 +261,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
// overlap it's unified.
// As a result normal is Nrows-2 / Nrows.
// for 4 rows: 1 unified, 2 overlap, 1 normal.
// for 8 rows: 1 unified, 2 overlap, 56 normal.
// for 8 rows: 1 unified, 2 overlap, 5 normal.
static inline void reducedDuplexRow_2way_normal( uint64_t *State,
uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
@@ -283,6 +283,15 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
io0 = _mm512_load_si512( inout0 );
io1 = _mm512_load_si512( inout0 +1 );
io2 = _mm512_load_si512( inout0 +2 );
io0 = _mm512_mask_load_epi64( io0, 0xf0, inout1 );
io1 = _mm512_mask_load_epi64( io1, 0xf0, inout1 +1 );
io2 = _mm512_mask_load_epi64( io2, 0xf0, inout1 +2 );
/*
io0 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 ),
_mm512_load_si512( (__m512i*)inout1 ) );
@@ -292,6 +301,7 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
io2 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +2 ),
_mm512_load_si512( (__m512i*)inout1 +2 ) );
*/
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
@@ -359,6 +369,15 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
io0.v512 = _mm512_load_si512( inout0 );
io1.v512 = _mm512_load_si512( inout0 +1 );
io2.v512 = _mm512_load_si512( inout0 +2 );
io0.v512 = _mm512_mask_load_epi64( io0.v512, 0xf0, inout1 );
io1.v512 = _mm512_mask_load_epi64( io1.v512, 0xf0, inout1 +1 );
io2.v512 = _mm512_mask_load_epi64( io2.v512, 0xf0, inout1 +2 );
/*
io0.v512 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 ),
_mm512_load_si512( (__m512i*)inout1 ) );
@@ -368,27 +387,12 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
io2.v512 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +2 ),
_mm512_load_si512( (__m512i*)inout1 +2 ) );
*/
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) );
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) );
/*
io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 ),
_mm512_load_si512( (__m512i*)inout1 ) );
io.v512[1] = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +1 ),
_mm512_load_si512( (__m512i*)inout1 +1 ) );
io.v512[2] = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +2 ),
_mm512_load_si512( (__m512i*)inout1 +2 ) );
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
*/
//Applies the reduced-round transformation f to the sponge's state
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
@@ -415,22 +419,6 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] );
}
/*
if ( rowOut == rowInOut0 )
{
io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
io.v512[1] = _mm512_mask_blend_epi64( 0x0f, io.v512[1], out[1] );
io.v512[2] = _mm512_mask_blend_epi64( 0x0f, io.v512[2], out[2] );
}
if ( rowOut == rowInOut1 )
{
io.v512[0] = _mm512_mask_blend_epi64( 0xf0, io.v512[0], out[0] );
io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
}
*/
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
@@ -444,12 +432,23 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
}
/*
casti_m256i( inout0, 0 ) = _mm512_castsi512_si256( io0.v512 );
casti_m256i( inout0, 2 ) = _mm512_castsi512_si256( io1.v512 );
casti_m256i( inout0, 4 ) = _mm512_castsi512_si256( io2.v512 );
_mm512_mask_store_epi64( inout1, 0xf0, io0.v512 );
_mm512_mask_store_epi64( inout1 +1, 0xf0, io1.v512 );
_mm512_mask_store_epi64( inout1 +2, 0xf0, io2.v512 );
*/
casti_m256i( inout0, 0 ) = io0.v256lo;
casti_m256i( inout1, 1 ) = io0.v256hi;
casti_m256i( inout0, 2 ) = io1.v256lo;
casti_m256i( inout1, 3 ) = io1.v256hi;
casti_m256i( inout0, 4 ) = io2.v256lo;
casti_m256i( inout1, 5 ) = io2.v256hi;
/*
_mm512_mask_store_epi64( inout0, 0x0f, io.v512[0] );
_mm512_mask_store_epi64( inout1, 0xf0, io.v512[0] );
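The reducedDuplexRow_2way changes above replace the blend of two full loads with _mm512_mask_load_epi64, so the upper four lanes are filled directly from the second row. A minimal sketch of the pattern (hypothetical helper name; pointers are assumed 64-byte aligned as in the real code):

#include <immintrin.h>

static inline __m512i load_row_pair( const void *row0, const void *row1 )
{
   __m512i v = _mm512_load_si512( row0 );            /* lanes 0..7 from row0 */
   return _mm512_mask_load_epi64( v, 0xf0, row1 );   /* lanes 4..7 overlaid from row1 */
}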


@@ -150,12 +150,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G_2X64( s1, s3, s5, s7 ); \
mm128_vrol256_64( s6, s7 ); \
mm128_vror256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
mm128_vror256_64( s6, s7 ); \
mm128_vrol256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 );
mm128_vrol256_64( s2, s3 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \


@@ -64,14 +64,14 @@ extern void hmq1725_8way_hash(void *state, const void *input)
uint32_t vhashA[16<<3] __attribute__ ((aligned (64)));
uint32_t vhashB[16<<3] __attribute__ ((aligned (64)));
uint32_t vhashC[16<<3] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (64)));
uint32_t hash1 [16] __attribute__ ((aligned (64)));
uint32_t hash2 [16] __attribute__ ((aligned (64)));
uint32_t hash3 [16] __attribute__ ((aligned (64)));
uint32_t hash4 [16] __attribute__ ((aligned (64)));
uint32_t hash5 [16] __attribute__ ((aligned (64)));
uint32_t hash6 [16] __attribute__ ((aligned (64)));
uint32_t hash7 [16] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (32)));
uint32_t hash1 [16] __attribute__ ((aligned (32)));
uint32_t hash2 [16] __attribute__ ((aligned (32)));
uint32_t hash3 [16] __attribute__ ((aligned (32)));
uint32_t hash4 [16] __attribute__ ((aligned (32)));
uint32_t hash5 [16] __attribute__ ((aligned (32)));
uint32_t hash6 [16] __attribute__ ((aligned (32)));
uint32_t hash7 [16] __attribute__ ((aligned (32)));
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
__mmask8 vh_mask;
const __m512i vmask = m512_const1_64( 24 );
@@ -639,13 +639,13 @@ typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
extern void hmq1725_4way_hash(void *state, const void *input)
{
uint32_t hash0 [16] __attribute__ ((aligned (64)));
uint32_t hash1 [16] __attribute__ ((aligned (64)));
uint32_t hash2 [16] __attribute__ ((aligned (64)));
uint32_t hash3 [16] __attribute__ ((aligned (64)));
uint32_t vhash [16<<2] __attribute__ ((aligned (64)));
uint32_t vhashA[16<<2] __attribute__ ((aligned (64)));
uint32_t vhashB[16<<2] __attribute__ ((aligned (64)));
uint32_t hash0 [16] __attribute__ ((aligned (32)));
uint32_t hash1 [16] __attribute__ ((aligned (32)));
uint32_t hash2 [16] __attribute__ ((aligned (32)));
uint32_t hash3 [16] __attribute__ ((aligned (32)));
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
__m256i vh_mask;
int h_mask;


@@ -35,6 +35,7 @@
#include "sph_ripemd.h"
#if 0
/*
* Round functions for RIPEMD (original).
*/
@@ -46,6 +47,7 @@ static const sph_u32 oIV[5] = {
SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
SPH_C32(0x98BADCFE), SPH_C32(0x10325476)
};
#endif
/*
* Round functions for RIPEMD-128 and RIPEMD-160.
@@ -63,6 +65,8 @@ static const sph_u32 IV[5] = {
#define ROTL SPH_ROTL32
#if 0
/* ===================================================================== */
/*
* RIPEMD (original hash, deprecated).
@@ -539,6 +543,8 @@ sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4])
#undef RIPEMD128_IN
}
#endif
/* ===================================================================== */
/*
* RIPEMD-160.


@@ -84,6 +84,7 @@
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
#if 0
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
@@ -204,6 +205,8 @@ void sph_ripemd128_close(void *cc, void *dst);
*/
void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]);
#endif
/* ===================================================================== */
/**


@@ -34,6 +34,7 @@
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#include <mm_malloc.h>
#include "malloc-huge.h"
static const uint32_t keypad[12] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
@@ -1487,11 +1488,19 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
bool scrypt_miner_thread_init( int thr_id )
{
scratchbuf = _mm_malloc( scratchbuf_size, 128 );
scratchbuf = malloc_hugepages( scratchbuf_size );
if ( scratchbuf )
return true;
{
if ( opt_debug )
applog( LOG_NOTICE, "Thread %u is using huge pages", thr_id );
}
else
scratchbuf = _mm_malloc( scratchbuf_size, 128 );
if ( scratchbuf ) return true;
applog( LOG_ERR, "Thread %u: Scrypt buffer allocation failed", thr_id );
return false;
return false;
}
bool register_scrypt_algo( algo_gate_t* gate )
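malloc_hugepages() comes from the malloc-huge.c file added to the build in this change set; its implementation is not shown in this hunk. A minimal Linux-only sketch of the general technique it stands in for, with the same ordinary-allocation fallback as the scrypt code above (an assumption, the real helper may differ):

#include <stdlib.h>
#include <sys/mman.h>

static void *alloc_huge_or_fallback( size_t size )
{
   void *p = mmap( NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0 );
   if ( p != MAP_FAILED )
      return p;                  /* backed by explicit huge pages */
   if ( posix_memalign( &p, 128, size ) == 0 )
      return p;                  /* fallback, 128-byte aligned like _mm_malloc above */
   return NULL;
}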


@@ -62,8 +62,8 @@ extern "C"{
#if defined(__AVX2__)
#define DECL_STATE8 \
__m256i A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
__m256i A0, A1, A2, A3, A4, A5, A6, A7, \
A8, A9, AA, AB; \
__m256i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m256i C0, C1, C2, C3, C4, C5, C6, C7, \
@@ -78,18 +78,18 @@ extern "C"{
{ \
if ( (state)->state_loaded ) \
{ \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
A0 = (state)->A[0]; \
A1 = (state)->A[1]; \
A2 = (state)->A[2]; \
A3 = (state)->A[3]; \
A4 = (state)->A[4]; \
A5 = (state)->A[5]; \
A6 = (state)->A[6]; \
A7 = (state)->A[7]; \
A8 = (state)->A[8]; \
A9 = (state)->A[9]; \
AA = (state)->A[10]; \
AB = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
@@ -126,18 +126,18 @@ extern "C"{
else \
{ \
(state)->state_loaded = true; \
A00 = m256_const1_64( 0x20728DFD20728DFD ); \
A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m256_const1_64( 0xE782B699E782B699 ); \
A03 = m256_const1_64( 0x5530463255304632 ); \
A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m256_const1_64( 0x8BD144108BD14410 ); \
A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
A0 = m256_const1_64( 0x20728DFD20728DFD ); \
A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A2 = m256_const1_64( 0xE782B699E782B699 ); \
A3 = m256_const1_64( 0x5530463255304632 ); \
A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A9 = m256_const1_64( 0x8BD144108BD14410 ); \
AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m256_const1_64( 0x07B385F307B385F3 ); \
B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
@@ -176,18 +176,18 @@ extern "C"{
} while (0)
#define WRITE_STATE8(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->A[0] = A0; \
(state)->A[1] = A1; \
(state)->A[2] = A2; \
(state)->A[3] = A3; \
(state)->A[4] = A4; \
(state)->A[5] = A5; \
(state)->A[6] = A6; \
(state)->A[7] = A7; \
(state)->A[8] = A8; \
(state)->A[9] = A9; \
(state)->A[10] = AA; \
(state)->A[11] = AB; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
@@ -286,8 +286,8 @@ do { \
#define XOR_W8 \
do { \
A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \
A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \
A0 = _mm256_xor_si256( A0, _mm256_set1_epi32( Wlow ) ); \
A1 = _mm256_xor_si256( A1, _mm256_set1_epi32( Whigh ) ); \
} while (0)
#define SWAP_BC8 \
@@ -321,60 +321,60 @@ do { \
} while (0)
#define PERM_STEP_0_8 do { \
PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \
PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1_8 do { \
PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \
PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2_8 do { \
PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \
PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P8 \
@@ -398,42 +398,42 @@ do { \
PERM_STEP_0_8; \
PERM_STEP_1_8; \
PERM_STEP_2_8; \
A0B = _mm256_add_epi32( A0B, C6 ); \
A0A = _mm256_add_epi32( A0A, C5 ); \
A09 = _mm256_add_epi32( A09, C4 ); \
A08 = _mm256_add_epi32( A08, C3 ); \
A07 = _mm256_add_epi32( A07, C2 ); \
A06 = _mm256_add_epi32( A06, C1 ); \
A05 = _mm256_add_epi32( A05, C0 ); \
A04 = _mm256_add_epi32( A04, CF ); \
A03 = _mm256_add_epi32( A03, CE ); \
A02 = _mm256_add_epi32( A02, CD ); \
A01 = _mm256_add_epi32( A01, CC ); \
A00 = _mm256_add_epi32( A00, CB ); \
A0B = _mm256_add_epi32( A0B, CA ); \
A0A = _mm256_add_epi32( A0A, C9 ); \
A09 = _mm256_add_epi32( A09, C8 ); \
A08 = _mm256_add_epi32( A08, C7 ); \
A07 = _mm256_add_epi32( A07, C6 ); \
A06 = _mm256_add_epi32( A06, C5 ); \
A05 = _mm256_add_epi32( A05, C4 ); \
A04 = _mm256_add_epi32( A04, C3 ); \
A03 = _mm256_add_epi32( A03, C2 ); \
A02 = _mm256_add_epi32( A02, C1 ); \
A01 = _mm256_add_epi32( A01, C0 ); \
A00 = _mm256_add_epi32( A00, CF ); \
A0B = _mm256_add_epi32( A0B, CE ); \
A0A = _mm256_add_epi32( A0A, CD ); \
A09 = _mm256_add_epi32( A09, CC ); \
A08 = _mm256_add_epi32( A08, CB ); \
A07 = _mm256_add_epi32( A07, CA ); \
A06 = _mm256_add_epi32( A06, C9 ); \
A05 = _mm256_add_epi32( A05, C8 ); \
A04 = _mm256_add_epi32( A04, C7 ); \
A03 = _mm256_add_epi32( A03, C6 ); \
A02 = _mm256_add_epi32( A02, C5 ); \
A01 = _mm256_add_epi32( A01, C4 ); \
A00 = _mm256_add_epi32( A00, C3 ); \
AB = _mm256_add_epi32( AB, C6 ); \
AA = _mm256_add_epi32( AA, C5 ); \
A9 = _mm256_add_epi32( A9, C4 ); \
A8 = _mm256_add_epi32( A8, C3 ); \
A7 = _mm256_add_epi32( A7, C2 ); \
A6 = _mm256_add_epi32( A6, C1 ); \
A5 = _mm256_add_epi32( A5, C0 ); \
A4 = _mm256_add_epi32( A4, CF ); \
A3 = _mm256_add_epi32( A3, CE ); \
A2 = _mm256_add_epi32( A2, CD ); \
A1 = _mm256_add_epi32( A1, CC ); \
A0 = _mm256_add_epi32( A0, CB ); \
AB = _mm256_add_epi32( AB, CA ); \
AA = _mm256_add_epi32( AA, C9 ); \
A9 = _mm256_add_epi32( A9, C8 ); \
A8 = _mm256_add_epi32( A8, C7 ); \
A7 = _mm256_add_epi32( A7, C6 ); \
A6 = _mm256_add_epi32( A6, C5 ); \
A5 = _mm256_add_epi32( A5, C4 ); \
A4 = _mm256_add_epi32( A4, C3 ); \
A3 = _mm256_add_epi32( A3, C2 ); \
A2 = _mm256_add_epi32( A2, C1 ); \
A1 = _mm256_add_epi32( A1, C0 ); \
A0 = _mm256_add_epi32( A0, CF ); \
AB = _mm256_add_epi32( AB, CE ); \
AA = _mm256_add_epi32( AA, CD ); \
A9 = _mm256_add_epi32( A9, CC ); \
A8 = _mm256_add_epi32( A8, CB ); \
A7 = _mm256_add_epi32( A7, CA ); \
A6 = _mm256_add_epi32( A6, C9 ); \
A5 = _mm256_add_epi32( A5, C8 ); \
A4 = _mm256_add_epi32( A4, C7 ); \
A3 = _mm256_add_epi32( A3, C6 ); \
A2 = _mm256_add_epi32( A2, C5 ); \
A1 = _mm256_add_epi32( A1, C4 ); \
A0 = _mm256_add_epi32( A0, C3 ); \
} while (0)
#define INCR_W8 do { \
@@ -660,8 +660,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#define DECL_STATE \
__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
A8, A9, AA, AB; \
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
@@ -676,18 +676,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{ \
if ( (state)->state_loaded ) \
{ \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
A0 = (state)->A[0]; \
A1 = (state)->A[1]; \
A2 = (state)->A[2]; \
A3 = (state)->A[3]; \
A4 = (state)->A[4]; \
A5 = (state)->A[5]; \
A6 = (state)->A[6]; \
A7 = (state)->A[7]; \
A8 = (state)->A[8]; \
A9 = (state)->A[9]; \
AA = (state)->A[10]; \
AB = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
@@ -724,18 +724,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
else \
{ \
(state)->state_loaded = true; \
A00 = m128_const1_64( 0x20728DFD20728DFD ); \
A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m128_const1_64( 0xE782B699E782B699 ); \
A03 = m128_const1_64( 0x5530463255304632 ); \
A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m128_const1_64( 0x8BD144108BD14410 ); \
A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
A0 = m128_const1_64( 0x20728DFD20728DFD ); \
A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
A2 = m128_const1_64( 0xE782B699E782B699 ); \
A3 = m128_const1_64( 0x5530463255304632 ); \
A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
A9 = m128_const1_64( 0x8BD144108BD14410 ); \
AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m128_const1_64( 0x07B385F307B385F3 ); \
B2 = m128_const1_64( 0xE7442C26E7442C26 ); \
@@ -774,18 +774,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
} while (0)
#define WRITE_STATE(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->A[0] = A0; \
(state)->A[1] = A1; \
(state)->A[2] = A2; \
(state)->A[3] = A3; \
(state)->A[4] = A4; \
(state)->A[5] = A5; \
(state)->A[6] = A6; \
(state)->A[7] = A7; \
(state)->A[8] = A8; \
(state)->A[9] = A9; \
(state)->A[10] = AA; \
(state)->A[11] = AB; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
@@ -884,8 +884,8 @@ do { \
#define XOR_W \
do { \
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
A0 = _mm_xor_si128( A0, _mm_set1_epi32( Wlow ) ); \
A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
} while (0)
@@ -940,60 +940,60 @@ do { \
} while (0)
#define PERM_STEP_0 do { \
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A2, A1, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A3, A2, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A4, A3, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A5, A4, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A6, A5, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A7, A6, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A8, A7, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A9, A8, B9, B6, B2, BF, CF, M9); \
PERM_ELT(AA, A9, BA, B7, B3, B0, CE, MA); \
PERM_ELT(AB, AA, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A0, AB, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A1, A0, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A2, A1, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A3, A2, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1 do { \
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
PERM_ELT(A4, A3, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A5, A4, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A6, A5, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A7, A6, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A8, A7, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A9, A8, B5, B2, BE, BB, C3, M5); \
PERM_ELT(AA, A9, B6, B3, BF, BC, C2, M6); \
PERM_ELT(AB, AA, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A0, AB, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A1, A0, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A2, A1, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A3, A2, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A4, A3, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A5, A4, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A6, A5, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A7, A6, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2 do { \
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
PERM_ELT(A8, A7, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A9, A8, B1, BE, BA, B7, C7, M1); \
PERM_ELT(AA, A9, B2, BF, BB, B8, C6, M2); \
PERM_ELT(AB, AA, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A0, AB, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A1, A0, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A2, A1, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A3, A2, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A4, A3, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A5, A4, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A6, A5, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A7, A6, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A8, A7, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A9, A8, BD, BA, B6, B3, CB, MD); \
PERM_ELT(AA, A9, BE, BB, B7, B4, CA, ME); \
PERM_ELT(AB, AA, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P \
@@ -1017,42 +1017,42 @@ do { \
PERM_STEP_0; \
PERM_STEP_1; \
PERM_STEP_2; \
A0B = _mm_add_epi32( A0B, C6 ); \
A0A = _mm_add_epi32( A0A, C5 ); \
A09 = _mm_add_epi32( A09, C4 ); \
A08 = _mm_add_epi32( A08, C3 ); \
A07 = _mm_add_epi32( A07, C2 ); \
A06 = _mm_add_epi32( A06, C1 ); \
A05 = _mm_add_epi32( A05, C0 ); \
A04 = _mm_add_epi32( A04, CF ); \
A03 = _mm_add_epi32( A03, CE ); \
A02 = _mm_add_epi32( A02, CD ); \
A01 = _mm_add_epi32( A01, CC ); \
A00 = _mm_add_epi32( A00, CB ); \
A0B = _mm_add_epi32( A0B, CA ); \
A0A = _mm_add_epi32( A0A, C9 ); \
A09 = _mm_add_epi32( A09, C8 ); \
A08 = _mm_add_epi32( A08, C7 ); \
A07 = _mm_add_epi32( A07, C6 ); \
A06 = _mm_add_epi32( A06, C5 ); \
A05 = _mm_add_epi32( A05, C4 ); \
A04 = _mm_add_epi32( A04, C3 ); \
A03 = _mm_add_epi32( A03, C2 ); \
A02 = _mm_add_epi32( A02, C1 ); \
A01 = _mm_add_epi32( A01, C0 ); \
A00 = _mm_add_epi32( A00, CF ); \
A0B = _mm_add_epi32( A0B, CE ); \
A0A = _mm_add_epi32( A0A, CD ); \
A09 = _mm_add_epi32( A09, CC ); \
A08 = _mm_add_epi32( A08, CB ); \
A07 = _mm_add_epi32( A07, CA ); \
A06 = _mm_add_epi32( A06, C9 ); \
A05 = _mm_add_epi32( A05, C8 ); \
A04 = _mm_add_epi32( A04, C7 ); \
A03 = _mm_add_epi32( A03, C6 ); \
A02 = _mm_add_epi32( A02, C5 ); \
A01 = _mm_add_epi32( A01, C4 ); \
A00 = _mm_add_epi32( A00, C3 ); \
AB = _mm_add_epi32( AB, C6 ); \
AA = _mm_add_epi32( AA, C5 ); \
A9 = _mm_add_epi32( A9, C4 ); \
A8 = _mm_add_epi32( A8, C3 ); \
A7 = _mm_add_epi32( A7, C2 ); \
A6 = _mm_add_epi32( A6, C1 ); \
A5 = _mm_add_epi32( A5, C0 ); \
A4 = _mm_add_epi32( A4, CF ); \
A3 = _mm_add_epi32( A3, CE ); \
A2 = _mm_add_epi32( A2, CD ); \
A1 = _mm_add_epi32( A1, CC ); \
A0 = _mm_add_epi32( A0, CB ); \
AB = _mm_add_epi32( AB, CA ); \
AA = _mm_add_epi32( AA, C9 ); \
A9 = _mm_add_epi32( A9, C8 ); \
A8 = _mm_add_epi32( A8, C7 ); \
A7 = _mm_add_epi32( A7, C6 ); \
A6 = _mm_add_epi32( A6, C5 ); \
A5 = _mm_add_epi32( A5, C4 ); \
A4 = _mm_add_epi32( A4, C3 ); \
A3 = _mm_add_epi32( A3, C2 ); \
A2 = _mm_add_epi32( A2, C1 ); \
A1 = _mm_add_epi32( A1, C0 ); \
A0 = _mm_add_epi32( A0, CF ); \
AB = _mm_add_epi32( AB, CE ); \
AA = _mm_add_epi32( AA, CD ); \
A9 = _mm_add_epi32( A9, CC ); \
A8 = _mm_add_epi32( A8, CB ); \
A7 = _mm_add_epi32( A7, CA ); \
A6 = _mm_add_epi32( A6, C9 ); \
A5 = _mm_add_epi32( A5, C8 ); \
A4 = _mm_add_epi32( A4, C7 ); \
A3 = _mm_add_epi32( A3, C6 ); \
A2 = _mm_add_epi32( A2, C5 ); \
A1 = _mm_add_epi32( A1, C4 ); \
A0 = _mm_add_epi32( A0, C3 ); \
} while (0)
#define INCR_W do { \

View File

@@ -18,10 +18,13 @@ static const uint32_t IV512[] =
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
/*
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_shuflr128_32( a ), \
mm256_shuflr128_32( b ), 0x88 )
*/
//#define mm256_ror2x256hi_1x32( a, b ) _mm256_alignr_epi8( b, a, 4 )
#if defined(__VAES__)
@@ -127,24 +130,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 2, 6, 10
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ), zero );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p2 = _mm256_xor_si256( p2, x );
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ), zero );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p0 = _mm256_xor_si256( p0, x );
@@ -183,24 +186,24 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 4, 8, 12
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
k00 = _mm256_xor_si256( k00, _mm256_alignr_epi8( k13, k12, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
k01 = _mm256_xor_si256( k01, _mm256_alignr_epi8( k00, k13, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
k02 = _mm256_xor_si256( k02, _mm256_alignr_epi8( k01, k00, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
k03 = _mm256_xor_si256( k03, _mm256_alignr_epi8( k02, k01, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p0 = _mm256_xor_si256( p0, x );
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
k10 = _mm256_xor_si256( k10, _mm256_alignr_epi8( k03, k02, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ), zero );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
k11 = _mm256_xor_si256( k11, _mm256_alignr_epi8( k10, k03, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
k12 = _mm256_xor_si256( k12, _mm256_alignr_epi8( k11, k10, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
k13 = _mm256_xor_si256( k13, _mm256_alignr_epi8( k12, k11, 4 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p2 = _mm256_xor_si256( p2, x );

View File

@@ -11,10 +11,6 @@ static const uint32_t IV512[] =
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
#define mm512_ror2x512hi_1x32( a, b ) \
_mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \
mm512_shuflr128_32( b ) )
static void
c512_4way( shavite512_4way_context *ctx, const void *msg )
{
@@ -106,24 +102,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
// round 2, 6, 10
K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
@@ -162,24 +158,24 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
// round 4, 8, 12
K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );

View File

@@ -59,30 +59,6 @@ static const sph_u32 IV512[] = {
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
};
// Partially rotate elements in two 128 bit vectors a & b as one 256 bit vector
// and return the rotated 128 bit vector a.
// a[3:0] = { b[0], a[3], a[2], a[1] }
#if defined(__SSSE3__)
#define mm128_ror256hi_1x32( a, b ) _mm_alignr_epi8( b, a, 4 )
#else // SSE2
#define mm128_ror256hi_1x32( a, b ) \
_mm_or_si128( _mm_srli_si128( a, 4 ), \
_mm_slli_si128( b, 12 ) )
#endif
/*
#if defined(__AVX2__)
// 2 way version of above
// a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] }
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror256_1x32( a ), \
mm256_rol256_3x32( b ), 0x88 )
#endif
*/
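The removed helper and its replacement are equivalent: viewed as 32-bit lanes, _mm_alignr_epi8( b, a, 4 ) produces { b[0], a[3], a[2], a[1] }, exactly as the comment above states. A small standalone check (not part of the miner, compile with -mssse3) comparing the SSSE3 form against the SSE2 fallback:

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>   // SSSE3

int main(void)
{
    // a = { a3, a2, a1, a0 }, b = { b3, b2, b1, b0 } as 32-bit lanes
    __m128i a = _mm_set_epi32( 0xa3, 0xa2, 0xa1, 0xa0 );
    __m128i b = _mm_set_epi32( 0xb3, 0xb2, 0xb1, 0xb0 );
    __m128i r1 = _mm_alignr_epi8( b, a, 4 );                  // SSSE3 form
    __m128i r2 = _mm_or_si128( _mm_srli_si128( a, 4 ),        // SSE2 fallback
                               _mm_slli_si128( b, 12 ) );
    uint32_t o1[4], o2[4];
    _mm_storeu_si128( (__m128i*)o1, r1 );
    _mm_storeu_si128( (__m128i*)o2, r2 );
    // both print: b0 a3 a2 a1 (high lane to low lane)
    printf( "%x %x %x %x\n", o1[3], o1[2], o1[1], o1[0] );
    printf( "%x %x %x %x\n", o2[3], o2[2], o2[1], o2[0] );
    return 0;
}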
static void
c512( sph_shavite_big_context *sc, const void *msg )
@@ -190,31 +166,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
@@ -262,31 +238,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );

View File

@@ -35,7 +35,7 @@
#include "sph_shavite.h"
#if !defined(__AES__)
#if !(defined(__AES__) && defined(__SSSE3__))
#ifdef __cplusplus
extern "C"{

View File

@@ -263,7 +263,7 @@ void sph_shavite384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
// Don't call these directly from application code, use the macros below.
#ifdef __AES__
#if defined(__AES__) && defined(__SSSE3__)
void sph_shavite512_aesni_init(void *cc);
void sph_shavite512_aesni(void *cc, const void *data, size_t len);

View File

@@ -74,7 +74,7 @@ typedef struct {
void sm3_init(sm3_ctx_t *ctx);
void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len);
void sm3_final(sm3_ctx_t *ctx, unsigned char digest[SM3_DIGEST_LENGTH]);
void sm3_final(sm3_ctx_t *ctx, unsigned char *digest);
void sm3_compress(uint32_t digest[8], const unsigned char block[SM3_BLOCK_SIZE]);
void sm3(const unsigned char *data, size_t datalen,
unsigned char digest[SM3_DIGEST_LENGTH]);

View File

@@ -10,6 +10,7 @@
#include "algo-gate-api.h"
#include "Verthash.h"
#include "mm_malloc.h"
#include "malloc-huge.h"
//-----------------------------------------------------------------------------
// Verthash info management
@@ -84,10 +85,17 @@ int verthash_info_init(verthash_info_t* info, const char* file_name)
}
// Allocate data
info->data = (uint8_t *)_mm_malloc( fileSize, 64 );
if (!info->data)
info->data = (uint8_t *)malloc_hugepages( fileSize );
if ( info->data )
{
fclose(fileMiningData);
if ( !opt_quiet ) applog( LOG_INFO, "Verthash data is using huge pages");
}
else
info->data = (uint8_t *)_mm_malloc( fileSize, 64 );
if ( !info->data )
{
fclose( fileMiningData );
// Memory allocation fatal error.
return 2;
}
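The malloc_hugepages() helper comes from the new malloc-huge.c, which is not shown in this diff. As a rough idea of what such an allocator looks like on Linux, a minimal sketch (names and details are assumptions, illustrative only):

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

#define HUGEPAGE_2M  (2UL * 1024 * 1024)       // assume 2 MiB huge pages

// Returns NULL on failure so the caller can fall back to _mm_malloc().
void *malloc_hugepages_sketch( size_t size )
{
    size = ( size + HUGEPAGE_2M - 1 ) & ~( HUGEPAGE_2M - 1 );   // round up
    void *p = mmap( NULL, size, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0 );
    return p == MAP_FAILED ? NULL : p;
}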

View File

@@ -29,16 +29,11 @@ void sha3_4way_keccakf( __m256i st[25] )
for ( r = 0; r < KECCAKF_ROUNDS; r++ )
{
// Theta
bc[0] = _mm256_xor_si256( st[0],
mm256_xor4( st[5], st[10], st[15], st[20] ) );
bc[1] = _mm256_xor_si256( st[1],
mm256_xor4( st[6], st[11], st[16], st[21] ) );
bc[2] = _mm256_xor_si256( st[2],
mm256_xor4( st[7], st[12], st[17], st[22] ) );
bc[3] = _mm256_xor_si256( st[3],
mm256_xor4( st[8], st[13], st[18], st[23] ) );
bc[4] = _mm256_xor_si256( st[4],
mm256_xor4( st[9], st[14], st[19], st[24] ) );
bc[0] = mm256_xor3( st[0], st[5], mm256_xor3( st[10], st[15], st[20] ) );
bc[1] = mm256_xor3( st[1], st[6], mm256_xor3( st[11], st[16], st[21] ) );
bc[2] = mm256_xor3( st[2], st[7], mm256_xor3( st[12], st[17], st[22] ) );
bc[3] = mm256_xor3( st[3], st[8], mm256_xor3( st[13], st[18], st[23] ) );
bc[4] = mm256_xor3( st[4], st[9], mm256_xor3( st[14], st[19], st[24] ) );
for ( i = 0; i < 5; i++ )
{
@@ -89,17 +84,13 @@ void sha3_4way_keccakf( __m256i st[25] )
// Chi
for ( j = 0; j < 25; j += 5 )
{
memcpy( bc, &st[ j ], 5*32 );
st[ j ] = _mm256_xor_si256( st[ j ],
_mm256_andnot_si256( bc[1], bc[2] ) );
st[ j+1 ] = _mm256_xor_si256( st[ j+1 ],
_mm256_andnot_si256( bc[2], bc[3] ) );
st[ j+2 ] = _mm256_xor_si256( st[ j+2 ],
_mm256_andnot_si256( bc[3], bc[4] ) );
st[ j+3 ] = _mm256_xor_si256( st[ j+3 ],
_mm256_andnot_si256( bc[4], bc[0] ) );
st[ j+4 ] = _mm256_xor_si256( st[ j+4 ],
_mm256_andnot_si256( bc[0], bc[1] ) );
bc[0] = st[j];
bc[1] = st[j+1];
st[ j ] = mm256_xorandnot( st[ j ], st[j+1], st[j+2] );
st[ j+1 ] = mm256_xorandnot( st[ j+1 ], st[j+2], st[j+3] );
st[ j+2 ] = mm256_xorandnot( st[ j+2 ], st[j+3], st[j+4] );
st[ j+3 ] = mm256_xorandnot( st[ j+3 ], st[j+4], bc[0] );
st[ j+4 ] = mm256_xorandnot( st[ j+4 ], bc[0], bc[1] );
}
// Iota
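The mm256_xor3 and mm256_xorandnot helpers referenced here live in simd-utils and are not part of this hunk; their required behaviour follows directly from the lines they replace. A minimal sketch of equivalent definitions (AVX512VL builds could collapse each into a single vpternlogq, but that is an assumption, not shown here):

#include <immintrin.h>   // compile with -mavx2

// a ^ b ^ c
static inline __m256i mm256_xor3_sketch( __m256i a, __m256i b, __m256i c )
{
    return _mm256_xor_si256( a, _mm256_xor_si256( b, c ) );
}

// a ^ ( ~b & c ), the Keccak Chi step for one state word
static inline __m256i mm256_xorandnot_sketch( __m256i a, __m256i b, __m256i c )
{
    return _mm256_xor_si256( a, _mm256_andnot_si256( b, c ) );
}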

View File

@@ -127,7 +127,7 @@ bool register_verthash_algo( algo_gate_t* gate )
{
opt_target_factor = 256.0;
gate->scanhash = (void*)&scanhash_verthash;
gate->optimizations = AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT;
const char *verthash_data_file = opt_data_file ? opt_data_file
: default_verthash_data_file;

View File

@@ -16,7 +16,8 @@
#if defined (X16R_8WAY)
// Perform midstate prehash of hash functions with block size <= 72 bytes.
// Perform midstate prehash of hash functions with block size <= 72 bytes,
// 76 bytes for hash functions that operate on 32 bit data.
void x16r_8way_prehash( void *vdata, void *pdata )
{
@@ -44,18 +45,36 @@ void x16r_8way_prehash( void *vdata, void *pdata )
skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
break;
case LUFFA:
{
hashState_luffa ctx_luffa;
mm128_bswap32_80( edata, pdata );
intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
luffa_4way_init( &x16r_ctx.luffa, 512 );
luffa_4way_update( &x16r_ctx.luffa, vdata2, 64 );
rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
init_luffa( &ctx_luffa, 512 );
update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
intrlv_4x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
ctx_luffa.buffer, ctx_luffa.buffer, ctx_luffa.buffer, 512 );
intrlv_4x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
ctx_luffa.chainv, ctx_luffa.chainv, ctx_luffa.chainv, 1280 );
x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
}
break;
case CUBEHASH:
{
cubehashParam ctx_cube;
mm128_bswap32_80( edata, pdata );
intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
cube_4way_init( &x16r_ctx.cube, 512, 16, 32 );
cube_4way_update( &x16r_ctx.cube, vdata2, 64 );
rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
cubehashInit( &ctx_cube, 512, 16, 32 );
cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
x16r_ctx.cube.hashlen = ctx_cube.hashlen;
x16r_ctx.cube.rounds = ctx_cube.rounds;
x16r_ctx.cube.blocksize = ctx_cube.blocksize;
x16r_ctx.cube.pos = ctx_cube.pos;
intrlv_4x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, ctx_cube.x,
ctx_cube.x, 1024 );
}
break;
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
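The Luffa and Cubehash cases above now run the prehash once with the scalar implementation and interleave the resulting midstate into the parallel context, instead of hashing identical data in every lane. The underlying midstate idea is that the first 64 (or 76) header bytes are fixed for a job, so only the nonce-bearing tail is reprocessed per nonce. A self-contained toy illustration of that pattern (a stand-in accumulator replaces the real hash functions):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint64_t acc; } toy_ctx_t;

static void toy_init( toy_ctx_t *c ) { c->acc = 0xcbf29ce484222325ULL; }
static void toy_update( toy_ctx_t *c, const uint8_t *d, size_t n )
{  while ( n-- ) c->acc = ( c->acc ^ *d++ ) * 0x100000001b3ULL;  }

int main(void)
{
    uint8_t header[80] = {0};            // 76 fixed bytes + 4 byte nonce
    toy_ctx_t midstate;
    toy_init( &midstate );
    toy_update( &midstate, header, 64 ); // constant part, hashed once per job

    for ( uint32_t nonce = 0; nonce < 3; nonce++ )
    {
        memcpy( header + 76, &nonce, 4 );
        toy_ctx_t c = midstate;              // resume from the saved midstate
        toy_update( &c, header + 64, 16 );   // only the tail per nonce
        printf( "nonce %u -> %016llx\n", nonce, (unsigned long long)c.acc );
    }
    return 0;
}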
@@ -94,14 +113,14 @@ void x16r_8way_prehash( void *vdata, void *pdata )
int x16r_8way_hash_generic( void* output, const void* input, int thrid )
{
uint32_t vhash[20*8] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t hash4[20] __attribute__ ((aligned (64)));
uint32_t hash5[20] __attribute__ ((aligned (64)));
uint32_t hash6[20] __attribute__ ((aligned (64)));
uint32_t hash7[20] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (16)));
uint32_t hash1[20] __attribute__ ((aligned (16)));
uint32_t hash2[20] __attribute__ ((aligned (16)));
uint32_t hash3[20] __attribute__ ((aligned (16)));
uint32_t hash4[20] __attribute__ ((aligned (16)));
uint32_t hash5[20] __attribute__ ((aligned (16)));
uint32_t hash6[20] __attribute__ ((aligned (16)));
uint32_t hash7[20] __attribute__ ((aligned (16)));
x16r_8way_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -476,7 +495,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -500,7 +519,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}
x16r_8way_prehash( vdata, pdata );
@@ -552,18 +571,33 @@ void x16r_4way_prehash( void *vdata, void *pdata )
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
break;
case LUFFA:
{
hashState_luffa ctx_luffa;
mm128_bswap32_80( edata, pdata );
intrlv_2x128( vdata2, edata, edata, 640 );
luffa_2way_init( &x16r_ctx.luffa, 512 );
luffa_2way_update( &x16r_ctx.luffa, vdata2, 64 );
rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
break;
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
init_luffa( &ctx_luffa, 512 );
update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 );
intrlv_2x128( x16r_ctx.luffa.buffer, ctx_luffa.buffer,
ctx_luffa.buffer, 512 );
intrlv_2x128( x16r_ctx.luffa.chainv, ctx_luffa.chainv,
ctx_luffa.chainv, 1280 );
x16r_ctx.luffa.hashbitlen = ctx_luffa.hashbitlen;
x16r_ctx.luffa.rembytes = ctx_luffa.rembytes;
}
break;
case CUBEHASH:
{
cubehashParam ctx_cube;
mm128_bswap32_80( edata, pdata );
intrlv_2x128( vdata2, edata, edata, 640 );
cube_2way_init( &x16r_ctx.cube, 512, 16, 32 );
cube_2way_update( &x16r_ctx.cube, vdata2, 64 );
rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
cubehashInit( &ctx_cube, 512, 16, 32 );
cubehashUpdate( &ctx_cube, (const byte*)edata, 64 );
x16r_ctx.cube.hashlen = ctx_cube.hashlen;
x16r_ctx.cube.rounds = ctx_cube.rounds;
x16r_ctx.cube.blocksize = ctx_cube.blocksize;
x16r_ctx.cube.pos = ctx_cube.pos;
intrlv_2x128( x16r_ctx.cube.h, ctx_cube.x, ctx_cube.x, 1024 );
}
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
@@ -596,10 +630,10 @@ void x16r_4way_prehash( void *vdata, void *pdata )
int x16r_4way_hash_generic( void* output, const void* input, int thrid )
{
uint32_t vhash[20*4] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
uint32_t hash2[20] __attribute__ ((aligned (32)));
uint32_t hash3[20] __attribute__ ((aligned (32)));
x16r_4way_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -890,7 +924,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -913,7 +947,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}
x16r_4way_prehash( vdata, pdata );

View File

@@ -24,15 +24,15 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( ntime, &timeHash );
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
x16r_hash_order, ntime, timeHash );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}
x16r_8way_prehash( vdata, pdata );
@@ -78,15 +78,15 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( ntime, &timeHash );
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
x16r_hash_order, ntime, timeHash );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}
x16r_4way_prehash( vdata, pdata );

View File

@@ -20,15 +20,15 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
mm128_bswap32_80( edata, pdata );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = swab32( pdata[17] );
if ( s_ntime != ntime )
uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( ntime, &timeHash );
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = ntime;
s_ntime = masked_ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
x16r_hash_order, ntime, timeHash );
x16r_hash_order, swab32( pdata[17] ), timeHash );
}
x16r_prehash( edata, pdata );
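All three variants now key the cached hash order on ntime with the low 7 bits cleared, so the order is recomputed at most once per 128-second window instead of on every ntime change. The caching pattern, reduced to a sketch:

#include <stdint.h>
#include <stdbool.h>

static uint32_t s_masked_ntime = UINT32_MAX;

// Returns true when the hash order must be recomputed.
// ntime is the header timestamp already converted from big endian.
static bool order_needs_update( uint32_t ntime )
{
    uint32_t masked = ntime & 0xffffff80;   // 128 second granularity
    if ( masked == s_masked_ntime ) return false;
    s_masked_ntime = masked;
    return true;
}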

View File

@@ -45,14 +45,14 @@ static __thread x16rv2_8way_context_overlay x16rv2_ctx;
int x16rv2_8way_hash( void* output, const void* input, int thrid )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t hash4[24] __attribute__ ((aligned (64)));
uint32_t hash5[24] __attribute__ ((aligned (64)));
uint32_t hash6[24] __attribute__ ((aligned (64)));
uint32_t hash7[24] __attribute__ ((aligned (64)));
uint32_t hash0[24] __attribute__ ((aligned (32)));
uint32_t hash1[24] __attribute__ ((aligned (32)));
uint32_t hash2[24] __attribute__ ((aligned (32)));
uint32_t hash3[24] __attribute__ ((aligned (32)));
uint32_t hash4[24] __attribute__ ((aligned (32)));
uint32_t hash5[24] __attribute__ ((aligned (32)));
uint32_t hash6[24] __attribute__ ((aligned (32)));
uint32_t hash7[24] __attribute__ ((aligned (32)));
x16rv2_8way_context_overlay ctx;
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -706,11 +706,11 @@ inline void padtiger512( uint32_t* hash )
int x16rv2_4way_hash( void* output, const void* input, int thrid )
{
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t vhash[20*4] __attribute__ ((aligned (64)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
uint32_t hash1[20] __attribute__ ((aligned (32)));
uint32_t hash2[20] __attribute__ ((aligned (32)));
uint32_t hash3[20] __attribute__ ((aligned (32)));
x16rv2_4way_context_overlay ctx;
memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
@@ -1054,8 +1054,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t edata[20];
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -1068,7 +1068,6 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0fff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );

View File

@@ -63,14 +63,14 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
uint64_t hash4[8] __attribute__ ((aligned (32)));
uint64_t hash5[8] __attribute__ ((aligned (32)));
uint64_t hash6[8] __attribute__ ((aligned (32)));
uint64_t hash7[8] __attribute__ ((aligned (32)));
sonoa_8way_context_overlay ctx;
// 1
@@ -1150,13 +1150,13 @@ typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
int sonoa_4way_hash( void *state, const void *input, int thr_id )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
sonoa_4way_context_overlay ctx;
// 1

View File

@@ -58,23 +58,27 @@ union _x17_8way_context_overlay
} __attribute__ ((aligned (64)));
typedef union _x17_8way_context_overlay x17_8way_context_overlay;
static __thread __m512i x17_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
int x17_8way_hash( void *state, const void *input, int thr_id )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
uint64_t hash4[8] __attribute__ ((aligned (32)));
uint64_t hash5[8] __attribute__ ((aligned (32)));
uint64_t hash6[8] __attribute__ ((aligned (32)));
uint64_t hash7[8] __attribute__ ((aligned (32)));
x17_8way_context_overlay ctx;
blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
x17_8way_midstate );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -122,9 +126,6 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 );
// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
#if defined(__VAES__)
shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
@@ -237,6 +238,61 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
return 1;
}
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash32[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = &(hash32[7*8]);
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m512i eight = m512_const1_64( 8 );
const bool bench = opt_benchmark;
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = mm512_intrlv_blend_32( *noncev,
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
0, n+3, 0, n+2, 0, n+1, 0, n ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
do
{
if ( likely( x17_8way_hash( hash32, vdata, thr_id ) ) )
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
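mm128_swap64_32 is defined in simd-utils, not in this diff; from its use here it presumably swaps the two 32-bit halves of each 64-bit lane so the header words land in the little-endian word order expected by blake512_8way_prehash_le and blake512_8way_final_le. A one-line sketch of a shuffle with that behaviour (an assumption, not the verified definition):

#include <immintrin.h>

// { v3, v2, v1, v0 }  ->  { v2, v3, v0, v1 }  (32-bit lanes)
static inline __m128i swap64_32_sketch( __m128i v )
{
    return _mm_shuffle_epi32( v, 0xb1 );
}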
#elif defined(X17_4WAY)
union _x17_4way_context_overlay
@@ -271,10 +327,10 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (32)));
uint64_t hash1[8] __attribute__ ((aligned (32)));
uint64_t hash2[8] __attribute__ ((aligned (32)));
uint64_t hash3[8] __attribute__ ((aligned (32)));
x17_4way_context_overlay ctx;
blake512_4way_full( &ctx.blake, vhash, input, 80 );

View File

@@ -3,7 +3,7 @@
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_8WAY)
gate->scanhash = (void*)&scanhash_8way_64in_32out;
gate->scanhash = (void*)&scanhash_x17_8way;
gate->hash = (void*)&x17_8way_hash;
#elif defined (X17_4WAY)
gate->scanhash = (void*)&scanhash_4way_64in_32out;

View File

@@ -14,10 +14,15 @@ bool register_x17_algo( algo_gate_t* gate );
#if defined(X17_8WAY)
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int x17_8way_hash( void *state, const void *input, int thr_id );
#elif defined(X17_4WAY)
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int x17_4way_hash( void *state, const void *input, int thr_id );
#endif

View File

@@ -62,14 +62,14 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t hash4[16] __attribute__ ((aligned (64)));
uint64_t hash5[16] __attribute__ ((aligned (64)));
uint64_t hash6[16] __attribute__ ((aligned (64)));
uint64_t hash7[16] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (32)));
uint64_t hash1[16] __attribute__ ((aligned (32)));
uint64_t hash2[16] __attribute__ ((aligned (32)));
uint64_t hash3[16] __attribute__ ((aligned (32)));
uint64_t hash4[16] __attribute__ ((aligned (32)));
uint64_t hash5[16] __attribute__ ((aligned (32)));
uint64_t hash6[16] __attribute__ ((aligned (32)));
uint64_t hash7[16] __attribute__ ((aligned (32)));
const int dataLen = 128;
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
@@ -430,13 +430,13 @@ typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;
int xevan_4way_hash( void *output, const void *input, int thr_id )
{
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashA[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<2] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (32)));
uint64_t hash1[16] __attribute__ ((aligned (32)));
uint64_t hash2[16] __attribute__ ((aligned (32)));
uint64_t hash3[16] __attribute__ ((aligned (32)));
const int dataLen = 128;
xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));

View File

@@ -21,7 +21,6 @@
#include "algo/tiger/sph_tiger.h"
#include "algo/lyra2/lyra2.h"
#include "algo/gost/sph_gost.h"
#include "algo/swifftx/swifftx.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"

View File

@@ -50,6 +50,7 @@ bool register_x25x_algo( algo_gate_t* gate )
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT |
AVX512_OPT | VAES_OPT;
InitializeSWIFFTX();
return true;
};

View File

@@ -5,6 +5,7 @@
#include "simd-utils.h"
#include <stdint.h>
#include <unistd.h>
#include "algo/swifftx/swifftx.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X22I_8WAY 1

View File

@@ -24,7 +24,6 @@
#include "algo/tiger/sph_tiger.h"
#include "algo/lyra2/lyra2.h"
#include "algo/gost/sph_gost.h"
#include "algo/swifftx/swifftx.h"
#include "algo/panama/panama-hash-4way.h"
#include "algo/lanehash/lane.h"
#if defined(__VAES__)
@@ -102,6 +101,9 @@ union _x25x_8way_ctx_overlay
};
typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay;
static __thread __m512i x25x_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
int x25x_8way_hash( void *output, const void *input, int thrid )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
@@ -118,9 +120,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64)));
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
x25x_8way_midstate );
dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0],
hash4[0], hash5[0], hash6[0], hash7[0], vhash );
@@ -271,7 +273,6 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10],
hash4[10], hash5[10], hash6[10], hash7[10] );
#else
init_echo( &ctx.echo, 512 );
@@ -558,6 +559,7 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &(hash[7*8]);
uint32_t *pdata = work->data;
@@ -569,15 +571,22 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const uint32_t targ32 = ptarget[7];
const bool bench = opt_benchmark;
const __m512i eight = m512_const1_64( 8 );
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = mm512_intrlv_blend_32( *noncev,
_mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4,
0, n+3, 0, n+2, 0, n+1, 0, n ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x25x_8way_hash( hash, vdata, thr_id ) )
@@ -588,12 +597,11 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -637,8 +645,12 @@ union _x25x_4way_ctx_overlay
panama_4way_context panama;
blake2s_4way_state blake2s;
};
typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay;
static __thread __m256i x25x_4way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
int x25x_4way_hash( void *output, const void *input, int thrid )
{
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
@@ -651,7 +663,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));
blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
x25x_4way_midstate );
dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );
bmw512_4way_init( &ctx.bmw );
@@ -905,6 +919,7 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &(hash[ 7*4 ]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -914,15 +929,23 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32 = ptarget[7];
const __m256i four = m256_const1_64( 4 );
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
mm256_intrlv80_4x64( vdata, edata );
*noncev = mm256_intrlv_blend_32( *noncev,
_mm256_set_epi32( 0, n+3, 0, n+2, 0, n+1, 0, n ) );
blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata );
do
{
if ( x25x_4way_hash( hash, vdata, thr_id ) )
@@ -932,12 +955,11 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
*noncev = _mm256_add_epi32( *noncev, four );
n += 4;
} while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -1,323 +0,0 @@
/*
* Copyright 2009 Colin Percival, 2014 savale
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include <algo/yespower/crypto/sph_types.h>
#include "blake2b-yp.h"
// Cyclic right rotation.
//#ifndef ROTR64
//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
//#endif
#define ROTR64(x, y) ror64( x, y )
// Little-endian byte access.
#define B2B_GET64(p) \
(((uint64_t) ((uint8_t *) (p))[0]) ^ \
(((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
(((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
(((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
(((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
(((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
(((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
(((uint64_t) ((uint8_t *) (p))[7]) << 56))
// G Mixing function.
#define B2B_G(a, b, c, d, x, y) { \
v[a] = v[a] + v[b] + x; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 24); \
v[a] = v[a] + v[b] + y; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 63); }
// Initialization Vector.
static const uint64_t blake2b_iv[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
// Compression function. "last" flag indicates last block.
static void blake2b_compress(blake2b_yp_ctx *ctx, int last)
{
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
uint64_t v[16], m[16];
// init work variables
for (i = 0; i < 8; i++) {
v[i] = ctx->h[i];
v[i + 8] = blake2b_iv[i];
}
v[12] ^= ctx->t[0]; // low 64 bits of offset
v[13] ^= ctx->t[1]; // high 64 bits
// last block flag set ?
if (last) {
v[14] = ~v[14];
}
// get little-endian words
for (i = 0; i < 16; i++) {
m[i] = B2B_GET64(&ctx->b[8 * i]);
}
// twelve rounds
for (i = 0; i < 12; i++) {
B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
}
for(i = 0; i < 8; ++i) {
ctx->h[i] ^= v[i] ^ v[i + 8];
}
}
// Initialize the hashing context "ctx" with optional key "key".
// 1 <= outlen <= 64 gives the digest size in bytes.
// Secret key (also <= 64 bytes) is optional (keylen = 0).
int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen,
const void *key, size_t keylen) // (keylen=0: no key)
{
size_t i;
// illegal parameters
if (outlen == 0 || outlen > 64 || keylen > 64) {
return -1;
}
// state, "param block"
for (i = 0; i < 8; i++) {
ctx->h[i] = blake2b_iv[i];
}
ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
ctx->t[0] = 0; // input count low word
ctx->t[1] = 0; // input count high word
ctx->c = 0; // pointer within buffer
ctx->outlen = outlen;
// zero input block
for (i = keylen; i < 128; i++) {
ctx->b[i] = 0;
}
if (keylen > 0) {
blake2b_yp_update(ctx, key, keylen);
ctx->c = 128; // at the end
}
return 0;
}
// Add "inlen" bytes from "in" into the hash.
void blake2b_yp_update(blake2b_yp_ctx *ctx,
const void *in, size_t inlen) // data bytes
{
size_t i;
for (i = 0; i < inlen; i++) {
if (ctx->c == 128) { // buffer full ?
ctx->t[0] += ctx->c; // add counters
if (ctx->t[0] < ctx->c) // carry overflow ?
ctx->t[1]++; // high word
blake2b_compress(ctx, 0); // compress (not last)
ctx->c = 0; // counter to zero
}
ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
}
}
// Generate the message digest (size given in init).
// Result placed in "out".
void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out)
{
size_t i;
ctx->t[0] += ctx->c; // mark last block offset
// carry overflow
if (ctx->t[0] < ctx->c) {
ctx->t[1]++; // high word
}
// fill up with zeros
while (ctx->c < 128) {
ctx->b[ctx->c++] = 0;
}
blake2b_compress(ctx, 1); // final block flag = 1
// little endian convert and store
for (i = 0; i < ctx->outlen; i++) {
((uint8_t *) out)[i] =
(ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
}
}
// inlen = number of bytes
void blake2b_yp_hash(void *out, const void *in, size_t inlen) {
blake2b_yp_ctx ctx;
blake2b_yp_init(&ctx, 32, NULL, 0);
blake2b_yp_update(&ctx, in, inlen);
blake2b_yp_final(&ctx, out);
}
// // keylen = number of bytes
void hmac_blake2b_yp_init(hmac_yp_ctx *hctx, const void *_key, size_t keylen) {
const uint8_t *key = _key;
uint8_t keyhash[32];
uint8_t pad[64];
uint64_t i;
if (keylen > 64) {
blake2b_yp_hash(keyhash, key, keylen);
key = keyhash;
keylen = 32;
}
blake2b_yp_init(&hctx->inner, 32, NULL, 0);
memset(pad, 0x36, 64);
for (i = 0; i < keylen; ++i) {
pad[i] ^= key[i];
}
blake2b_yp_update(&hctx->inner, pad, 64);
blake2b_yp_init(&hctx->outer, 32, NULL, 0);
memset(pad, 0x5c, 64);
for (i = 0; i < keylen; ++i) {
pad[i] ^= key[i];
}
blake2b_yp_update(&hctx->outer, pad, 64);
memset(keyhash, 0, 32);
}
// datalen = number of bits
void hmac_blake2b_yp_update(hmac_yp_ctx *hctx, const void *data, size_t datalen) {
// update the inner state
blake2b_yp_update(&hctx->inner, data, datalen);
}
void hmac_blake2b_yp_final(hmac_yp_ctx *hctx, uint8_t *digest) {
uint8_t ihash[32];
blake2b_yp_final(&hctx->inner, ihash);
blake2b_yp_update(&hctx->outer, ihash, 32);
blake2b_yp_final(&hctx->outer, digest);
memset(ihash, 0, 32);
}
// // keylen = number of bytes; inlen = number of bytes
void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen) {
hmac_yp_ctx hctx;
hmac_blake2b_yp_init(&hctx, key, keylen);
hmac_blake2b_yp_update(&hctx, in, inlen);
hmac_blake2b_yp_final(&hctx, out);
}
void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
hmac_yp_ctx PShctx, hctx;
size_t i;
uint32_t ivec;
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Compute HMAC state after processing P and S. */
hmac_blake2b_yp_init(&PShctx, passwd, passwdlen);
hmac_blake2b_yp_update(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
ivec = bswap_32( i+1 );
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(hmac_yp_ctx));
hmac_blake2b_yp_update(&hctx, &ivec, 4);
hmac_blake2b_yp_final(&hctx, U);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
hmac_blake2b_yp_init(&hctx, passwd, passwdlen);
hmac_blake2b_yp_update(&hctx, U, 32);
hmac_blake2b_yp_final(&hctx, U);
/* ... xor U_j ... */
for (k = 0; k < 32; k++) {
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32) {
clen = 32;
}
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
memset(&PShctx, 0, sizeof(hmac_yp_ctx));
}
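For reference, the loop in the file removed above is the standard PBKDF2 construction with HMAC-BLAKE2b as the PRF and a 32-byte PRF output; in the notation of its comments:

U_1 = \mathrm{PRF}(P,\; S \,\|\, \mathrm{INT}(i)), \qquad U_j = \mathrm{PRF}(P,\; U_{j-1}) \quad (2 \le j \le c)

T_i = U_1 \oplus U_2 \oplus \cdots \oplus U_c, \qquad \mathrm{DK} = T_1 \,\|\, T_2 \,\|\, \cdots \quad (\text{truncated to } dkLen \text{ bytes})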

View File

@@ -1,42 +0,0 @@
#pragma once
#ifndef __BLAKE2B_H__
#define __BLAKE2B_H__
#include <stddef.h>
#include <stdint.h>
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
#define NATIVE_LITTLE_ENDIAN
#endif
// state context
typedef struct {
uint8_t b[128]; // input buffer
uint64_t h[8]; // chained state
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_yp_ctx;
typedef struct {
blake2b_yp_ctx inner;
blake2b_yp_ctx outer;
} hmac_yp_ctx;
#if defined(__cplusplus)
extern "C" {
#endif
int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen, const void *key, size_t keylen);
void blake2b_yp_update(blake2b_yp_ctx *ctx, const void *in, size_t inlen);
void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out);
void blake2b_yp_hash(void *out, const void *in, size_t inlen);
void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen);
void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen);
#if defined(__cplusplus)
}
#endif
#endif

View File

@@ -0,0 +1,150 @@
/*
* Copyright 2009 Colin Percival, 2014 savale
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "hmac-blake2b.h"
// keylen = number of bytes
void hmac_blake2b_init( hmac_blake2b_ctx *hctx, const void *_key,
size_t keylen )
{
const uint8_t *key = _key;
uint8_t keyhash[32];
uint8_t pad[64];
uint64_t i;
if (keylen > 64)
{
sph_blake2b_ctx ctx;
sph_blake2b_init( &ctx, 32, NULL, 0 );
sph_blake2b_update( &ctx, key, keylen );
sph_blake2b_final( &ctx, keyhash );
key = keyhash;
keylen = 32;
}
sph_blake2b_init( &hctx->inner, 32, NULL, 0 );
memset( pad, 0x36, 64 );
for ( i = 0; i < keylen; ++i )
pad[i] ^= key[i];
sph_blake2b_update( &hctx->inner, pad, 64 );
sph_blake2b_init( &hctx->outer, 32, NULL, 0 );
memset( pad, 0x5c, 64 );
for ( i = 0; i < keylen; ++i )
pad[i] ^= key[i];
sph_blake2b_update( &hctx->outer, pad, 64 );
memset( keyhash, 0, 32 );
}
// datalen = number of bytes
void hmac_blake2b_update( hmac_blake2b_ctx *hctx, const void *data,
size_t datalen )
{
// update the inner state
sph_blake2b_update( &hctx->inner, data, datalen );
}
void hmac_blake2b_final( hmac_blake2b_ctx *hctx, uint8_t *digest )
{
uint8_t ihash[32];
sph_blake2b_final( &hctx->inner, ihash );
sph_blake2b_update( &hctx->outer, ihash, 32 );
sph_blake2b_final( &hctx->outer, digest );
memset( ihash, 0, 32 );
}
// // keylen = number of bytes; inlen = number of bytes
void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
const void *in, size_t inlen )
{
hmac_blake2b_ctx hctx;
hmac_blake2b_init( &hctx, key, keylen );
hmac_blake2b_update( &hctx, in, inlen );
hmac_blake2b_final( &hctx, out );
}
void pbkdf2_blake2b( const uint8_t *passwd, size_t passwdlen,
const uint8_t *salt, size_t saltlen, uint64_t c,
uint8_t *buf, size_t dkLen )
{
hmac_blake2b_ctx PShctx, hctx;
size_t i;
uint32_t ivec;
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Compute HMAC state after processing P and S. */
hmac_blake2b_init( &PShctx, passwd, passwdlen );
hmac_blake2b_update( &PShctx, salt, saltlen );
/* Iterate through the blocks. */
for ( i = 0; i * 32 < dkLen; i++ )
{
/* Generate INT(i + 1). */
ivec = bswap_32( i+1 );
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy( &hctx, &PShctx, sizeof(hmac_blake2b_ctx) );
hmac_blake2b_update( &hctx, &ivec, 4 );
hmac_blake2b_final( &hctx, U );
/* T_i = U_1 ... */
memcpy( T, U, 32 );
for ( j = 2; j <= c; j++ )
{
/* Compute U_j. */
hmac_blake2b_init( &hctx, passwd, passwdlen );
hmac_blake2b_update( &hctx, U, 32 );
hmac_blake2b_final( &hctx, U );
/* ... xor U_j ... */
for ( k = 0; k < 32; k++ )
T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy( &buf[i * 32], T, clen );
}
/* Clean PShctx, since we never called _Final on it. */
memset( &PShctx, 0, sizeof(hmac_blake2b_ctx) );
}

View File

@@ -0,0 +1,34 @@
#pragma once
#ifndef __HMAC_BLAKE2B_H__
#define __HMAC_BLAKE2B_H__
#include <stddef.h>
#include <stdint.h>
#include "algo/blake/sph_blake2b.h"
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
#define NATIVE_LITTLE_ENDIAN
#endif
typedef struct
{
sph_blake2b_ctx inner;
sph_blake2b_ctx outer;
} hmac_blake2b_ctx;
#if defined(__cplusplus)
extern "C" {
#endif
void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
const void *in, size_t inlen );
void pbkdf2_blake2b( const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen, uint64_t c,
uint8_t * buf, size_t dkLen );
#if defined(__cplusplus)
}
#endif
#endif
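For orientation, a minimal usage sketch of the API declared above. The include path, password and salt values are hypothetical; the derived-key length is the caller's choice (yespower_b2b derives a 128 byte block buffer with c = 1, as shown further down).

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include "algo/yespower/crypto/hmac-blake2b.h"   /* hypothetical include path */

    int main(void)
    {
        const uint8_t passwd[] = "hypothetical password";
        const uint8_t salt[]   = "hypothetical salt";
        uint8_t mac[32];    /* HMAC-BLAKE2b produces a 32 byte tag */
        uint8_t dk[128];    /* derived key length chosen by the caller */

        /* One-shot HMAC over a message, keyed with passwd. */
        hmac_blake2b_hash( mac, passwd, sizeof(passwd) - 1,
                           salt, sizeof(salt) - 1 );

        /* PBKDF2 with HMAC-BLAKE2b as the PRF; yespower_b2b uses c = 1. */
        pbkdf2_blake2b( passwd, sizeof(passwd) - 1,
                        salt, sizeof(salt) - 1, 1, dk, sizeof(dk) );

        printf( "dk[0] = %02x, mac[0] = %02x\n", dk[0], mac[0] );
        return 0;
    }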

File diff suppressed because it is too large

View File

@@ -95,7 +95,7 @@
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "crypto/blake2b-yp.h"
#include "crypto/hmac-blake2b.h"
#include "yespower.h"
#ifdef __unix__
@@ -1136,6 +1136,7 @@ int yespower_b2b(yespower_local_t *local,
salsa20_blk_t *V, *XY;
pwxform_ctx_t ctx;
uint8_t init_hash[32];
sph_blake2b_ctx blake2b_ctx;
/* Sanity-check parameters */
if ((N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
@@ -1167,7 +1168,9 @@ int yespower_b2b(yespower_local_t *local,
ctx.S0 = S;
ctx.S1 = S + Swidth_to_Sbytes1(Swidth);
blake2b_yp_hash(init_hash, src, srclen);
sph_blake2b_init( &blake2b_ctx, 32, NULL, 0 );
sph_blake2b_update( &blake2b_ctx, src, srclen );
sph_blake2b_final( &blake2b_ctx, init_hash );
ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth);
ctx.w = 0;
@@ -1181,7 +1184,7 @@ int yespower_b2b(yespower_local_t *local,
if ( work_restart[thrid].restart ) return false;
pbkdf2_blake2b_yp(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);
pbkdf2_blake2b(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);
if ( work_restart[thrid].restart ) return false;
@@ -1190,7 +1193,7 @@ int yespower_b2b(yespower_local_t *local,
if ( work_restart[thrid].restart ) return false;
hmac_blake2b_yp_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));
hmac_blake2b_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));
/* Success! */
return 1;
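Since this view interleaves removed and added lines, the substitution above is easier to see side by side. It is a mechanical change: the one-shot helper from the deleted blake2b-yp.c becomes an explicit init/update/final sequence on the optimized sph_blake2b context, and the _yp PBKDF2/HMAC wrappers are renamed (names exactly as in the diff):

    /* before: removed with crypto/blake2b-yp.c */
    blake2b_yp_hash( init_hash, src, srclen );

    /* after: shared sph_blake2b, 32 byte digest, no key */
    sph_blake2b_ctx blake2b_ctx;
    sph_blake2b_init( &blake2b_ctx, 32, NULL, 0 );
    sph_blake2b_update( &blake2b_ctx, src, srclen );
    sph_blake2b_final( &blake2b_ctx, init_hash );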

View File

@@ -249,7 +249,7 @@ bool register_power2b_algo( algo_gate_t* gate )
applog( LOG_NOTICE,"Key= \"%s\"", yespower_params.pers );
applog( LOG_NOTICE,"Key length= %d\n", yespower_params.perslen );
gate->optimizations = SSE2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_yespower_b2b;
gate->hash = (void*)&yespower_b2b_hash;
opt_target_factor = 65536.0;

View File

@@ -36,8 +36,8 @@ mv cpuminer cpuminer-avx2-sha-vaes
# AVX2 SHA AES: AMD Zen1
make clean || echo done
rm -f config.status
CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl
#CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl
#CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl
CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl
make -j 8
strip -s cpuminer
mv cpuminer cpuminer-avx2-sha

10
build-msys2.sh Executable file
View File

@@ -0,0 +1,10 @@
#!/bin/bash
#
# Compile on Windows using MSYS2 and MinGW.
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 --param=evrp-mode=legacy -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
make -j 4
strip -s cpuminer

48
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.2.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.19.2'
PACKAGE_STRING='cpuminer-opt 3.19.2'
PACKAGE_VERSION='3.20.1'
PACKAGE_STRING='cpuminer-opt 3.20.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.19.2 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.20.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.19.2:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.20.1:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.19.2
cpuminer-opt configure 3.20.1
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.19.2, which was
It was created by cpuminer-opt $as_me 3.20.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.19.2'
VERSION='3.20.1'
cat >>confdefs.h <<_ACEOF
@@ -5820,6 +5820,34 @@ $as_echo "#define USE_AVX2 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
$as_echo_n "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define USE_AVX512 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
$as_echo "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
@@ -6690,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.19.2, which was
This file was extended by cpuminer-opt $as_me 3.20.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.19.2
cpuminer-opt config.status 3.20.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.19.2])
AC_INIT([cpuminer-opt], [3.20.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -93,6 +93,14 @@ then
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile AVX512 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])

View File

@@ -105,8 +105,9 @@ bool opt_randomize = false;
static int opt_retries = -1;
static int opt_fail_pause = 10;
static int opt_time_limit = 0;
static unsigned int time_limit_stop = 0;
int opt_timeout = 300;
static int opt_scantime = 5;
static int opt_scantime = 0;
const int min_scantime = 1;
//static const bool opt_time = true;
enum algos opt_algo = ALGO_NULL;
@@ -127,6 +128,12 @@ char *short_url = NULL;
char *coinbase_address;
char *opt_data_file = NULL;
bool opt_verify = false;
static bool opt_stratum_keepalive = false;
static struct timeval stratum_keepalive_timer;
// Stratum typically times out in 5 minutes or 300 seconds
#define stratum_keepalive_timeout 180 // 3 minutes
static struct timeval stratum_reset_time;
// pk_buffer_size is used as a version selector by b58 code, therefore
// it must be set correctly to work.
@@ -187,7 +194,6 @@ int default_api_listen = 4048;
static struct timeval session_start;
static struct timeval five_min_start;
static uint64_t session_first_block = 0;
static double latency_sum = 0.;
static uint64_t submit_sum = 0;
static uint64_t accept_sum = 0;
static uint64_t stale_sum = 0;
@@ -336,6 +342,7 @@ void get_currentalgo(char* buf, int sz)
void proper_exit(int reason)
{
if (opt_debug) applog(LOG_INFO,"Program exit");
#ifdef WIN32
if (opt_background) {
HWND hcon = GetConsoleWindow();
@@ -1092,7 +1099,7 @@ void report_summary_log( bool force )
sprintf_et( et_str, et.tv_sec );
sprintf_et( upt_str, uptime.tv_sec );
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], rpc_url );
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
submit_rate, safe_div( (double)submitted_share_count*60.,
@@ -1143,7 +1150,7 @@ void report_summary_log( bool force )
solved, solved_block_count );
}
if ( stratum_errors )
applog2( LOG_INFO, "Stratum errors %7d", stratum_errors );
applog2( LOG_INFO, "Stratum resets %7d", stratum_errors );
applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g",
highest_share, lowest_share );
@@ -1274,7 +1281,6 @@ static int share_result( int result, struct work *work,
else reject_sum++;
}
submit_sum++;
latency_sum += latency;
pthread_mutex_unlock( &stats_lock );
@@ -1290,10 +1296,11 @@ static int share_result( int result, struct work *work,
else rcol = CL_LRD;
}
applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)",
applog( LOG_INFO, "%d %s%s %s%s %s%s %s%s%s, %.3f sec (%dms)",
my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
bres, share_time, latency );
bres, CL_N, share_time, latency );
/*
if ( unlikely( opt_debug || !result || solved ) )
{
if ( have_stratum )
@@ -1303,14 +1310,27 @@ static int share_result( int result, struct work *work,
applog2( LOG_INFO, "Diff %.5g, Block %d",
my_stats.share_diff, work ? work->height : last_block_height );
}
*/
if ( unlikely( !( opt_quiet || result || stale ) ) )
{
uint32_t str[8];
uint32_t *targ;
// uint32_t str[8];
// uint32_t *targ;
if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason );
if ( reason ) applog2( LOG_MINR, "Reject reason: %s", reason );
{
// The exact hash is not available here, it's just an imprecise
// approximation calculated from the share difficulty. It's useless
// for anything other than low diff rejects. Until and unless a
// solution is implemented to make the hash and targets available,
// don't bother displaying them. In the meantime display the diff for
// low diff rejects.
if ( strstr( reason, "difficulty" ) )
applog2( LOG_MINR, "Share diff: %.5g, Target: %.5g",
my_stats.share_diff, my_stats.target_diff );
/*
diff_to_hash( str, my_stats.share_diff );
applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6],
str[5], str[4], str[3],str[2], str[1], str[0] );
@@ -1324,6 +1344,8 @@ static int share_result( int result, struct work *work,
}
applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6],
targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] );
*/
}
}
return 1;
}
@@ -2110,7 +2132,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
{
unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
g_work->xnonce2_len );
applog( LOG_INFO, "Extranonce2 %s, Block %d, Job %s",
applog( LOG_INFO, "Extranonce2 0x%s, Block %d, Job %s",
xnonce2str, sctx->block_height, g_work->job_id );
free( xnonce2str );
}
@@ -2197,8 +2219,6 @@ static void *miner_thread( void *userdata )
// : 0;
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
time_t firstwork_time = 0;
int i;
memset( &work, 0, sizeof(work) );
/* Set worker threads to nice 19 and then preferentially to SCHED_IDLE
@@ -2242,7 +2262,7 @@ static void *miner_thread( void *userdata )
if ( !algo_gate.miner_thread_init( thr_id ) )
{
applog( LOG_ERR, "FAIL: thread %u failed to initialize", thr_id );
applog( LOG_ERR, "FAIL: thread %d failed to initialize", thr_id );
exit (1);
}
@@ -2270,22 +2290,34 @@ static void *miner_thread( void *userdata )
{
while ( unlikely( stratum_down ) )
sleep( 1 );
if ( *nonceptr >= end_nonce )
stratum_gen_work( &stratum, &g_work );
if ( unlikely( ( *nonceptr >= end_nonce )
&& !work_restart[thr_id].restart ) )
{
if ( opt_extranonce )
stratum_gen_work( &stratum, &g_work );
else
{
if ( !thr_id )
{
applog( LOG_WARNING, "nonce range exhausted, extranonce not subscribed" );
applog( LOG_WARNING, "waiting for new work...");
}
while ( !work_restart[thr_id].restart )
sleep ( 1 );
}
}
}
else
else if ( !opt_benchmark ) // GBT or getwork
{
pthread_rwlock_wrlock( &g_work_lock );
if ( ( ( time(NULL) - g_work_time )
>= ( have_longpoll ? LP_SCANTIME : opt_scantime ) )
if ( ( ( time(NULL) - g_work_time ) >= opt_scantime )
|| ( *nonceptr >= end_nonce ) )
{
if ( unlikely( !get_work( mythr, &g_work ) ) )
{
pthread_rwlock_unlock( &g_work_lock );
applog( LOG_ERR, "work retrieval failed, exiting "
"mining thread %d", thr_id );
applog( LOG_ERR, "work retrieval failed, exiting miner thread %d", thr_id );
goto out;
}
g_work_time = time(NULL);
@@ -2308,25 +2340,14 @@ static void *miner_thread( void *userdata )
if ( unlikely( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) )
continue;
// LP_SCANTIME overrides opt_scantime option, is this right?
// adjust max_nonce to meet target scan time. Stratum and longpoll
// can go longer because they can rely on restart_threads to signal
// an early abort. get_work on the other hand can't rely on
// restart_threads so need a much shorter scantime
if ( have_stratum )
max64 = 60 * thr_hashrates[thr_id];
else if ( have_longpoll )
max64 = LP_SCANTIME * thr_hashrates[thr_id];
else // getwork inline
max64 = opt_scantime * thr_hashrates[thr_id];
// opt_scantime expressed in hashes
max64 = opt_scantime * thr_hashrates[thr_id];
// time limit
if ( unlikely( opt_time_limit && firstwork_time ) )
if ( unlikely( opt_time_limit ) )
{
int passed = (int)( time(NULL) - firstwork_time );
int remain = (int)( opt_time_limit - passed );
if ( remain < 0 )
unsigned int now = (unsigned int)time(NULL);
if ( now >= time_limit_stop )
{
if ( thr_id != 0 )
{
@@ -2338,14 +2359,16 @@ static void *miner_thread( void *userdata )
char rate[32];
format_hashrate( global_hashrate, rate );
applog( LOG_NOTICE, "Benchmark: %s", rate );
fprintf(stderr, "%llu\n", (unsigned long long)global_hashrate);
}
else
applog( LOG_NOTICE,
"Mining timeout of %ds reached, exiting...", opt_time_limit);
proper_exit(0);
applog( LOG_NOTICE, "Mining timeout of %ds reached, exiting...",
opt_time_limit);
proper_exit(0);
}
if ( remain < max64 ) max64 = remain;
// else
if ( time_limit_stop - now < opt_scantime )
max64 = ( time_limit_stop - now ) * thr_hashrates[thr_id] ;
}
// Select nonce range based on max64, the estimated number of hashes
@@ -2361,8 +2384,6 @@ static void *miner_thread( void *userdata )
max_nonce = work_nonce + (uint32_t)max64;
// init time
if ( firstwork_time == 0 )
firstwork_time = time(NULL);
hashes_done = 0;
gettimeofday( (struct timeval *) &tv_start, NULL );
@@ -2435,7 +2456,7 @@ static void *miner_thread( void *userdata )
{
double hashrate = 0.;
pthread_mutex_lock( &stats_lock );
for ( i = 0; i < opt_n_threads; i++ )
for ( int i = 0; i < opt_n_threads; i++ )
hashrate += thr_hashrates[i];
global_hashrate = hashrate;
pthread_mutex_unlock( &stats_lock );
@@ -2729,6 +2750,18 @@ void std_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
sctx->job.final_sapling_hash );
}
// Loop is out of order:
//
// connect/reconnect
// handle message
// get new message
//
// change to
// connect/reconnect
// get new message
// handle message
static void *stratum_thread(void *userdata )
{
struct thr_info *mythr = (struct thr_info *) userdata;
@@ -2737,7 +2770,7 @@ static void *stratum_thread(void *userdata )
stratum.url = (char*) tq_pop(mythr->q, NULL);
if (!stratum.url)
goto out;
applog( LOG_BLUE, "Stratum connect %s", short_url );
applog( LOG_BLUE, "Stratum connect %s", stratum.url );
while (1)
{
@@ -2746,6 +2779,7 @@ static void *stratum_thread(void *userdata )
if ( unlikely( stratum_need_reset ) )
{
stratum_need_reset = false;
gettimeofday( &stratum_reset_time, NULL );
stratum_down = true;
stratum_errors++;
stratum_disconnect( &stratum );
@@ -2756,7 +2790,7 @@ static void *stratum_thread(void *userdata )
applog(LOG_BLUE, "Connection changed to %s", short_url);
}
else
applog(LOG_WARNING, "Stratum connection reset");
applog(LOG_BLUE, "Stratum connection reset");
// reset stats queue as well
restart_threads();
if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0;
@@ -2788,15 +2822,12 @@ static void *stratum_thread(void *userdata )
{
stratum_down = false;
applog(LOG_BLUE,"Stratum connection established" );
if ( stratum.new_job ) // prime first job
stratum_gen_work( &stratum, &g_work );
}
}
report_summary_log( ( stratum_diff != stratum.job.diff )
&& ( stratum_diff != 0. ) );
if ( stratum.new_job )
stratum_gen_work( &stratum, &g_work );
// Wait for new message from server
if ( likely( stratum_socket_full( &stratum, opt_timeout ) ) )
{
if ( likely( s = stratum_recv_line( &stratum ) ) )
@@ -2819,6 +2850,53 @@ static void *stratum_thread(void *userdata )
// stratum_disconnect( &stratum );
}
report_summary_log( ( stratum_diff != stratum.job.diff )
&& ( stratum_diff != 0. ) );
if ( !stratum_need_reset )
{
// Is keepalive needed? Mutex would normally be required but that
// would block any attempt to submit a share. A share is more
// important even if it messes up the keepalive.
if ( opt_stratum_keepalive )
{
struct timeval now, et;
gettimeofday( &now, NULL );
// any shares submitted since last keepalive?
if ( last_submit_time.tv_sec > stratum_keepalive_timer.tv_sec )
memcpy( &stratum_keepalive_timer, &last_submit_time,
sizeof (struct timeval) );
timeval_subtract( &et, &now, &stratum_keepalive_timer );
if ( et.tv_sec > stratum_keepalive_timeout )
{
double diff = stratum.job.diff * 0.5;
stratum_keepalive_timer = now;
if ( !opt_quiet )
applog( LOG_BLUE,
"Stratum keepalive requesting lower difficulty" );
stratum_suggest_difficulty( &stratum, diff );
}
if ( last_submit_time.tv_sec > stratum_reset_time.tv_sec )
timeval_subtract( &et, &now, &last_submit_time );
else
timeval_subtract( &et, &now, &stratum_reset_time );
if ( et.tv_sec > stratum_keepalive_timeout + 60 )
{
applog( LOG_NOTICE, "No shares submitted, resetting stratum connection" );
stratum_need_reset = true;
stratum_keepalive_timer = now;
}
} // stratum_keepalive
if ( stratum.new_job && !stratum_need_reset )
stratum_gen_work( &stratum, &g_work );
} // stratum_need_reset
} // loop
out:
return NULL;
@@ -2990,8 +3068,8 @@ static bool cpu_capability( bool display_only )
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes;
use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 ||
use_sha || use_vaes );
use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512
|| use_avx2 || use_sha || use_vaes );
// Display best options
printf( "\nStarting miner with" );
@@ -3273,6 +3351,7 @@ void parse_arg(int key, char *arg )
if ( strncasecmp( arg, "http://", 7 )
&& strncasecmp( arg, "https://", 8 )
&& strncasecmp( arg, "stratum+tcp://", 14 )
&& strncasecmp( arg, "stratum+ssl://", 14 )
&& strncasecmp( arg, "stratum+tcps://", 15 ) )
{
fprintf(stderr, "unknown protocol -- '%s'\n", arg);
@@ -3407,7 +3486,8 @@ void parse_arg(int key, char *arg )
break;
case 1021: // cpu-priority
v = atoi(arg);
if (v < 0 || v > 5) /* sanity check */
applog(LOG_NOTICE,"--cpu-priority is deprecated and will be removed from a future release");
if (v < 0 || v > 5) /* sanity check */
show_usage_and_exit(1);
opt_priority = v;
break;
@@ -3443,14 +3523,18 @@ void parse_arg(int key, char *arg )
break;
case 1024:
opt_randomize = true;
break;
applog(LOG_NOTICE,"--randomize is deprecated and will be removed from a future release");
break;
case 1027: // data-file
opt_data_file = strdup( arg );
break;
case 1028: // verify
opt_verify = true;
break;
case 'V':
case 1029: // stratum-keepalive
opt_stratum_keepalive = true;
break;
case 'V':
display_cpu_capability();
exit(0);
case 'h':
@@ -3625,6 +3709,17 @@ int main(int argc, char *argv[])
show_usage_and_exit(1);
}
if ( !opt_scantime )
{
if ( have_stratum ) opt_scantime = 30;
else if ( have_longpoll ) opt_scantime = LP_SCANTIME;
else opt_scantime = 5;
}
if ( opt_time_limit )
time_limit_stop = (unsigned int)time(NULL) + opt_time_limit;
// need to register to get algo optimizations for cpu capabilities
// but that causes registration logs before cpu capabilities is output.
// Would need to split register function into 2 parts. First part sets algo
@@ -3690,6 +3785,7 @@ int main(int argc, char *argv[])
flags = CURL_GLOBAL_ALL;
if ( !opt_benchmark )
if ( strncasecmp( rpc_url, "https:", 6 )
&& strncasecmp( rpc_url, "stratum+ssl://", 14 )
&& strncasecmp( rpc_url, "stratum+tcps://", 15 ) )
flags &= ~CURL_GLOBAL_SSL;
@@ -3833,6 +3929,8 @@ int main(int argc, char *argv[])
if ( opt_debug )
applog(LOG_INFO,"Creating stratum thread");
stratum.new_job = false; // just to make sure
/* init stratum thread info */
stratum_thr_id = opt_n_threads + 2;
thr = &thr_info[stratum_thr_id];
@@ -3899,6 +3997,8 @@ int main(int argc, char *argv[])
gettimeofday( &last_submit_time, NULL );
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
memcpy( &session_start, &last_submit_time, sizeof (struct timeval) );
memcpy( &stratum_keepalive_timer, &last_submit_time, sizeof (struct timeval) );
memcpy( &stratum_reset_time, &last_submit_time, sizeof (struct timeval) );
memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) );
pthread_mutex_unlock( &stats_lock );
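To summarize the stratum-keepalive additions scattered through the diff above: the decision runs in the stratum thread after each message, measured from the later of the last submitted share and the last keepalive or reset event. A simplified, self-contained restatement (the real code also halves stratum.job.diff when it calls stratum_suggest_difficulty, and resets stratum_keepalive_timer after either action):

    #include <time.h>

    #define STRATUM_KEEPALIVE_TIMEOUT 180   /* 3 minutes, as in the diff */

    typedef enum { KA_NONE, KA_SUGGEST_LOWER_DIFF, KA_RESET_CONNECTION } ka_action_t;

    /* last_share / last_keepalive / last_reset mirror the timestamps the new
       statics track; the caller updates last_keepalive when a suggestion is sent. */
    static ka_action_t keepalive_action( time_t now, time_t last_share,
                                         time_t last_keepalive, time_t last_reset )
    {
        time_t ka_ref  = last_share > last_keepalive ? last_share : last_keepalive;
        time_t rst_ref = last_share > last_reset     ? last_share : last_reset;

        if ( now - ka_ref > STRATUM_KEEPALIVE_TIMEOUT )
            return KA_SUGGEST_LOWER_DIFF;        /* ask the pool for lower diff */
        if ( now - rst_ref > STRATUM_KEEPALIVE_TIMEOUT + 60 )
            return KA_RESET_CONNECTION;          /* preemptive stratum reset */
        return KA_NONE;
    }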

36
malloc-huge.c Normal file
View File

@@ -0,0 +1,36 @@
#include "malloc-huge.h"
#include "miner.h"
#define HUGEPAGE_SIZE_2M (2 * 1024 * 1024)
void *malloc_hugepages( size_t size )
{
#if !(defined(MAP_HUGETLB) && defined(MAP_ANON))
// applog( LOG_WARNING, "Huge pages not available",size);
return NULL;
#else
if ( size < HUGEPAGE_MIN_ALLOC )
{
// applog( LOG_WARNING, "Block too small for huge pages: %lu bytes",size);
return NULL;
}
const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE_2M - 1;
void *p = NULL;
int flags =
#ifdef MAP_NOCORE
MAP_NOCORE |
#endif
MAP_HUGETLB | MAP_ANON | MAP_PRIVATE;
// round size up to next page boundary
size = ( size + hugepage_mask ) & (~hugepage_mask);
p = mmap( NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0 );
if ( p == MAP_FAILED )
p = NULL;
return p;
#endif
}

24
malloc-huge.h Normal file
View File

@@ -0,0 +1,24 @@
#if !(defined(MALLOC_HUGE__))
#define MALLOC_HUGE__
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#ifdef __unix__
#include <sys/mman.h>
#endif
#if defined(MAP_HUGETLB)
// Minimum block size 6 MiB to use huge pages
#define HUGEPAGE_MIN_ALLOC (6 * 1024 * 1024)
#endif
// Attempt to allocate memory backed by 2 MiB pages, returns NULL on failure.
void *malloc_hugepages( size_t size );
#endif
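A minimal caller-side sketch, assuming the usual pattern of falling back to an ordinary allocation when malloc_hugepages() returns NULL (the buffer size is hypothetical; in cpuminer these large buffers normally live for the life of the process, so the sketch does not unmap the huge-page block):

    #include <stdio.h>
    #include <stdlib.h>
    #include "malloc-huge.h"

    int main(void)
    {
        size_t len = 64 * 1024 * 1024;        /* hypothetical 64 MiB work buffer */
        void *buf = malloc_hugepages( len );  /* NULL if huge pages unavailable */

        if ( buf )
            printf( "Huge pages in use\n" );
        else
        {
            buf = malloc( len );              /* fall back to normal pages */
            if ( !buf ) return 1;
            printf( "Huge pages not available, using malloc\n" );
        }
        /* ... fill and use buf ... */
        return 0;
    }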

22
miner.h
View File

@@ -466,6 +466,7 @@ void stratum_disconnect(struct stratum_ctx *sctx);
bool stratum_subscribe(struct stratum_ctx *sctx);
bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
bool stratum_suggest_difficulty( struct stratum_ctx *sctx, double diff );
extern bool aes_ni_supported;
@@ -811,7 +812,7 @@ Options:\n\
lyra2z330 Lyra2 330 rows\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
minotaur Ringcoin (RNG)\n\
minotaur\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\
@@ -823,6 +824,7 @@ Options:\n\
qubit Qubit\n\
scrypt scrypt(1024, 1, 1) (default)\n\
scrypt:N scrypt(N, 1, 1)\n\
scryptn2 scrypt(1048576, 1,1)\n\
sha256d Double SHA-256\n\
sha256q Quad SHA-256, Pyrite (PYE)\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
@@ -885,10 +887,10 @@ Options:\n\
-T, --timeout=N timeout for long poll and stratum (default: 300 seconds)\n\
-s, --scantime=N upper bound on time spent scanning current work when\n\
long polling is unavailable, in seconds (default: 5)\n\
--randomize Randomize scan range start to reduce duplicates\n\
-f, --diff-factor=N Divide req. difficulty by this factor (std is 1.0)\n\
--randomize randomize scan range (deprecated)\n\
-f, --diff-factor=N divide req. difficulty by this factor (std is 1.0)\n\
-m, --diff-multiplier=N Multiply difficulty by this factor (std is 1.0)\n\
--hash-meter Display thread hash rates\n\
--hash-meter display thread hash rates\n\
--coinbase-addr=ADDR payout address for solo mining\n\
--coinbase-sig=TEXT data to insert in the coinbase when possible\n\
--no-longpoll disable long polling support\n\
@@ -909,15 +911,16 @@ Options:\n\
-B, --background run the miner in the background\n\
--benchmark run in offline benchmark mode\n\
--cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\
--cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest)\n\
--cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest) (deprecated)\n\
-b, --api-bind=address[:port] IP address for the miner API, default port is 4048)\n\
--api-remote Allow remote control\n\
--max-temp=N Only mine if cpu temp is less than specified value (linux)\n\
--max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\
--max-diff=N Only mine if net difficulty is less than specified value\n\
--api-remote allow remote control\n\
--max-temp=N only mine if cpu temp is less than specified value (linux)\n\
--max-rate=N[KMG] only mine if net hashrate is less than specified value\n\
--max-diff=N only mine if net difficulty is less than specified value\n\
-c, --config=FILE load a JSON-format configuration file\n\
--data-file=FILE path and name of data file\n\
--verify enable additional time consuming start up tests\n\
--stratum-keepalive prevent disconnects when difficulty is too high\n\
-V, --version display version and CPU information and exit\n\
-h, --help display this help text and exit\n\
";
@@ -987,6 +990,7 @@ static struct option const options[] = {
{ "userpass", 1, NULL, 'O' },
{ "data-file", 1, NULL, 1027 },
{ "verify", 0, NULL, 1028 },
{ "stratum-keepalive", 0, NULL, 1029 },
{ "version", 0, NULL, 'V' },
{ 0, 0, 0, 0 }
};

File diff suppressed because it is too large

View File

@@ -272,9 +272,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#endif
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
#define mm_movmask_64( v ) \
_mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) )
#define mm_movmask_32( v ) \
_mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) )
// Diagonal blend: d = s3[3], s2[2], s1[1], s0[0] ||
// Diagonal blend
// Blend 4 32 bit elements from 4 vectors
@@ -284,7 +294,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \
_mm_blend_epi32( s1, s0, 0x1 ), 0x3 )
#elif defined(__SSE4_1)
#elif defined(__SSE4_1__)
#define mm128_diagonal_32( v3, v2, v1, v0 ) \
mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \
@@ -401,6 +411,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
#define mm128_shuffle2_64( a, b, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
_mm_castsi128_pd( b ), c ) );
#define mm128_shuffle2_32( a, b, c ) \
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
_mm_castsi128_ps( b ), c ) );
//
// Rotate vector elements accross all lanes
@@ -525,16 +546,14 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
// Two input shuffle-rotate.
// Concatenate v1 & v2 and rotate as one 256 bit vector.
// Continue to use vror/vrol for now to avoid confusion with
// shufl2r/shufl2l function macros available with AVX512.
// Concatenate v1 & v2 and bit rotate as one 256 bit vector.
#if defined(__SSSE3__)
// Function macro with two inputs and one output, inputs are preserved.
// Returns modified first arg.
// Two input functions are not available without SSSE3. Use procedure
// belowe instead.
// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, ie updated v1.
// These two-input functions are not available without SSSE3. Use procedure
// macros below instead.
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
@@ -548,13 +567,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
// Procedure macroswith 2 inputs and 2 outputs, inputs are destroyed.
// Returns both modified args in place.
// These macros retain the vrol/vror name for now to avoid
// confusion with the shufl2r/shuffle2l function macros above.
// These may be renamed to something like shufl2r2 for 2 1nputs and
// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// Deprecated for SSSE3 and above, they exist for SSSE3 only for compatibility
// with existing code. The function macros above can be used more efficiently.
#define mm128_vror256_64( v1, v2 ) \
do { \

View File

@@ -233,6 +233,18 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#endif
// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
#define mm256_movmask_64( v ) \
_mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
#define mm256_movmask_32( v ) \
_mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) )
// Diagonal blending
// Blend 4 64 bit elements from 4 vectors
@@ -405,6 +417,16 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Rotate elements within each 128 bit lane of 256 bit vector.
// Limited 2 input shuffle
#define mm256_shuffle2_64( a, b, c ) \
_mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( a ), \
_mm256_castsi256_pd( b ), c ) );
#define mm256_shuffle2_32( a, b, c ) \
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
_mm256_castsi256_ps( b ), c ) );
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
#define mm256_shufll128_64 mm256_swap128_64
@@ -420,8 +442,14 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
//
// Swap bytes in vector elements, endian bswap.
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
// needed for a shuffle that crosses 128 bit lanes. BSWAP doesn't, so the
// AVX2 version will work here. The bswap control vector is coded to work
// with both versions, bit 4 is ignored in AVX2.
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
@@ -485,20 +513,6 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
v2 = _mm256_xor_si256( v1, v2 ); \
v1 = _mm256_xor_si256( v1, v2 );
#define mm256_vror512_128( v1, v2 ) \
do { \
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
v1 = _mm256_permute2x128( v2, v1, 0x21 ); \
v2 = t; \
} while(0)
#define mm256_vrol512_128( v1, v2 ) \
do { \
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
v2 = _mm256_permute2x128( v2, v1, 0x21 ); \
v1 = t; \
} while(0)
#endif // __AVX2__
#endif // SIMD_256_H__

View File

@@ -15,13 +15,14 @@
// AVX512 intrinsics have a few changes from previous conventions.
//
// cmp instruction now returns a bitmask isnstead of a vector mask.
// cmp instruction now returns a bitmask instead of a vector mask.
// This eliminates the need for the blendv instruction.
//
// The new rotate instructions require the count to be an 8 bit
// immediate value only. Compilation fails if a variable is used.
// The documentation is the same as for shift and it works with
// variables.
// variables. The inconsistency is likely due to compiler optimizations
// that can eliminate the variable in some instances.
//
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
// usually shuffles accross all lanes.
@@ -317,6 +318,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
// elements and can be called directly. But they only accept immediate 8
// for control arg.
// The workaround is a fraud, just a fluke of the compiler's optimizer.
// It fails without -O3. The compiler seems to unroll shift loops, eliminating
// the variable control, better than rotate loops.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
@@ -429,21 +433,9 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
} while(0)
//
// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
//
// rename plan change ror to vror for Vector ROtate Right,
// and vrol for Vector ROtate Left, not to be confused with
//variable rotate rorv, rolv,
// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate
// operation. 1xNN notaion ia also removed and replaced with simpler NN.
// Swap will still have its own mnemonic and will be aliased as both
// left and right shuffles.
// Shift elements right or left in 512 bit vector, filling with zeros.
// Multiple element shifts can be combined into a single larger
// element shift.
// Cross-lane shuffles implementing rotate & shift of elements within a vector.
//
#define mm512_shiftr_256( v ) \
_mm512_alignr_epi64( _mm512_setzero, v, 4 )
@@ -493,7 +485,7 @@ static inline __m512i mm512_shufll_32( const __m512i v )
static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
{ return _mm512_alignr_epi64( v, v, n ); }
static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
{ return _mm512_alignr_epi32( v, v, n ); }
#define mm512_shuflr_16( v ) \
@@ -529,7 +521,7 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
// 128 bit lane shift is handled by bslli bsrli.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_shuflr256_128 mm512_swap256_128
#define mm512_shufll256_128 mm512_swap256_128
@@ -581,8 +573,19 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
0x0e0d0c0b0a090807, 0x060504030201001f ) )
//
// Shuffle-roate elements within 128 bit lanes of 512 bit vector.
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source.
#define mm512_shuffle2_64( a, b, c ) \
_mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
_mm512_castsi512_pd( b ), c ) );
#define mm512_shuffle2_32( a, b, c ) \
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( a ), \
_mm512_castsi512_ps( b ), c ) );
// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
@@ -610,11 +613,8 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
// shufl2r is 2 input ...
// Drop macros? They can easilly be rebuilt using shufl2 functions
// Shuffle concatenated { v1, v2 ) right or left by 256 bits and return
// rotated v1
// visually confusing for shif2r because of arg order. First arg is always
// the target for modification, either update by reference or by function
// return.
// 2 input, 1 output
// Rotate concatenated { v1, v2 ) right or left and return v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
@@ -627,76 +627,5 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
#define mm512_shufl2r_32( v1, v2 ) _mm512_alignr_epi32( v2, v1, 1 )
#define mm512_shufl2l_32( v1, v2 ) _mm512_alignr_epi32( v1, v2, 1 )
// Rotate elements from 2 512 bit vectors in place, source arguments
// are overwritten.
#define mm512_swap1024_512( v1, v2 ) \
v1 = _mm512_xor_si512( v1, v2 ); \
v2 = _mm512_xor_si512( v1, v2 ); \
v1 = _mm512_xor_si512( v1, v2 );
#define mm512_shufl2l_512 mm512_swap1024_512 \
#define mm512_shufl2r_512 mm512_swap1024_512 \
// Deprecated, will be removed. Use shufl2 functions instead. Leave them as is
// for now.
// Rotate elements from 2 512 bit vectors in place, both source arguments
// are updated.
#define mm512_vror1024_256( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm512_vrol1024_256( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
v1 = t; \
} while(0)
#define mm512_vror1024_128( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm512_vrol1024_128( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
v1 = t; \
} while(0)
#define mm512_vror1024_64( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_vrol1024_64( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
v1 = t; \
} while(0)
#define mm512_vror1024_32( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_vrol1024_32( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
v2 = _mm512_alignr_epi32( v2, v1, 15 ); \
v1 = t; \
} while(0)
#endif // AVX512
#endif // SIMD_512_H__
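The new mm128/mm256/mm512_shuffle2_* macros added across these three headers share the same convention: within each 128 bit lane the low element(s) come from the first operand and the high element(s) from the second, selected by an immediate control. An illustrative sketch using the 128 bit variant (include path and an SSE2 build assumed; not taken from the source):

    #include <stdio.h>
    #include <stdint.h>
    #include <emmintrin.h>
    #include "simd-utils.h"      /* assumed to pull in simd-128.h */

    int main(void)
    {
        __m128i a = _mm_set_epi64x( 0x1111111111111111ULL, 0x0000000000000000ULL );
        __m128i b = _mm_set_epi64x( 0x3333333333333333ULL, 0x2222222222222222ULL );

        /* Control 0x2: bit 0 selects a[0] for the low half,
           bit 1 selects b[1] for the high half. */
        __m128i r = mm128_shuffle2_64( a, b, 0x2 );

        uint64_t out[2];
        _mm_storeu_si128( (__m128i*)out, r );
        printf( "%016llx %016llx\n",   /* expect 3333333333333333 0000000000000000 */
                (unsigned long long)out[1], (unsigned long long)out[0] );
        return 0;
    }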

View File

@@ -209,7 +209,7 @@ static inline void cpu_getname(char *outbuf, size_t maxsz)
{
memset(outbuf, 0, maxsz);
#ifdef WIN32
char brand[0xC0] = { 0 };
char brand[256] = { 0 };
int output[4] = { 0 }, ext;
cpuid(0x80000000, output);
ext = output[0];
@@ -502,6 +502,28 @@ static inline bool has_vaes()
#endif
}
static inline bool has_vbmi()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512VBMI_Flag;
#endif
}
static inline bool has_vbmi2()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512VBMI2_Flag;
#endif
}
// AMD only
static inline bool has_xop()
{

38
util.c
View File

@@ -1542,11 +1542,20 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url)
free(sctx->url);
sctx->url = strdup(url);
}
free(sctx->curl_url);
free(sctx->curl_url);
sctx->curl_url = (char*) malloc(strlen(url));
sprintf( sctx->curl_url, "http%s", strstr( url, "s://" )
? strstr( url, "s://" )
: strstr (url, "://" ) );
// replace the stratum protocol prefix with http, https for ssl
sprintf( sctx->curl_url, "%s%s",
( strstr( url, "s://" ) || strstr( url, "ssl://" ) )
? "https" : "http", strstr( url, "://" ) );
// sprintf( sctx->curl_url, "http%s", strstr( url, "s://" )
// ? strstr( url, "s://" )
// : strstr (url, "://" ) );
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
@@ -1658,7 +1667,7 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i
pthread_mutex_unlock(&sctx->work_lock);
if ( !opt_quiet ) /* pool dynamic change */
applog( LOG_INFO, "Stratum extranonce1= %s, extranonce2 size= %d",
applog( LOG_INFO, "Stratum extranonce1 0x%s, extranonce2 size %d",
xnonce1, xn2_size);
return true;
@@ -1846,6 +1855,25 @@ out:
return ret;
}
bool stratum_suggest_difficulty( struct stratum_ctx *sctx, double diff )
{
char *s;
s = (char*) malloc( 80 );
bool rc = true;
// response is handled separately, what ID?
sprintf( s, "{\"id\": 1, \"method\": \"mining.suggest_difficulty\", \"params\": [\"%f\"]}", diff );
if ( !stratum_send_line( sctx, s ) )
{
applog(LOG_WARNING,"stratum.suggest_difficulty send failed");
rc = false;
}
free ( s );
return rc;
}
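For reference, the line this helper puts on the wire for a hypothetical requested difficulty of 8192 looks like the following; as the comment notes, the id is fixed at 1 and the pool's response is not matched back to it:

    {"id": 1, "method": "mining.suggest_difficulty", "params": ["8192.000000"]}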
/**
* Extract bloc height L H... here len=3, height=0x1333e8
* "...0000000000ffffffff2703e83313062f503253482f043d61105408"

View File

@@ -16,8 +16,8 @@ export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
# support for Windows CPU groups, AES sometimes not included in -march
export DEFAULT_CFLAGS="-O3 -maes -Wall -D_WIN32_WINNT=0x0601"
# Support for Windows 7 CPU groups, AES sometimes not included in -march
export DEFAULT_CFLAGS="-maes -O3 -Wall -D_WIN32_WINNT=0x0601"
export DEFAULT_CFLAGS_OLD="-O3 -Wall"
# make link to local gmp header file.
@@ -26,8 +26,8 @@ ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
# make release directory and copy selected DLLs.
rm -rf release > /dev/null
mkdir release
cp README.txt release/
cp README.md release/
cp RELEASE_NOTES release/
@@ -45,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
./clean-all.sh || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="$DEFAULT_CFLAGS -march=icelake-client" ./configure $CONFIGURE_ARGS
CFLAGS="-march=icelake-client $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
@@ -53,7 +53,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
# AVX512 AES: Intel Core HEDT Slylake-X, Cascadelake
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS
CFLAGS="-march=skylake-avx512 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx512.exe
@@ -61,7 +61,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
# AVX2 SHA VAES: Intel Alderlake, AMD Zen3
make clean || echo done
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -mavx2 -msha -mvaes" ./configure $CONFIGURE_ARGS
CFLAGS="-mavx2 -msha -mvaes $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
@@ -69,7 +69,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha-vaes.exe
# AVX2 AES SHA: AMD Zen1
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS
CFLAGS="-march=znver1 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2-sha.exe
@@ -77,7 +77,7 @@ mv cpuminer.exe release/cpuminer-avx2-sha.exe
# AVX2 AES: Intel Core Haswell, Skylake, Kabylake, Coffeelake, Cometlake
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS -march=core-avx2" ./configure $CONFIGURE_ARGS
CFLAGS="-march=core-avx2 $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2.exe
@@ -85,7 +85,7 @@ mv cpuminer.exe release/cpuminer-avx2.exe
# AVX AES: Intel Sandybridge, Ivybridge
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS_OLD -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS
CFLAGS="-march=corei7-avx -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx.exe
@@ -93,7 +93,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
# SSE4.2 AES: Intel Westmere
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS_OLD -march=westmere -maes" ./configure $CONFIGURE_ARGS
CFLAGS="-march=westmere -maes $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-sse42.exe
@@ -118,9 +118,16 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
# Generic SSE2
make clean || echo clean
rm -f config.status
CFLAGS="$DEFAULT_CFLAGS_OLD -msse2" ./configure $CONFIGURE_ARGS
CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse2.exe
make clean || echo clean
# Native with CPU groups enabled
make clean || echo clean
rm -f config.status
CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe